001: /*
002: * Copyright 2004-2008 Andy Clark
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.cyberneko.html.filters;
018:
019: import org.apache.xerces.util.XMLChar;
020: import org.apache.xerces.util.XMLStringBuffer;
021: import org.apache.xerces.xni.Augmentations;
022: import org.apache.xerces.xni.NamespaceContext;
023: import org.apache.xerces.xni.QName;
024: import org.apache.xerces.xni.XMLAttributes;
025: import org.apache.xerces.xni.XMLLocator;
026: import org.apache.xerces.xni.XMLString;
027: import org.apache.xerces.xni.XNIException;
028: import org.apache.xerces.xni.parser.XMLComponentManager;
029: import org.apache.xerces.xni.parser.XMLConfigurationException;
030: import org.cyberneko.html.HTMLAugmentations;
031: import org.cyberneko.html.HTMLEventInfo;
032:
033: /**
034: * This filter purifies the HTML input to ensure XML well-formedness.
035: * The purification process includes:
036: * <ul>
037: * <li>fixing illegal characters in the document, including
038: * <ul>
039: * <li>element and attribute names,
040: * <li>processing instruction target and data,
041: * <li>document text;
042: * </ul>
043: * <li>ensuring the string "--" does not appear in the content of
044: * a comment;
045: * <li>ensuring the string "]]>" does not appear in the content of
046: * a CDATA section;
047: * <li>ensuring that the XML declaration has required pseudo-attributes
048: * and that the values are correct;
049: * and
050: * <li>synthesized missing namespace bindings.
051: * </ul>
052: * <p>
053: * Illegal characters in XML names are converted to the character
054: * sequence "_u####_" where "####" is the value of the Unicode
055: * character represented in hexadecimal. Whereas illegal characters
056: * appearing in document content is converted to the character
057: * sequence "\\u####".
058: * <p>
059: * In comments, the character '-' is replaced by the character
060: * sequence "- " to prevent "--" from ever appearing in the comment
061: * content. For CDATA sections, the character ']' is replaced by
062: * the character sequence "] " to prevent "]]" from appearing.
063: * <p>
064: * The URI used for synthesized namespace bindings is
065: * "http://cyberneko.org/html/ns/synthesized/<i>number</i>" where
066: * <i>number</i> is generated to ensure uniqueness.
067: *
068: * @author Andy Clark
069: *
070: * @version $Id: Purifier.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
071: */
072: public class Purifier extends DefaultFilter {
073:
074: //
075: // Constants
076: //
077:
078: /** Synthesized namespace binding prefix. */
079: public static final String SYNTHESIZED_NAMESPACE_PREFX = "http://cyberneko.org/html/ns/synthesized/";
080:
081: /** Namespaces. */
082: protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
083:
084: /** Include infoset augmentations. */
085: protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
086:
087: /** Recognized features. */
088: private static final String[] RECOGNIZED_FEATURES = { NAMESPACES,
089: AUGMENTATIONS, };
090:
091: // static vars
092:
093: /** Synthesized event info item. */
094: protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();
095:
096: //
097: // Data
098: //
099:
100: // features
101:
102: /** Namespaces. */
103: protected boolean fNamespaces;
104:
105: /** Augmentations. */
106: protected boolean fAugmentations;
107:
108: // state
109:
110: /** True if the doctype declaration was seen. */
111: protected boolean fSeenDoctype;
112:
113: /** True if root element was seen. */
114: protected boolean fSeenRootElement;
115:
116: /** True if inside a CDATA section. */
117: protected boolean fInCDATASection;
118:
119: // doctype declaration info
120:
121: /** Public identifier of doctype declaration. */
122: protected String fPublicId;
123:
124: /** System identifier of doctype declaration. */
125: protected String fSystemId;
126:
127: // namespace info
128:
129: /** Namespace information. */
130: protected NamespaceContext fNamespaceContext;
131:
132: /** Synthesized namespace binding count. */
133: protected int fSynthesizedNamespaceCount;
134:
135: // temp vars
136:
137: /** Qualified name. */
138: private QName fQName = new QName();
139:
140: /** Augmentations. */
141: private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
142:
143: /** String buffer. */
144: private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
145:
146: //
147: // XMLComponent methods
148: //
149:
150: public void reset(XMLComponentManager manager)
151: throws XMLConfigurationException {
152:
153: // state
154: fInCDATASection = false;
155:
156: // features
157: fNamespaces = manager.getFeature(NAMESPACES);
158: fAugmentations = manager.getFeature(AUGMENTATIONS);
159:
160: } // reset(XMLComponentManager)
161:
162: //
163: // XMLDocumentHandler methods
164: //
165:
166: /** Start document. */
167: public void startDocument(XMLLocator locator, String encoding,
168: Augmentations augs) throws XNIException {
169: fNamespaceContext = fNamespaces ? new NamespaceBinder.NamespaceSupport()
170: : null;
171: fSynthesizedNamespaceCount = 0;
172: handleStartDocument();
173: super .startDocument(locator, encoding, augs);
174: } // startDocument(XMLLocator,String,Augmentations)
175:
176: /** Start document. */
177: public void startDocument(XMLLocator locator, String encoding,
178: NamespaceContext nscontext, Augmentations augs)
179: throws XNIException {
180: fNamespaceContext = nscontext;
181: fSynthesizedNamespaceCount = 0;
182: handleStartDocument();
183: super .startDocument(locator, encoding, nscontext, augs);
184: } // startDocument(XMLLocator,NamespaceContext,String,Augmentations)
185:
186: /** XML declaration. */
187: public void xmlDecl(String version, String encoding,
188: String standalone, Augmentations augs) throws XNIException {
189: if (version == null || !version.equals("1.0")) {
190: version = "1.0";
191: }
192: if (encoding != null && encoding.length() == 0) {
193: encoding = null;
194: }
195: if (standalone != null) {
196: if (!standalone.equalsIgnoreCase("true")
197: && !standalone.equalsIgnoreCase("false")) {
198: standalone = null;
199: } else {
200: standalone = standalone.toLowerCase();
201: }
202: }
203: super .xmlDecl(version, encoding, standalone, augs);
204: } // xmlDecl(String,String,String,Augmentations)
205:
206: /** Comment. */
207: public void comment(XMLString text, Augmentations augs)
208: throws XNIException {
209: StringBuffer str = new StringBuffer(purifyText(text).toString());
210: int length = str.length();
211: for (int i = length - 1; i >= 0; i--) {
212: char c = str.charAt(i);
213: if (c == '-') {
214: str.insert(i + 1, ' ');
215: }
216: }
217: fStringBuffer.length = 0;
218: fStringBuffer.append(str.toString());
219: text = fStringBuffer;
220: super .comment(text, augs);
221: } // comment(XMLString,Augmentations)
222:
223: /** Processing instruction. */
224: public void processingInstruction(String target, XMLString data,
225: Augmentations augs) throws XNIException {
226: target = purifyName(target, true);
227: data = purifyText(data);
228: super .processingInstruction(target, data, augs);
229: } // processingInstruction(String,XMLString,Augmentations)
230:
231: /** Doctype declaration. */
232: public void doctypeDecl(String root, String pubid, String sysid,
233: Augmentations augs) throws XNIException {
234: fSeenDoctype = true;
235: // NOTE: It doesn't matter what the root element name is because
236: // it must match the root element. -Ac
237: fPublicId = pubid;
238: fSystemId = sysid;
239: // NOTE: If the public identifier is specified, then a system
240: // identifier must also be specified. -Ac
241: if (fPublicId != null && fSystemId == null) {
242: fSystemId = "";
243: }
244: // NOTE: Can't save the augmentations because the object state
245: // is transient. -Ac
246: } // doctypeDecl(String,String,String,Augmentations)
247:
248: /** Start element. */
249: public void startElement(QName element, XMLAttributes attrs,
250: Augmentations augs) throws XNIException {
251: handleStartElement(element, attrs);
252: super .startElement(element, attrs, augs);
253: } // startElement(QName,XMLAttributes,Augmentations)
254:
255: /** Empty element. */
256: public void emptyElement(QName element, XMLAttributes attrs,
257: Augmentations augs) throws XNIException {
258: handleStartElement(element, attrs);
259: super .emptyElement(element, attrs, augs);
260: } // emptyElement(QName,XMLAttributes,Augmentations)
261:
262: /** Start CDATA section. */
263: public void startCDATA(Augmentations augs) throws XNIException {
264: fInCDATASection = true;
265: super .startCDATA(augs);
266: } // startCDATA(Augmentations)
267:
268: /** End CDATA section. */
269: public void endCDATA(Augmentations augs) throws XNIException {
270: fInCDATASection = false;
271: super .endCDATA(augs);
272: } // endCDATA(Augmentations)
273:
274: /** Characters. */
275: public void characters(XMLString text, Augmentations augs)
276: throws XNIException {
277: text = purifyText(text);
278: if (fInCDATASection) {
279: StringBuffer str = new StringBuffer(text.toString());
280: int length = str.length();
281: for (int i = length - 1; i >= 0; i--) {
282: char c = str.charAt(i);
283: if (c == ']') {
284: str.insert(i + 1, ' ');
285: }
286: }
287: fStringBuffer.length = 0;
288: fStringBuffer.append(str.toString());
289: text = fStringBuffer;
290: }
291: super .characters(text, augs);
292: } // characters(XMLString,Augmentations)
293:
294: /** End element. */
295: public void endElement(QName element, Augmentations augs)
296: throws XNIException {
297: element = purifyQName(element);
298: if (fNamespaces) {
299: if (element.prefix != null && element.uri == null) {
300: element.uri = fNamespaceContext.getURI(element.prefix);
301: }
302: }
303: super .endElement(element, augs);
304: } // endElement(QName,Augmentations)
305:
306: //
307: // Protected methods
308: //
309:
310: /** Handle start document. */
311: protected void handleStartDocument() {
312: fSeenDoctype = false;
313: fSeenRootElement = false;
314: } // handleStartDocument()
315:
316: /** Handle start element. */
317: protected void handleStartElement(QName element, XMLAttributes attrs) {
318:
319: // handle element and attributes
320: element = purifyQName(element);
321: int attrCount = attrs != null ? attrs.getLength() : 0;
322: for (int i = attrCount - 1; i >= 0; i--) {
323: // purify attribute name
324: attrs.getName(i, fQName);
325: attrs.setName(i, purifyQName(fQName));
326:
327: // synthesize namespace bindings
328: if (fNamespaces) {
329: if (!fQName.rawname.equals("xmlns")
330: && !fQName.rawname.startsWith("xmlns:")) {
331: // NOTE: Must get attribute name again because the
332: // purifyQName method does not guarantee that
333: // the same QName object is returned. -Ac
334: attrs.getName(i, fQName);
335: if (fQName.prefix != null && fQName.uri == null) {
336: synthesizeBinding(attrs, fQName.prefix);
337: }
338: }
339: }
340: }
341:
342: // synthesize namespace bindings
343: if (fNamespaces) {
344: if (element.prefix != null && element.uri == null) {
345: synthesizeBinding(attrs, element.prefix);
346: }
347: }
348:
349: // synthesize doctype declaration
350: if (!fSeenRootElement && fSeenDoctype) {
351: Augmentations augs = synthesizedAugs();
352: super .doctypeDecl(element.rawname, fPublicId, fSystemId,
353: augs);
354: }
355:
356: // mark start element as seen
357: fSeenRootElement = true;
358:
359: } // handleStartElement(QName,XMLAttributes)
360:
361: /** Synthesize namespace binding. */
362: protected void synthesizeBinding(XMLAttributes attrs, String ns) {
363: String prefix = "xmlns";
364: String localpart = ns;
365: String qname = prefix + ':' + localpart;
366: String uri = NamespaceBinder.NAMESPACES_URI;
367: String atype = "CDATA";
368: String avalue = SYNTHESIZED_NAMESPACE_PREFX
369: + fSynthesizedNamespaceCount++;
370:
371: // add attribute
372: fQName.setValues(prefix, localpart, qname, uri);
373: attrs.addAttribute(fQName, atype, avalue);
374:
375: // bind namespace
376: fNamespaceContext.declarePrefix(ns, avalue);
377:
378: } // synthesizeBinding(XMLAttributes,String)
379:
380: /** Returns an augmentations object with a synthesized item added. */
381: protected final Augmentations synthesizedAugs() {
382: HTMLAugmentations augs = null;
383: if (fAugmentations) {
384: augs = fInfosetAugs;
385: augs.removeAllItems();
386: augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
387: }
388: return augs;
389: } // synthesizedAugs():Augmentations
390:
391: //
392: // Protected methods
393: //
394:
395: /** Purify qualified name. */
396: protected QName purifyQName(QName qname) {
397: qname.prefix = purifyName(qname.prefix, true);
398: qname.localpart = purifyName(qname.localpart, true);
399: qname.rawname = purifyName(qname.rawname, false);
400: return qname;
401: } // purifyQName(QName):QName
402:
403: /** Purify name. */
404: protected String purifyName(String name, boolean localpart) {
405: if (name == null) {
406: return name;
407: }
408: StringBuffer str = new StringBuffer();
409: int length = name.length();
410: boolean seenColon = localpart;
411: for (int i = 0; i < length; i++) {
412: char c = name.charAt(i);
413: if (i == 0) {
414: if (!XMLChar.isNameStart(c)) {
415: str.append("_u" + toHexString(c, 4) + "_");
416: } else {
417: str.append(c);
418: }
419: } else {
420: if ((fNamespaces && c == ':' && seenColon)
421: || !XMLChar.isName(c)) {
422: str.append("_u" + toHexString(c, 4) + "_");
423: } else {
424: str.append(c);
425: }
426: seenColon = seenColon || c == ':';
427: }
428: }
429: return str.toString();
430: } // purifyName(String):String
431:
432: /** Purify content. */
433: protected XMLString purifyText(XMLString text) {
434: fStringBuffer.length = 0;
435: for (int i = 0; i < text.length; i++) {
436: char c = text.ch[text.offset + i];
437: if (XMLChar.isInvalid(c)) {
438: fStringBuffer.append("\\u" + toHexString(c, 4));
439: } else {
440: fStringBuffer.append(c);
441: }
442: }
443: return fStringBuffer;
444: } // purifyText(XMLString):XMLString
445:
446: //
447: // Protected static methods
448: //
449:
450: /** Returns a padded hexadecimal string for the given value. */
451: protected static String toHexString(int c, int padlen) {
452: StringBuffer str = new StringBuffer(padlen);
453: str.append(Integer.toHexString(c));
454: int len = padlen - str.length();
455: for (int i = 0; i < len; i++) {
456: str.insert(0, '0');
457: }
458: return str.toString().toUpperCase();
459: } // toHexString(int,int):String
460:
461: } // class Purifier
|