001: /*
002: * Copyright 2002-2008 Andy Clark
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.cyberneko.html;
018:
019: import java.io.IOException;
020: import java.lang.reflect.InvocationTargetException;
021: import java.lang.reflect.Method;
022: import java.text.MessageFormat;
023: import java.util.Locale;
024: import java.util.MissingResourceException;
025: import java.util.ResourceBundle;
026: import java.util.Vector;
027:
028: import org.apache.xerces.util.DefaultErrorHandler;
029: import org.apache.xerces.util.ParserConfigurationSettings;
030: import org.apache.xerces.xni.XMLDTDContentModelHandler;
031: import org.apache.xerces.xni.XMLDTDHandler;
032: import org.apache.xerces.xni.XMLDocumentHandler;
033: import org.apache.xerces.xni.XNIException;
034: import org.apache.xerces.xni.parser.XMLConfigurationException;
035: import org.apache.xerces.xni.parser.XMLDocumentFilter;
036: import org.apache.xerces.xni.parser.XMLDocumentSource;
037: import org.apache.xerces.xni.parser.XMLEntityResolver;
038: import org.apache.xerces.xni.parser.XMLErrorHandler;
039: import org.apache.xerces.xni.parser.XMLInputSource;
040: import org.apache.xerces.xni.parser.XMLParseException;
041: import org.apache.xerces.xni.parser.XMLPullParserConfiguration;
042: import org.cyberneko.html.filters.NamespaceBinder;
043:
044: /**
045: * An XNI-based parser configuration that can be used to parse HTML
046: * documents. This configuration can be used directly in order to
047: * parse HTML documents or can be used in conjunction with any XNI
048: * based tools, such as the Xerces2 implementation.
049: * <p>
050: * This configuration recognizes the following features:
051: * <ul>
052: * <li>http://cyberneko.org/html/features/augmentations
053: * <li>http://cyberneko.org/html/features/report-errors
054: * <li>http://cyberneko.org/html/features/report-errors/simple
055: * <li>http://cyberneko.org/html/features/balance-tags
056: * <li><i>and</i>
057: * <li>the features supported by the scanner and tag balancer components.
058: * </ul>
059: * <p>
060: * This configuration recognizes the following properties:
061: * <ul>
062: * <li>http://cyberneko.org/html/properties/names/elems
063: * <li>http://cyberneko.org/html/properties/names/attrs
064: * <li>http://cyberneko.org/html/properties/filters
065: * <li>http://cyberneko.org/html/properties/error-reporter
066: * <li><i>and</i>
067: * <li>the properties supported by the scanner and tag balancer.
068: * </ul>
069: * <p>
070: * For complete usage information, refer to the documentation.
071: *
072: * @see HTMLScanner
073: * @see HTMLTagBalancer
074: * @see HTMLErrorReporter
075: *
076: * @author Andy Clark
077: *
078: * @version $Id: HTMLConfiguration.java,v 1.9 2005/02/14 03:56:54 andyc Exp $
079: */
080: public class HTMLConfiguration extends ParserConfigurationSettings
081: implements XMLPullParserConfiguration {
082:
083: //
084: // Constants
085: //
086:
087: // features
088:
089: /** Namespaces. */
090: protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
091:
092: /** Include infoset augmentations. */
093: protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
094:
095: /** Report errors. */
096: protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
097:
098: /** Simple report format. */
099: protected static final String SIMPLE_ERROR_FORMAT = "http://cyberneko.org/html/features/report-errors/simple";
100:
101: /** Balance tags. */
102: protected static final String BALANCE_TAGS = "http://cyberneko.org/html/features/balance-tags";
103:
104: // properties
105:
106: /** Modify HTML element names: { "upper", "lower", "default" }. */
107: protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
108:
109: /** Modify HTML attribute names: { "upper", "lower", "default" }. */
110: protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
111:
112: /** Pipeline filters. */
113: protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
114:
115: /** Error reporter. */
116: protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
117:
118: // other
119:
120: /** Error domain. */
121: protected static final String ERROR_DOMAIN = "http://cyberneko.org/html";
122:
123: // private
124:
125: /** Document source class array. */
126: private static final Class[] DOCSOURCE = { XMLDocumentSource.class };
127:
128: //
129: // Data
130: //
131:
132: // handlers
133:
134: /** Document handler. */
135: protected XMLDocumentHandler fDocumentHandler;
136:
137: /** DTD handler. */
138: protected XMLDTDHandler fDTDHandler;
139:
140: /** DTD content model handler. */
141: protected XMLDTDContentModelHandler fDTDContentModelHandler;
142:
143: /** Error handler. */
144: protected XMLErrorHandler fErrorHandler = new DefaultErrorHandler();
145:
146: // other settings
147:
148: /** Entity resolver. */
149: protected XMLEntityResolver fEntityResolver;
150:
151: /** Locale. */
152: protected Locale fLocale = Locale.getDefault();
153:
154: // state
155:
156: /**
157: * Stream opened by parser. Therefore, must close stream manually upon
158: * termination of parsing.
159: */
160: protected boolean fCloseStream;
161:
162: // components
163:
164: /** Components. */
165: protected final Vector fHTMLComponents = new Vector(2);
166:
167: // pipeline
168:
169: /** Document scanner. */
170: protected final HTMLScanner fDocumentScanner = new HTMLScanner();
171:
172: /** HTML tag balancer. */
173: protected final HTMLTagBalancer fTagBalancer = new HTMLTagBalancer();
174:
175: /** Namespace binder. */
176: protected final NamespaceBinder fNamespaceBinder = new NamespaceBinder();
177:
178: // other components
179:
180: /** Error reporter. */
181: protected final HTMLErrorReporter fErrorReporter = new ErrorReporter();
182:
183: // HACK: workarounds Xerces 2.0.x problems
184:
185: /** Parser version is Xerces 2.0.0. */
186: protected static boolean XERCES_2_0_0 = false;
187:
188: /** Parser version is Xerces 2.0.1. */
189: protected static boolean XERCES_2_0_1 = false;
190:
191: /** Parser version is XML4J 4.0.x. */
192: protected static boolean XML4J_4_0_x = false;
193:
194: //
195: // Static initializer
196: //
197:
198: static {
199: try {
200: String VERSION = "org.apache.xerces.impl.Version";
201: Object version = ObjectFactory.createObject(VERSION,
202: VERSION);
203: java.lang.reflect.Field field = version.getClass()
204: .getField("fVersion");
205: String versionStr = String.valueOf(field.get(version));
206: XERCES_2_0_0 = versionStr.equals("Xerces-J 2.0.0");
207: XERCES_2_0_1 = versionStr.equals("Xerces-J 2.0.1");
208: XML4J_4_0_x = versionStr.startsWith("XML4J 4.0.");
209: } catch (Throwable e) {
210: // ignore
211: }
212: } // <clinit>()
213:
214: //
215: // Constructors
216: //
217:
218: /** Default constructor. */
219: public HTMLConfiguration() {
220:
221: // add components
222: addComponent(fDocumentScanner);
223: addComponent(fTagBalancer);
224: addComponent(fNamespaceBinder);
225:
226: //
227: // features
228: //
229:
230: // recognized features
231: String VALIDATION = "http://xml.org/sax/features/validation";
232: String[] recognizedFeatures = { AUGMENTATIONS, NAMESPACES,
233: VALIDATION, REPORT_ERRORS, SIMPLE_ERROR_FORMAT,
234: BALANCE_TAGS, };
235: addRecognizedFeatures(recognizedFeatures);
236: setFeature(AUGMENTATIONS, false);
237: setFeature(NAMESPACES, true);
238: setFeature(VALIDATION, false);
239: setFeature(REPORT_ERRORS, false);
240: setFeature(SIMPLE_ERROR_FORMAT, false);
241: setFeature(BALANCE_TAGS, true);
242:
243: // HACK: Xerces 2.0.0
244: if (XERCES_2_0_0) {
245: // NOTE: These features should not be required but it causes a
246: // problem if they're not there. This will be fixed in
247: // subsequent releases of Xerces.
248: recognizedFeatures = new String[] { "http://apache.org/xml/features/scanner/notify-builtin-refs", };
249: addRecognizedFeatures(recognizedFeatures);
250: }
251:
252: // HACK: Xerces 2.0.1
253: if (XERCES_2_0_0 || XERCES_2_0_1 || XML4J_4_0_x) {
254: // NOTE: These features should not be required but it causes a
255: // problem if they're not there. This should be fixed in
256: // subsequent releases of Xerces.
257: recognizedFeatures = new String[] {
258: "http://apache.org/xml/features/validation/schema/normalized-value",
259: "http://apache.org/xml/features/scanner/notify-char-refs", };
260: addRecognizedFeatures(recognizedFeatures);
261: }
262:
263: //
264: // properties
265: //
266:
267: // recognized properties
268: String[] recognizedProperties = { NAMES_ELEMS, NAMES_ATTRS,
269: FILTERS, ERROR_REPORTER, };
270: addRecognizedProperties(recognizedProperties);
271: setProperty(NAMES_ELEMS, "upper");
272: setProperty(NAMES_ATTRS, "lower");
273: setProperty(ERROR_REPORTER, fErrorReporter);
274:
275: // HACK: Xerces 2.0.0
276: if (XERCES_2_0_0) {
277: // NOTE: This is a hack to get around a problem in the Xerces 2.0.0
278: // AbstractSAXParser. If it uses a parser configuration that
279: // does not have a SymbolTable, then it will remove *all*
280: // attributes. This will be fixed in subsequent releases of
281: // Xerces.
282: String SYMBOL_TABLE = "http://apache.org/xml/properties/internal/symbol-table";
283: recognizedProperties = new String[] { SYMBOL_TABLE, };
284: addRecognizedProperties(recognizedProperties);
285: Object symbolTable = ObjectFactory.createObject(
286: "org.apache.xerces.util.SymbolTable",
287: "org.apache.xerces.util.SymbolTable");
288: setProperty(SYMBOL_TABLE, symbolTable);
289: }
290:
291: } // <init>()
292:
293: //
294: // Public methods
295: //
296:
297: /**
298: * Pushes an input source onto the current entity stack. This
299: * enables the scanner to transparently scan new content (e.g.
300: * the output written by an embedded script). At the end of the
301: * current entity, the scanner returns where it left off at the
302: * time this entity source was pushed.
303: * <p>
304: * <strong>Hint:</strong>
305: * To use this feature to insert the output of <SCRIPT>
306: * tags, remember to buffer the <em>entire</em> output of the
307: * processed instructions before pushing a new input source.
308: * Otherwise, events may appear out of sequence.
309: *
310: * @param inputSource The new input source to start scanning.
311: * @see #evaluateInputSource(XMLInputSource)
312: */
313: public void pushInputSource(XMLInputSource inputSource) {
314: fDocumentScanner.pushInputSource(inputSource);
315: } // pushInputSource(XMLInputSource)
316:
317: /**
318: * <font color="red">EXPERIMENTAL: may change in next release</font><br/>
319: * Immediately evaluates an input source and add the new content (e.g.
320: * the output written by an embedded script).
321: *
322: * @param inputSource The new input source to start scanning.
323: * @see #pushInputSource(XMLInputSource)
324: */
325: public void evaluateInputSource(XMLInputSource inputSource) {
326: fDocumentScanner.evaluateInputSource(inputSource);
327: } // evaluateInputSource(XMLInputSource)
328:
329: // XMLParserConfiguration methods
330: //
331:
332: /** Sets a feature. */
333: public void setFeature(String featureId, boolean state)
334: throws XMLConfigurationException {
335: super .setFeature(featureId, state);
336: int size = fHTMLComponents.size();
337: for (int i = 0; i < size; i++) {
338: HTMLComponent component = (HTMLComponent) fHTMLComponents
339: .elementAt(i);
340: component.setFeature(featureId, state);
341: }
342: } // setFeature(String,boolean)
343:
344: /** Sets a property. */
345: public void setProperty(String propertyId, Object value)
346: throws XMLConfigurationException {
347: super .setProperty(propertyId, value);
348:
349: if (propertyId.equals(FILTERS)) {
350: XMLDocumentFilter[] filters = (XMLDocumentFilter[]) getProperty(FILTERS);
351: if (filters != null) {
352: for (int i = 0; i < filters.length; i++) {
353: XMLDocumentFilter filter = filters[i];
354: if (filter instanceof HTMLComponent) {
355: addComponent((HTMLComponent) filter);
356: }
357: }
358: }
359: }
360:
361: int size = fHTMLComponents.size();
362: for (int i = 0; i < size; i++) {
363: HTMLComponent component = (HTMLComponent) fHTMLComponents
364: .elementAt(i);
365: component.setProperty(propertyId, value);
366: }
367: } // setProperty(String,Object)
368:
369: /** Sets the document handler. */
370: public void setDocumentHandler(XMLDocumentHandler handler) {
371: fDocumentHandler = handler;
372: if (handler instanceof HTMLTagBalancingListener) {
373: fTagBalancer
374: .setTagBalancingListener((HTMLTagBalancingListener) handler);
375: }
376: } // setDocumentHandler(XMLDocumentHandler)
377:
378: /** Returns the document handler. */
379: public XMLDocumentHandler getDocumentHandler() {
380: return fDocumentHandler;
381: } // getDocumentHandler():XMLDocumentHandler
382:
383: /** Sets the DTD handler. */
384: public void setDTDHandler(XMLDTDHandler handler) {
385: fDTDHandler = handler;
386: } // setDTDHandler(XMLDTDHandler)
387:
388: /** Returns the DTD handler. */
389: public XMLDTDHandler getDTDHandler() {
390: return fDTDHandler;
391: } // getDTDHandler():XMLDTDHandler
392:
393: /** Sets the DTD content model handler. */
394: public void setDTDContentModelHandler(
395: XMLDTDContentModelHandler handler) {
396: fDTDContentModelHandler = handler;
397: } // setDTDContentModelHandler(XMLDTDContentModelHandler)
398:
399: /** Returns the DTD content model handler. */
400: public XMLDTDContentModelHandler getDTDContentModelHandler() {
401: return fDTDContentModelHandler;
402: } // getDTDContentModelHandler():XMLDTDContentModelHandler
403:
404: /** Sets the error handler. */
405: public void setErrorHandler(XMLErrorHandler handler) {
406: fErrorHandler = handler;
407: } // setErrorHandler(XMLErrorHandler)
408:
409: /** Returns the error handler. */
410: public XMLErrorHandler getErrorHandler() {
411: return fErrorHandler;
412: } // getErrorHandler():XMLErrorHandler
413:
414: /** Sets the entity resolver. */
415: public void setEntityResolver(XMLEntityResolver resolver) {
416: fEntityResolver = resolver;
417: } // setEntityResolver(XMLEntityResolver)
418:
419: /** Returns the entity resolver. */
420: public XMLEntityResolver getEntityResolver() {
421: return fEntityResolver;
422: } // getEntityResolver():XMLEntityResolver
423:
424: /** Sets the locale. */
425: public void setLocale(Locale locale) {
426: if (locale == null) {
427: locale = Locale.getDefault();
428: }
429: fLocale = locale;
430: } // setLocale(Locale)
431:
432: /** Returns the locale. */
433: public Locale getLocale() {
434: return fLocale;
435: } // getLocale():Locale
436:
437: /** Parses a document. */
438: public void parse(XMLInputSource source) throws XNIException,
439: IOException {
440: setInputSource(source);
441: parse(true);
442: } // parse(XMLInputSource)
443:
444: //
445: // XMLPullParserConfiguration methods
446: //
447:
448: // parsing
449:
450: /**
451: * Sets the input source for the document to parse.
452: *
453: * @param inputSource The document's input source.
454: *
455: * @exception XMLConfigurationException Thrown if there is a
456: * configuration error when initializing the
457: * parser.
458: * @exception IOException Thrown on I/O error.
459: *
460: * @see #parse(boolean)
461: */
462: public void setInputSource(XMLInputSource inputSource)
463: throws XMLConfigurationException, IOException {
464: reset();
465: fCloseStream = inputSource.getByteStream() == null
466: && inputSource.getCharacterStream() == null;
467: fDocumentScanner.setInputSource(inputSource);
468: } // setInputSource(XMLInputSource)
469:
470: /**
471: * Parses the document in a pull parsing fashion.
472: *
473: * @param complete True if the pull parser should parse the
474: * remaining document completely.
475: *
476: * @return True if there is more document to parse.
477: *
478: * @exception XNIException Any XNI exception, possibly wrapping
479: * another exception.
480: * @exception IOException An IO exception from the parser, possibly
481: * from a byte stream or character stream
482: * supplied by the parser.
483: *
484: * @see #setInputSource
485: */
486: public boolean parse(boolean complete) throws XNIException,
487: IOException {
488: try {
489: boolean more = fDocumentScanner.scanDocument(complete);
490: if (!more) {
491: cleanup();
492: }
493: return more;
494: } catch (XNIException e) {
495: cleanup();
496: throw e;
497: } catch (IOException e) {
498: cleanup();
499: throw e;
500: }
501: } // parse(boolean):boolean
502:
503: /**
504: * If the application decides to terminate parsing before the xml document
505: * is fully parsed, the application should call this method to free any
506: * resource allocated during parsing. For example, close all opened streams.
507: */
508: public void cleanup() {
509: fDocumentScanner.cleanup(fCloseStream);
510: } // cleanup()
511:
512: //
513: // Protected methods
514: //
515:
516: /** Adds a component. */
517: protected void addComponent(HTMLComponent component) {
518:
519: // add component to list
520: fHTMLComponents.addElement(component);
521:
522: // add recognized features and set default states
523: String[] features = component.getRecognizedFeatures();
524: addRecognizedFeatures(features);
525: int featureCount = features != null ? features.length : 0;
526: for (int i = 0; i < featureCount; i++) {
527: Boolean state = component.getFeatureDefault(features[i]);
528: if (state != null) {
529: setFeature(features[i], state.booleanValue());
530: }
531: }
532:
533: // add recognized properties and set default values
534: String[] properties = component.getRecognizedProperties();
535: addRecognizedProperties(properties);
536: int propertyCount = properties != null ? properties.length : 0;
537: for (int i = 0; i < propertyCount; i++) {
538: Object value = component.getPropertyDefault(properties[i]);
539: if (value != null) {
540: setProperty(properties[i], value);
541: }
542: }
543:
544: } // addComponent(HTMLComponent)
545:
546: /** Resets the parser configuration. */
547: protected void reset() throws XMLConfigurationException {
548:
549: // reset components
550: int size = fHTMLComponents.size();
551: for (int i = 0; i < size; i++) {
552: HTMLComponent component = (HTMLComponent) fHTMLComponents
553: .elementAt(i);
554: component.reset(this );
555: }
556:
557: // configure pipeline
558: XMLDocumentSource lastSource = fDocumentScanner;
559: if (getFeature(BALANCE_TAGS)) {
560: lastSource.setDocumentHandler(fTagBalancer);
561: fTagBalancer.setDocumentSource(fDocumentScanner);
562: lastSource = fTagBalancer;
563: }
564: if (getFeature(NAMESPACES)) {
565: lastSource.setDocumentHandler(fNamespaceBinder);
566: fNamespaceBinder.setDocumentSource(fTagBalancer);
567: lastSource = fNamespaceBinder;
568: }
569: XMLDocumentFilter[] filters = (XMLDocumentFilter[]) getProperty(FILTERS);
570: if (filters != null) {
571: for (int i = 0; i < filters.length; i++) {
572: XMLDocumentFilter filter = filters[i];
573: Class filterClass = filter.getClass();
574: try {
575: Method filterMethod = filterClass.getMethod(
576: "setDocumentSource", DOCSOURCE);
577: if (filterMethod != null) {
578: filterMethod.invoke(filter,
579: new Object[] { lastSource });
580: }
581: } catch (IllegalAccessException e) {
582: // ignore
583: } catch (InvocationTargetException e) {
584: // ignore
585: } catch (NoSuchMethodException e) {
586: // ignore
587: }
588: lastSource.setDocumentHandler(filter);
589: lastSource = filter;
590: }
591: }
592: lastSource.setDocumentHandler(fDocumentHandler);
593:
594: } // reset()
595:
596: //
597: // Interfaces
598: //
599:
600: /**
601: * Defines an error reporter for reporting HTML errors. There is no such
602: * thing as a fatal error in parsing HTML. I/O errors are fatal but should
603: * throw an <code>IOException</code> directly instead of reporting an error.
604: * <p>
605: * When used in a configuration, the error reporter instance should be
606: * set as a property with the following property identifier:
607: * <pre>
608: * "http://cyberneko.org/html/internal/error-reporter" in the
609: * </pre>
610: * Components in the configuration can query the error reporter using this
611: * property identifier.
612: * <p>
613: * <strong>Note:</strong>
614: * All reported errors are within the domain "http://cyberneko.org/html".
615: *
616: * @author Andy Clark
617: */
618: protected class ErrorReporter implements HTMLErrorReporter {
619:
620: //
621: // Data
622: //
623:
624: /** Last locale. */
625: protected Locale fLastLocale;
626:
627: /** Error messages. */
628: protected ResourceBundle fErrorMessages;
629:
630: //
631: // HTMLErrorReporter methods
632: //
633:
634: /** Format message without reporting error. */
635: public String formatMessage(String key, Object[] args) {
636: if (!getFeature(SIMPLE_ERROR_FORMAT)) {
637: if (!fLocale.equals(fLastLocale)) {
638: fErrorMessages = null;
639: fLastLocale = fLocale;
640: }
641: if (fErrorMessages == null) {
642: fErrorMessages = ResourceBundle.getBundle(
643: "org/cyberneko/html/res/ErrorMessages",
644: fLocale);
645: }
646: try {
647: String value = fErrorMessages.getString(key);
648: String message = MessageFormat.format(value, args);
649: return message;
650: } catch (MissingResourceException e) {
651: // ignore and return a simple format
652: }
653: }
654: return formatSimpleMessage(key, args);
655: } // formatMessage(String,Object[]):String
656:
657: /** Reports a warning. */
658: public void reportWarning(String key, Object[] args)
659: throws XMLParseException {
660: if (fErrorHandler != null) {
661: fErrorHandler.warning(ERROR_DOMAIN, key,
662: createException(key, args));
663: }
664: } // reportWarning(String,Object[])
665:
666: /** Reports an error. */
667: public void reportError(String key, Object[] args)
668: throws XMLParseException {
669: if (fErrorHandler != null) {
670: fErrorHandler.error(ERROR_DOMAIN, key, createException(
671: key, args));
672: }
673: } // reportError(String,Object[])
674:
675: //
676: // Protected methods
677: //
678:
679: /** Creates parse exception. */
680: protected XMLParseException createException(String key,
681: Object[] args) {
682: String message = formatMessage(key, args);
683: return new XMLParseException(fDocumentScanner, message);
684: } // createException(String,Object[]):XMLParseException
685:
686: /** Format simple message. */
687: protected String formatSimpleMessage(String key, Object[] args) {
688: StringBuffer str = new StringBuffer();
689: str.append(ERROR_DOMAIN);
690: str.append('#');
691: str.append(key);
692: if (args != null && args.length > 0) {
693: str.append('\t');
694: for (int i = 0; i < args.length; i++) {
695: if (i > 0) {
696: str.append('\t');
697: }
698: str.append(String.valueOf(args[i]));
699: }
700: }
701: return str.toString();
702: } // formatSimpleMessage(String,
703:
704: } // class ErrorReporter
705:
706: } // class HTMLConfiguration
|