001: /*
002: * Copyright 2002-2008 Andy Clark
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: * ==============================================================
016: * This file contains some code from Apache Xerces-J which is
017: * used in accordance with the Apache license.
018: */
019:
020: package org.cyberneko.html.parsers;
021:
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.apache.xerces.impl.Constants;
import org.apache.xerces.util.ErrorHandlerWrapper;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.EntityReference;
import org.w3c.dom.Node;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
059:
060: /**
061: * A DOM parser for HTML fragments.
062: *
063: * @author Andy Clark
064: *
065: * @version $Id: DOMFragmentParser.java,v 1.8 2005/02/14 03:56:54 andyc Exp $
066: */
067: public class DOMFragmentParser implements XMLDocumentHandler {
068:
069: //
070: // Constants
071: //
072:
073: // features
074:
075: /** Document fragment balancing only. */
076: protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/document-fragment";
077:
078: /** Recognized features. */
079: protected static final String[] RECOGNIZED_FEATURES = { DOCUMENT_FRAGMENT, };
080:
081: // properties
082:
083: /** Property identifier: error handler. */
084: protected static final String ERROR_HANDLER = Constants.XERCES_PROPERTY_PREFIX
085: + Constants.ERROR_HANDLER_PROPERTY;
086:
087: /** Current element node. */
088: protected static final String CURRENT_ELEMENT_NODE = Constants.XERCES_PROPERTY_PREFIX
089: + Constants.CURRENT_ELEMENT_NODE_PROPERTY;
090:
091: /** Recognized properties. */
092: protected static final String[] RECOGNIZED_PROPERTIES = {
093: ERROR_HANDLER, CURRENT_ELEMENT_NODE, };
094:
095: //
096: // Data
097: //
098:
099: /** Parser configuration. */
100: protected XMLParserConfiguration fParserConfiguration;
101:
102: /** Document source. */
103: protected XMLDocumentSource fDocumentSource;
104:
105: /** DOM document fragment. */
106: protected DocumentFragment fDocumentFragment;
107:
108: /** Document. */
109: protected Document fDocument;
110:
111: /** Current node. */
112: protected Node fCurrentNode;
113:
114: /** True if within a CDATA section. */
115: protected boolean fInCDATASection;
116:
117: //
118: // Constructors
119: //
120:
121: /** Default constructor. */
122: public DOMFragmentParser() {
123: fParserConfiguration = new HTMLConfiguration();
124: fParserConfiguration.addRecognizedFeatures(RECOGNIZED_FEATURES);
125: fParserConfiguration
126: .addRecognizedProperties(RECOGNIZED_PROPERTIES);
127: fParserConfiguration.setFeature(DOCUMENT_FRAGMENT, true);
128: fParserConfiguration.setDocumentHandler(this );
129: } // <init>()
130:
131: //
132: // Public methods
133: //
134:
135: /** Parses a document fragment. */
136: public void parse(String systemId, DocumentFragment fragment)
137: throws SAXException, IOException {
138: parse(new InputSource(systemId), fragment);
139: } // parse(String,DocumentFragment)
140:
141: /** Parses a document fragment. */
142: public void parse(InputSource source, DocumentFragment fragment)
143: throws SAXException, IOException {
144:
145: fCurrentNode = fDocumentFragment = fragment;
146: fDocument = fDocumentFragment.getOwnerDocument();
147:
148: try {
149: String pubid = source.getPublicId();
150: String sysid = source.getSystemId();
151: String encoding = source.getEncoding();
152: InputStream stream = source.getByteStream();
153: Reader reader = source.getCharacterStream();
154:
155: XMLInputSource inputSource = new XMLInputSource(pubid,
156: sysid, sysid);
157: inputSource.setEncoding(encoding);
158: inputSource.setByteStream(stream);
159: inputSource.setCharacterStream(reader);
160:
161: fParserConfiguration.parse(inputSource);
162: } catch (XMLParseException e) {
163: Exception ex = e.getException();
164: if (ex != null) {
165: throw new SAXParseException(e.getMessage(), null, ex);
166: }
167: throw new SAXParseException(e.getMessage(), null);
168: }
169:
170: } // parse(InputSource,DocumentFragment)
171:
172: /**
173: * Allow an application to register an error event handler.
174: *
175: * <p>If the application does not register an error handler, all
176: * error events reported by the SAX parser will be silently
177: * ignored; however, normal processing may not continue. It is
178: * highly recommended that all SAX applications implement an
179: * error handler to avoid unexpected bugs.</p>
180: *
181: * <p>Applications may register a new or different handler in the
182: * middle of a parse, and the SAX parser must begin using the new
183: * handler immediately.</p>
184: *
185: * @param errorHandler The error handler.
186: * @exception java.lang.NullPointerException If the handler
187: * argument is null.
188: * @see #getErrorHandler
189: */
190: public void setErrorHandler(ErrorHandler errorHandler) {
191: fParserConfiguration.setErrorHandler(new ErrorHandlerWrapper(
192: errorHandler));
193: } // setErrorHandler(ErrorHandler)
194:
195: /**
196: * Return the current error handler.
197: *
198: * @return The current error handler, or null if none
199: * has been registered.
200: * @see #setErrorHandler
201: */
202: public ErrorHandler getErrorHandler() {
203:
204: ErrorHandler errorHandler = null;
205: try {
206: XMLErrorHandler xmlErrorHandler = (XMLErrorHandler) fParserConfiguration
207: .getProperty(ERROR_HANDLER);
208: if (xmlErrorHandler != null
209: && xmlErrorHandler instanceof ErrorHandlerWrapper) {
210: errorHandler = ((ErrorHandlerWrapper) xmlErrorHandler)
211: .getErrorHandler();
212: }
213: } catch (XMLConfigurationException e) {
214: // do nothing
215: }
216: return errorHandler;
217:
218: } // getErrorHandler():ErrorHandler
219:
220: /**
221: * Set the state of any feature in a SAX2 parser. The parser
222: * might not recognize the feature, and if it does recognize
223: * it, it might not be able to fulfill the request.
224: *
225: * @param featureId The unique identifier (URI) of the feature.
226: * @param state The requested state of the feature (true or false).
227: *
228: * @exception SAXNotRecognizedException If the
229: * requested feature is not known.
230: * @exception SAXNotSupportedException If the
231: * requested feature is known, but the requested
232: * state is not supported.
233: */
234: public void setFeature(String featureId, boolean state)
235: throws SAXNotRecognizedException, SAXNotSupportedException {
236:
237: try {
238: fParserConfiguration.setFeature(featureId, state);
239: } catch (XMLConfigurationException e) {
240: String message = e.getMessage();
241: if (e.getType() == XMLConfigurationException.NOT_RECOGNIZED) {
242: throw new SAXNotRecognizedException(message);
243: } else {
244: throw new SAXNotSupportedException(message);
245: }
246: }
247:
248: } // setFeature(String,boolean)
249:
250: /**
251: * Query the state of a feature.
252: *
253: * Query the current state of any feature in a SAX2 parser. The
254: * parser might not recognize the feature.
255: *
256: * @param featureId The unique identifier (URI) of the feature
257: * being set.
258: * @return The current state of the feature.
259: * @exception org.xml.sax.SAXNotRecognizedException If the
260: * requested feature is not known.
261: * @exception SAXNotSupportedException If the
262: * requested feature is known but not supported.
263: */
264: public boolean getFeature(String featureId)
265: throws SAXNotRecognizedException, SAXNotSupportedException {
266:
267: try {
268: return fParserConfiguration.getFeature(featureId);
269: } catch (XMLConfigurationException e) {
270: String message = e.getMessage();
271: if (e.getType() == XMLConfigurationException.NOT_RECOGNIZED) {
272: throw new SAXNotRecognizedException(message);
273: } else {
274: throw new SAXNotSupportedException(message);
275: }
276: }
277:
278: } // getFeature(String):boolean
279:
280: /**
281: * Set the value of any property in a SAX2 parser. The parser
282: * might not recognize the property, and if it does recognize
283: * it, it might not support the requested value.
284: *
285: * @param propertyId The unique identifier (URI) of the property
286: * being set.
287: * @param value The value to which the property is being set.
288: *
289: * @exception SAXNotRecognizedException If the
290: * requested property is not known.
291: * @exception SAXNotSupportedException If the
292: * requested property is known, but the requested
293: * value is not supported.
294: */
295: public void setProperty(String propertyId, Object value)
296: throws SAXNotRecognizedException, SAXNotSupportedException {
297:
298: try {
299: fParserConfiguration.setProperty(propertyId, value);
300: } catch (XMLConfigurationException e) {
301: String message = e.getMessage();
302: if (e.getType() == XMLConfigurationException.NOT_RECOGNIZED) {
303: throw new SAXNotRecognizedException(message);
304: } else {
305: throw new SAXNotSupportedException(message);
306: }
307: }
308:
309: } // setProperty(String,Object)
310:
311: /**
312: * Query the value of a property.
313: *
314: * Return the current value of a property in a SAX2 parser.
315: * The parser might not recognize the property.
316: *
317: * @param propertyId The unique identifier (URI) of the property
318: * being set.
319: * @return The current value of the property.
320: * @exception org.xml.sax.SAXNotRecognizedException If the
321: * requested property is not known.
322: * @exception SAXNotSupportedException If the
323: * requested property is known but not supported.
324: */
325: public Object getProperty(String propertyId)
326: throws SAXNotRecognizedException, SAXNotSupportedException {
327:
328: if (propertyId.equals(CURRENT_ELEMENT_NODE)) {
329: return (fCurrentNode != null && fCurrentNode.getNodeType() == Node.ELEMENT_NODE) ? fCurrentNode
330: : null;
331: }
332:
333: try {
334: return fParserConfiguration.getProperty(propertyId);
335: } catch (XMLConfigurationException e) {
336: String message = e.getMessage();
337: if (e.getType() == XMLConfigurationException.NOT_RECOGNIZED) {
338: throw new SAXNotRecognizedException(message);
339: } else {
340: throw new SAXNotSupportedException(message);
341: }
342: }
343:
344: } // getProperty(String):Object
345:
346: //
347: // XMLDocumentHandler methods
348: //
349:
350: /** Sets the document source. */
351: public void setDocumentSource(XMLDocumentSource source) {
352: fDocumentSource = source;
353: } // setDocumentSource(XMLDocumentSource)
354:
355: /** Returns the document source. */
356: public XMLDocumentSource getDocumentSource() {
357: return fDocumentSource;
358: } // getDocumentSource():XMLDocumentSource
359:
360: /** Start document. */
361: public void startDocument(XMLLocator locator, String encoding,
362: Augmentations augs) throws XNIException {
363: startDocument(locator, encoding, null, augs);
364: } // startDocument(XMLLocator,String,Augmentations)
365:
366: // since Xerces 2.2.0
367:
368: /** Start document. */
369: public void startDocument(XMLLocator locator, String encoding,
370: NamespaceContext nscontext, Augmentations augs)
371: throws XNIException {
372: fInCDATASection = false;
373: } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
374:
375: /** XML declaration. */
376: public void xmlDecl(String version, String encoding,
377: String standalone, Augmentations augs) throws XNIException {
378: } // xmlDecl(String,String,String,Augmentations)
379:
380: /** Document type declaration. */
381: public void doctypeDecl(String root, String pubid, String sysid,
382: Augmentations augs) throws XNIException {
383: } // doctypeDecl(String,String,String,Augmentations)
384:
385: /** Processing instruction. */
386: public void processingInstruction(String target, XMLString data,
387: Augmentations augs) throws XNIException {
388: ProcessingInstruction pi = fDocument
389: .createProcessingInstruction(target, data.toString());
390: fCurrentNode.appendChild(pi);
391: } // processingInstruction(String,XMLString,Augmentations)
392:
393: /** Comment. */
394: public void comment(XMLString text, Augmentations augs)
395: throws XNIException {
396: Comment comment = fDocument.createComment(text.toString());
397: fCurrentNode.appendChild(comment);
398: } // comment(XMLString,Augmentations)
399:
400: /** Start prefix mapping. @deprecated Since Xerces 2.2.0. */
401: public void startPrefixMapping(String prefix, String uri,
402: Augmentations augs) throws XNIException {
403: } // startPrefixMapping(String,String,Augmentations)
404:
405: /** End prefix mapping. @deprecated Since Xerces 2.2.0. */
406: public void endPrefixMapping(String prefix, Augmentations augs)
407: throws XNIException {
408: } // endPrefixMapping(String,Augmentations)
409:
410: /** Start element. */
411: public void startElement(QName element, XMLAttributes attrs,
412: Augmentations augs) throws XNIException {
413: Element elementNode = fDocument.createElement(element.rawname);
414: int count = attrs != null ? attrs.getLength() : 0;
415: for (int i = 0; i < count; i++) {
416: String aname = attrs.getQName(i);
417: String avalue = attrs.getValue(i);
418: elementNode.setAttribute(aname, avalue);
419: }
420: fCurrentNode.appendChild(elementNode);
421: fCurrentNode = elementNode;
422: } // startElement(QName,XMLAttributes,Augmentations)
423:
424: /** Empty element. */
425: public void emptyElement(QName element, XMLAttributes attrs,
426: Augmentations augs) throws XNIException {
427: startElement(element, attrs, augs);
428: endElement(element, augs);
429: } // emptyElement(QName,XMLAttributes,Augmentations)
430:
431: /** Characters. */
432: public void characters(XMLString text, Augmentations augs)
433: throws XNIException {
434:
435: if (fInCDATASection) {
436: Node node = fCurrentNode.getLastChild();
437: if (node != null
438: && node.getNodeType() == Node.CDATA_SECTION_NODE) {
439: CDATASection cdata = (CDATASection) node;
440: cdata.appendData(text.toString());
441: } else {
442: CDATASection cdata = fDocument.createCDATASection(text
443: .toString());
444: fCurrentNode.appendChild(cdata);
445: }
446: } else {
447: Node node = fCurrentNode.getLastChild();
448: if (node != null && node.getNodeType() == Node.TEXT_NODE) {
449: Text textNode = (Text) node;
450: textNode.appendData(text.toString());
451: } else {
452: Text textNode = fDocument.createTextNode(text
453: .toString());
454: fCurrentNode.appendChild(textNode);
455: }
456: }
457:
458: } // characters(XMLString,Augmentations)
459:
460: /** Ignorable whitespace. */
461: public void ignorableWhitespace(XMLString text, Augmentations augs)
462: throws XNIException {
463: characters(text, augs);
464: } // ignorableWhitespace(XMLString,Augmentations)
465:
466: /** Start general entity. */
467: public void startGeneralEntity(String name,
468: XMLResourceIdentifier id, String encoding,
469: Augmentations augs) throws XNIException {
470: EntityReference entityRef = fDocument
471: .createEntityReference(name);
472: fCurrentNode.appendChild(entityRef);
473: fCurrentNode = entityRef;
474: } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
475:
476: /** Text declaration. */
477: public void textDecl(String version, String encoding,
478: Augmentations augs) throws XNIException {
479: } // textDecl(String,String,Augmentations)
480:
481: /** End general entity. */
482: public void endGeneralEntity(String name, Augmentations augs)
483: throws XNIException {
484: fCurrentNode = fCurrentNode.getParentNode();
485: } // endGeneralEntity(String,Augmentations)
486:
487: /** Start CDATA section. */
488: public void startCDATA(Augmentations augs) throws XNIException {
489: fInCDATASection = true;
490: } // startCDATA(Augmentations)
491:
492: /** End CDATA section. */
493: public void endCDATA(Augmentations augs) throws XNIException {
494: fInCDATASection = false;
495: } // endCDATA(Augmentations)
496:
497: /** End element. */
498: public void endElement(QName element, Augmentations augs)
499: throws XNIException {
500: fCurrentNode = fCurrentNode.getParentNode();
501: } // endElement(QName,Augmentations)
502:
503: /** End document. */
504: public void endDocument(Augmentations augs) throws XNIException {
505: } // endDocument(Augmentations)
506:
507: //
508: // DEBUG
509: //
510:
511: /***
512: public static void print(Node node) {
513: short type = node.getNodeType();
514: switch (type) {
515: case Node.ELEMENT_NODE: {
516: System.out.print('<');
517: System.out.print(node.getNodeName());
518: org.w3c.dom.NamedNodeMap attrs = node.getAttributes();
519: int attrCount = attrs != null ? attrs.getLength() : 0;
520: for (int i = 0; i < attrCount; i++) {
521: Node attr = attrs.item(i);
522: System.out.print(' ');
523: System.out.print(attr.getNodeName());
524: System.out.print("='");
525: System.out.print(attr.getNodeValue());
526: System.out.print('\'');
527: }
528: System.out.print('>');
529: break;
530: }
531: case Node.TEXT_NODE: {
532: System.out.print(node.getNodeValue());
533: break;
534: }
535: }
536: Node child = node.getFirstChild();
537: while (child != null) {
538: print(child);
539: child = child.getNextSibling();
540: }
541: if (type == Node.ELEMENT_NODE) {
542: System.out.print("</");
543: System.out.print(node.getNodeName());
544: System.out.print('>');
545: }
546: else if (type == Node.DOCUMENT_NODE || type == Node.DOCUMENT_FRAGMENT_NODE) {
547: System.out.println();
548: }
549: System.out.flush();
550: }
551:
552: public static void main(String[] argv) throws Exception {
553: DOMFragmentParser parser = new DOMFragmentParser();
554: HTMLDocument document = new org.apache.html.dom.HTMLDocumentImpl();
555: for (int i = 0; i < argv.length; i++) {
556: String sysid = argv[i];
557: System.err.println("# "+sysid);
558: DocumentFragment fragment = document.createDocumentFragment();
559: parser.parse(sysid, fragment);
560: print(fragment);
561: }
562: }
563: /***/
564:
565: } // class DOMFragmentParser
|