001: /*
002: * Copyright 2005 John G. Wilson
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017:
018: package groovy.util;
019:
020: import groovy.util.slurpersupport.GPathResult;
021: import groovy.util.slurpersupport.Node;
022: import groovy.util.slurpersupport.NodeChild;
023: import groovy.xml.FactorySupport;
024:
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.IOException;
028: import java.io.InputStream;
029: import java.io.Reader;
030: import java.io.StringReader;
031: import java.net.URL;
032: import java.util.HashMap;
033: import java.util.Hashtable;
034: import java.util.Map;
035: import java.util.Stack;
036:
037: import javax.xml.parsers.ParserConfigurationException;
038: import javax.xml.parsers.SAXParser;
039: import javax.xml.parsers.SAXParserFactory;
040:
041: import org.xml.sax.Attributes;
042: import org.xml.sax.DTDHandler;
043: import org.xml.sax.EntityResolver;
044: import org.xml.sax.ErrorHandler;
045: import org.xml.sax.InputSource;
046: import org.xml.sax.SAXException;
047: import org.xml.sax.SAXNotRecognizedException;
048: import org.xml.sax.SAXNotSupportedException;
049: import org.xml.sax.XMLReader;
050: import org.xml.sax.helpers.DefaultHandler;
051:
052: /**
053: * @author John Wilson
054: *
055: */
056:
057: public class XmlSlurper extends DefaultHandler {
058: private final XMLReader reader;
059: private Node currentNode = null;
060: private final Stack stack = new Stack();
061: private final StringBuffer charBuffer = new StringBuffer();
062: private final Map namespaceTagHints = new Hashtable();
063: private boolean keepWhitespace = false;
064:
065: public XmlSlurper() throws ParserConfigurationException,
066: SAXException {
067: this (false, true);
068: }
069:
070: public XmlSlurper(final boolean validating,
071: final boolean namespaceAware)
072: throws ParserConfigurationException, SAXException {
073: SAXParserFactory factory = FactorySupport
074: .createSaxParserFactory();
075: factory.setNamespaceAware(namespaceAware);
076: factory.setValidating(validating);
077: this .reader = factory.newSAXParser().getXMLReader();
078: }
079:
080: public XmlSlurper(final XMLReader reader) {
081: this .reader = reader;
082: }
083:
084: public XmlSlurper(final SAXParser parser) throws SAXException {
085: this (parser.getXMLReader());
086: }
087:
088: /**
089: * @param keepWhitespace
090: *
091: * If true then whitespace before elements is kept.
092: * The deafult is to discard the whitespace.
093: */
094: public void setKeepWhitespace(boolean keepWhitespace) {
095: this .keepWhitespace = keepWhitespace;
096: }
097:
098: /**
099: * @return The GPathResult instance created by consuming a stream of SAX events
100: * Note if one of the parse methods has been called then this returns null
101: * Note if this is called more than once all calls after the first will return null
102: *
103: */
104: public GPathResult getDocument() {
105: try {
106: return new NodeChild(this .currentNode, null,
107: this .namespaceTagHints);
108: } finally {
109: this .currentNode = null;
110: }
111: }
112:
113: /**
114: * Parse the content of the specified input source into a GPathResult object
115: *
116: * @param input
117: * @return An object which supports GPath expressions
118: * @throws IOException
119: * @throws SAXException
120: */
121: public GPathResult parse(final InputSource input)
122: throws IOException, SAXException {
123: this .reader.setContentHandler(this );
124: this .reader.parse(input);
125:
126: return getDocument();
127:
128: }
129:
130: /**
131: * Parses the content of the given file as XML turning it into a GPathResult object
132: *
133: * @param file
134: * @return An object which supports GPath expressions
135: * @throws IOException
136: * @throws SAXException
137: */
138: public GPathResult parse(final File file) throws IOException,
139: SAXException {
140: final InputSource input = new InputSource(new FileInputStream(
141: file));
142:
143: input.setSystemId("file://" + file.getAbsolutePath());
144:
145: return parse(input);
146:
147: }
148:
149: /**
150: * Parse the content of the specified input stream into an GPathResult Object.
151: * Note that using this method will not provide the parser with any URI
152: * for which to find DTDs etc
153: *
154: * @param input
155: * @return An object which supports GPath expressions
156: * @throws IOException
157: * @throws SAXException
158: */
159: public GPathResult parse(final InputStream input)
160: throws IOException, SAXException {
161: return parse(new InputSource(input));
162: }
163:
164: /**
165: * Parse the content of the specified reader into a GPathResult Object.
166: * Note that using this method will not provide the parser with any URI
167: * for which to find DTDs etc
168: *
169: * @param in
170: * @return An object which supports GPath expressions
171: * @throws IOException
172: * @throws SAXException
173: */
174: public GPathResult parse(final Reader in) throws IOException,
175: SAXException {
176: return parse(new InputSource(in));
177: }
178:
179: /**
180: * Parse the content of the specified URI into a GPathResult Object
181: *
182: * @param uri
183: * @return An object which supports GPath expressions
184: * @throws IOException
185: * @throws SAXException
186: */
187: public GPathResult parse(final String uri) throws IOException,
188: SAXException {
189: return parse(new InputSource(uri));
190: }
191:
192: /**
193: * A helper method to parse the given text as XML
194: *
195: * @param text
196: * @return An object which supports GPath expressions
197: */
198: public GPathResult parseText(final String text) throws IOException,
199: SAXException {
200: return parse(new StringReader(text));
201: }
202:
203: // Delegated XMLReader methods
204: //------------------------------------------------------------------------
205:
206: /* (non-Javadoc)
207: * @see org.xml.sax.XMLReader#getDTDHandler()
208: */
209: public DTDHandler getDTDHandler() {
210: return this .reader.getDTDHandler();
211: }
212:
213: /* (non-Javadoc)
214: * @see org.xml.sax.XMLReader#getEntityResolver()
215: */
216: public EntityResolver getEntityResolver() {
217: return this .reader.getEntityResolver();
218: }
219:
220: /* (non-Javadoc)
221: * @see org.xml.sax.XMLReader#getErrorHandler()
222: */
223: public ErrorHandler getErrorHandler() {
224: return this .reader.getErrorHandler();
225: }
226:
227: /* (non-Javadoc)
228: * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
229: */
230: public boolean getFeature(final String uri)
231: throws SAXNotRecognizedException, SAXNotSupportedException {
232: return this .reader.getFeature(uri);
233: }
234:
235: /* (non-Javadoc)
236: * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
237: */
238: public Object getProperty(final String uri)
239: throws SAXNotRecognizedException, SAXNotSupportedException {
240: return this .reader.getProperty(uri);
241: }
242:
243: /* (non-Javadoc)
244: * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
245: */
246: public void setDTDHandler(final DTDHandler dtdHandler) {
247: this .reader.setDTDHandler(dtdHandler);
248: }
249:
250: /* (non-Javadoc)
251: * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
252: */
253: public void setEntityResolver(final EntityResolver entityResolver) {
254: this .reader.setEntityResolver(entityResolver);
255: }
256:
257: /**
258: * Resolves entities against using the suppied URL as the base for relative URLs
259: *
260: * @param base
261: * The URL used to resolve relative URLs
262: */
263: public void setEntityBaseUrl(final URL base) {
264: this .reader.setEntityResolver(new EntityResolver() {
265: public InputSource resolveEntity(final String publicId,
266: final String systemId) throws IOException {
267: return new InputSource(new URL(base, systemId)
268: .openStream());
269: }
270: });
271: }
272:
273: /* (non-Javadoc)
274: * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
275: */
276: public void setErrorHandler(final ErrorHandler errorHandler) {
277: this .reader.setErrorHandler(errorHandler);
278: }
279:
280: /* (non-Javadoc)
281: * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
282: */
283: public void setFeature(final String uri, final boolean value)
284: throws SAXNotRecognizedException, SAXNotSupportedException {
285: this .reader.setFeature(uri, value);
286: }
287:
288: /* (non-Javadoc)
289: * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
290: */
291: public void setProperty(final String uri, final Object value)
292: throws SAXNotRecognizedException, SAXNotSupportedException {
293: this .reader.setProperty(uri, value);
294: }
295:
296: // ContentHandler interface
297: //-------------------------------------------------------------------------
298:
299: /* (non-Javadoc)
300: * @see org.xml.sax.ContentHandler#startDocument()
301: */
302: public void startDocument() throws SAXException {
303: this .currentNode = null;
304: this .charBuffer.setLength(0);
305: }
306:
307: /* (non-Javadoc)
308: * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
309: */
310: public void startPrefixMapping(final String tag, final String uri)
311: throws SAXException {
312: this .namespaceTagHints.put(tag, uri);
313: }
314:
315: /* (non-Javadoc)
316: * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
317: */
318: public void startElement(final String namespaceURI,
319: final String localName, final String qName,
320: final Attributes atts) throws SAXException {
321: addCdata();
322:
323: final Map attributes = new HashMap();
324: final Map attributeNamespaces = new HashMap();
325:
326: for (int i = atts.getLength() - 1; i != -1; i--) {
327: if (atts.getURI(i).length() == 0) {
328: attributes.put(atts.getQName(i), atts.getValue(i));
329: } else {
330: attributes.put(atts.getLocalName(i), atts.getValue(i));
331: attributeNamespaces.put(atts.getLocalName(i), atts
332: .getURI(i));
333: }
334:
335: }
336:
337: final Node newElement;
338:
339: if (namespaceURI.length() == 0) {
340: newElement = new Node(this .currentNode, qName, attributes,
341: attributeNamespaces, namespaceURI);
342: } else {
343: newElement = new Node(this .currentNode, localName,
344: attributes, attributeNamespaces, namespaceURI);
345: }
346:
347: if (this .currentNode != null) {
348: this .currentNode.addChild(newElement);
349: }
350:
351: this .stack.push(this .currentNode);
352: this .currentNode = newElement;
353: }
354:
355: /* (non-Javadoc)
356: * @see org.xml.sax.ContentHandler#characters(char[], int, int)
357: */
358: public void characters(final char[] ch, final int start,
359: final int length) throws SAXException {
360: this .charBuffer.append(ch, start, length);
361: }
362:
363: /* (non-Javadoc)
364: * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
365: */
366: public void endElement(final String namespaceURI,
367: final String localName, final String qName)
368: throws SAXException {
369: addCdata();
370:
371: final Object oldCurrentNode = this .stack.pop();
372:
373: if (oldCurrentNode != null) {
374: this .currentNode = (Node) oldCurrentNode;
375: }
376: }
377:
378: /* (non-Javadoc)
379: * @see org.xml.sax.ContentHandler#endDocument()
380: */
381: public void endDocument() throws SAXException {
382: }
383:
384: // Implementation methods
385: //-------------------------------------------------------------------------
386:
387: /**
388: *
389: */
390: private void addCdata() {
391: if (this .charBuffer.length() != 0) {
392: //
393: // This element is preceeded by CDATA if keepWhitespace is false (the default setting) and
394: // it's not whitespace add it to the body
395: // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
396: // but for the sort of work I'm doing ignoring the whitespace is preferable
397: //
398: final String cdata = this .charBuffer.toString();
399:
400: this .charBuffer.setLength(0);
401: if (this .keepWhitespace || cdata.trim().length() != 0) {
402: this.currentNode.addChild(cdata);
403: }
404: }
405: }
406: }
|