001: /*
002: * Copyright 2004 Sun Microsystems, Inc.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.sun.syndication.io;
018:
019: import java.io.ByteArrayInputStream;
020: import java.io.File;
021: import java.io.FileNotFoundException;
022: import java.io.FileReader;
023: import java.io.IOException;
024: import java.io.Reader;
025: import java.util.List;
026:
027: import org.jdom.Document;
028: import org.jdom.JDOMException;
029: import org.jdom.input.DOMBuilder;
030: import org.jdom.input.JDOMParseException;
031: import org.xml.sax.EntityResolver;
032: import org.xml.sax.InputSource;
033: import org.xml.sax.SAXNotRecognizedException;
034: import org.xml.sax.SAXNotSupportedException;
035: import org.xml.sax.XMLReader;
036:
037: import com.sun.syndication.feed.WireFeed;
038: import com.sun.syndication.io.impl.FeedParsers;
039: import com.sun.syndication.io.impl.XmlFixerReader;
040:
041: /**
042: * Parses an XML document (File, InputStream, Reader, W3C SAX InputSource, W3C DOM Document or JDOM Document)
043: * into a WireFeed (RSS/Atom).
044: * <p>
045: * It accepts all flavors of RSS (0.90, 0.91, 0.92, 0.93, 0.94, 1.0 and 2.0) and
046: * Atom 0.3 feeds. Parsers are pluggable (they must implement the WireFeedParser interface).
047: * <p>
048: * The WireFeedInput uses liberal parsers.
049: * <p>
050: * @author Alejandro Abdelnur
051: *
052: */
053: public class WireFeedInput {
054: private static FeedParsers FEED_PARSERS = new FeedParsers();
055: private static final InputSource EMPTY_INPUTSOURCE = new InputSource(
056: new ByteArrayInputStream(new byte[0]));
057: private static final EntityResolver RESOLVER = new EmptyEntityResolver();
058:
059: private static class EmptyEntityResolver implements EntityResolver {
060: public InputSource resolveEntity(String publicId,
061: String systemId) {
062: if (systemId != null && systemId.endsWith(".dtd"))
063: return EMPTY_INPUTSOURCE;
064: return null;
065: }
066: }
067:
068: private boolean _validate;
069:
070: private boolean _xmlHealerOn;
071:
072: /**
073: * Returns the list of supported input feed types.
074: * <p>
075: * @see WireFeed for details on the format of these strings.
076: * <p>
077: * @return a list of String elements with the supported input feed types.
078: *
079: */
080: public static List getSupportedFeedTypes() {
081: return FEED_PARSERS.getSupportedFeedTypes();
082: }
083:
084: /**
085: * Creates a WireFeedInput instance with input validation turned off.
086: * <p>
087: *
088: */
089: public WireFeedInput() {
090: this (false);
091: }
092:
093: /**
094: * Creates a WireFeedInput instance.
095: * <p>
096: * @param validate indicates if the input should be validated. NOT IMPLEMENTED YET (validation does not happen)
097: *
098: */
099: public WireFeedInput(boolean validate) {
100: _validate = false; // TODO FIX THIS THINGY
101: _xmlHealerOn = true;
102: }
103:
104: /**
105: * Enables XML healing in the WiredFeedInput instance.
106: * <p>
107: * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
108: * <p>
109: * Healing resolves HTML entities (from literal to code number) in the reader.
110: * <p>
111: * The healing is done only with the build(File) and build(Reader) signatures.
112: * <p>
113: * By default is TRUE.
114: * <p>
115: * @param heals TRUE enables stream healing, FALSE disables it.
116: *
117: */
118: public void setXmlHealerOn(boolean heals) {
119: _xmlHealerOn = heals;
120: }
121:
122: /**
123: * Indicates if the WiredFeedInput instance will XML heal (if necessary) the character stream.
124: * <p>
125: * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
126: * <p>
127: * Healing resolves HTML entities (from literal to code number) in the reader.
128: * <p>
129: * The healing is done only with the build(File) and build(Reader) signatures.
130: * <p>
131: * By default is TRUE.
132: * <p>
133: * @return TRUE if healing is enabled, FALSE if not.
134: *
135: */
136: public boolean getXmlHealerOn() {
137: return _xmlHealerOn;
138: }
139:
140: /**
141: * Builds an WireFeed (RSS or Atom) from a file.
142: * <p>
143: * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
144: * <p>
145: * @param file file to read to create the WireFeed.
146: * @return the WireFeed read from the file.
147: * @throws FileNotFoundException thrown if the file could not be found.
148: * @throws IOException thrown if there is problem reading the file.
149: * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
150: * @throws FeedException if the feed could not be parsed
151: *
152: */
153: public WireFeed build(File file) throws FileNotFoundException,
154: IOException, IllegalArgumentException, FeedException {
155: WireFeed feed;
156: Reader reader = new FileReader(file);
157: if (_xmlHealerOn) {
158: reader = new XmlFixerReader(reader);
159: }
160: feed = build(reader);
161: reader.close();
162: return feed;
163: }
164:
165: /**
166: * Builds an WireFeed (RSS or Atom) from an Reader.
167: * <p>
168: * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
169: * <p>
170: * @param reader Reader to read to create the WireFeed.
171: * @return the WireFeed read from the Reader.
172: * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
173: * @throws FeedException if the feed could not be parsed
174: *
175: */
176: public WireFeed build(Reader reader)
177: throws IllegalArgumentException, FeedException {
178: SAXBuilder saxBuilder = createSAXBuilder();
179: try {
180: if (_xmlHealerOn) {
181: reader = new XmlFixerReader(reader);
182: }
183: Document document = saxBuilder.build(reader);
184: return build(document);
185: } catch (JDOMParseException ex) {
186: throw new ParsingFeedException("Invalid XML: "
187: + ex.getMessage(), ex);
188: } catch (Exception ex) {
189: throw new ParsingFeedException("Invalid XML", ex);
190: }
191: }
192:
193: /**
194: * Builds an WireFeed (RSS or Atom) from an W3C SAX InputSource.
195: * <p>
196: * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
197: * <p>
198: * @param is W3C SAX InputSource to read to create the WireFeed.
199: * @return the WireFeed read from the W3C SAX InputSource.
200: * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
201: * @throws FeedException if the feed could not be parsed
202: *
203: */
204: public WireFeed build(InputSource is)
205: throws IllegalArgumentException, FeedException {
206: SAXBuilder saxBuilder = createSAXBuilder();
207: try {
208: Document document = saxBuilder.build(is);
209: return build(document);
210: } catch (JDOMParseException ex) {
211: throw new ParsingFeedException("Invalid XML: "
212: + ex.getMessage(), ex);
213: } catch (Exception ex) {
214: throw new ParsingFeedException("Invalid XML", ex);
215: }
216: }
217:
218: /**
219: * Builds an WireFeed (RSS or Atom) from an W3C DOM document.
220: * <p>
221: * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
222: * <p>
223: * @param document W3C DOM document to read to create the WireFeed.
224: * @return the WireFeed read from the W3C DOM document.
225: * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
226: * @throws FeedException if the feed could not be parsed
227: *
228: */
229: public WireFeed build(org.w3c.dom.Document document)
230: throws IllegalArgumentException, FeedException {
231: DOMBuilder domBuilder = new DOMBuilder();
232: try {
233: Document jdomDoc = domBuilder.build(document);
234: return build(jdomDoc);
235: } catch (Exception ex) {
236: throw new ParsingFeedException("Invalid XML", ex);
237: }
238: }
239:
240: /**
241: * Builds an WireFeed (RSS or Atom) from an JDOM document.
242: * <p>
243: * NOTE: All other build methods delegate to this method.
244: * <p>
245: * @param document JDOM document to read to create the WireFeed.
246: * @return the WireFeed read from the JDOM document.
247: * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
248: * @throws FeedException if the feed could not be parsed
249: *
250: */
251: public WireFeed build(Document document)
252: throws IllegalArgumentException, FeedException {
253: WireFeedParser parser = FEED_PARSERS.getParserFor(document);
254: if (parser == null) {
255: throw new IllegalArgumentException("Invalid document");
256: }
257: return parser.parse(document, _validate);
258: }
259:
260: /**
261: * Creates and sets up a org.jdom.input.SAXBuilder for parsing.
262: *
263: * @return a new org.jdom.input.SAXBuilder object
264: */
265: protected SAXBuilder createSAXBuilder() {
266: SAXBuilder saxBuilder = new SAXBuilder(_validate);
267: saxBuilder.setEntityResolver(RESOLVER);
268:
269: //
270: // This code is needed to fix the security problem outlined in http://www.securityfocus.com/archive/1/297714
271: //
272: // Unfortunately there isn't an easy way to check if an XML parser supports a particular feature, so
273: // we need to set it and catch the exception if it fails. We also need to subclass the JDom SAXBuilder
274: // class in order to get access to the underlying SAX parser - otherwise the features don't get set until
275: // we are already building the document, by which time it's too late to fix the problem.
276: //
277: // Crimson is one parser which is known not to support these features.
278: try {
279: XMLReader parser = saxBuilder.createParser();
280: try {
281: parser
282: .setFeature(
283: "http://xml.org/sax/features/external-general-entities",
284: false);
285: saxBuilder
286: .setFeature(
287: "http://xml.org/sax/features/external-general-entities",
288: false);
289: } catch (SAXNotRecognizedException e) {
290: // ignore
291: } catch (SAXNotSupportedException e) {
292: // ignore
293: }
294:
295: try {
296: parser
297: .setFeature(
298: "http://xml.org/sax/features/external-parameter-entities",
299: false);
300: saxBuilder
301: .setFeature(
302: "http://xml.org/sax/features/external-parameter-entities",
303: false);
304: } catch (SAXNotRecognizedException e) {
305: // ignore
306: } catch (SAXNotSupportedException e) {
307: // ignore
308: }
309:
310: try {
311: parser
312: .setFeature(
313: "http://apache.org/xml/features/nonvalidating/load-external-dtd",
314: false);
315: saxBuilder
316: .setFeature(
317: "http://apache.org/xml/features/nonvalidating/load-external-dtd",
318: false);
319: } catch (SAXNotRecognizedException e) {
320: // ignore
321: } catch (SAXNotSupportedException e) {
322: // ignore
323: }
324:
325: } catch (JDOMException e) {
326: throw new IllegalStateException(
327: "JDOM could not create a SAX parser");
328: }
329:
330: saxBuilder.setExpandEntities(false);
331: return saxBuilder;
332: }
333: }
|