001: /**
002: * RSS framework and reader
003: * Copyright (C) 2004 Christian Robert
004: *
005: * This library is free software; you can redistribute it and/or
006: * modify it under the terms of the GNU Lesser General Public
007: * License as published by the Free Software Foundation; either
008: * version 2.1 of the License, or (at your option) any later version.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
013: * Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General Public
016: * License along with this library; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: */package org.jperdian.rss2;
019:
020: import java.io.BufferedInputStream;
021: import java.io.IOException;
022: import java.io.InputStream;
023: import java.net.URL;
024: import java.util.List;
025:
026: import javax.xml.parsers.DocumentBuilder;
027: import javax.xml.parsers.DocumentBuilderFactory;
028: import javax.xml.parsers.ParserConfigurationException;
029:
030: import org.jperdian.rss2.dom.RssChannel;
031: import org.jperdian.rss2.dom.RssCloud;
032: import org.jperdian.rss2.dom.RssConstants;
033: import org.jperdian.rss2.dom.RssEnclosure;
034: import org.jperdian.rss2.dom.RssGuid;
035: import org.jperdian.rss2.dom.RssImage;
036: import org.jperdian.rss2.dom.RssItem;
037: import org.jperdian.rss2.dom.RssTextInput;
038: import org.w3c.dom.Document;
039: import org.w3c.dom.Element;
040: import org.w3c.dom.Node;
041: import org.w3c.dom.NodeList;
042: import org.xml.sax.SAXException;
043:
044: /**
045: * The parser to process an XML document and transfer it into an
046: * <code>RssMessage</code>
047: *
048: * @author Christian Robert
049: */
050:
051: public class RssParser {
052:
053: private DocumentBuilder myDocumentBuilder = null;
054:
055: public RssParser() {
056: try {
057: DocumentBuilder builder = DocumentBuilderFactory
058: .newInstance().newDocumentBuilder();
059: this .setDocumentBuilder(builder);
060: } catch (ParserConfigurationException e) {
061: throw new RuntimeException("Cannot create DocumentBuilder",
062: e);
063: }
064: }
065:
066: /**
067: * Transfers the given XML document into a value <code>RssMessage</code>
068: * object
069: */
070: public RssChannel parse(URL sourceURL, RssChannel targetChannel)
071: throws RssException {
072: try {
073: InputStream inStream = new BufferedInputStream(sourceURL
074: .openStream());
075: Document document = this .getDocumentBuilder().parse(
076: inStream);
077: inStream.close();
078: return this .parse(document, targetChannel);
079: } catch (SAXException e) {
080: throw new RssParseException("Illegal XML format \n["
081: + e.getMessage() + "]", e);
082: } catch (IOException e) {
083: throw new RssException("Cannot connect to source URL: "
084: + sourceURL, e);
085: }
086: }
087:
088: /**
089: * Transfers the given XML document into a value <code>RssMessage</code>
090: * object
091: */
092: public RssChannel parse(Document xmlDocument,
093: RssChannel targetChannel) throws RssParseException {
094: RssChannel resultChannel = null;
095: Element rootElement = xmlDocument.getDocumentElement();
096: NodeList rootSubNodes = rootElement.getChildNodes();
097: for (int i = 0; i < rootSubNodes.getLength(); i++) {
098: Node subNode = rootSubNodes.item(i);
099: String subNodeName = subNode.getNodeName();
100: if (subNode.getNodeType() == Node.ELEMENT_NODE
101: && subNodeName.equalsIgnoreCase("channel")) {
102: resultChannel = this .parseChannel(
103: (Element) rootSubNodes.item(i), targetChannel);
104: } else if (subNodeName.equalsIgnoreCase("item")) {
105: RssItem item = this .parseItem((Element) subNode);
106: item.setSource(resultChannel);
107: resultChannel.addItem(item);
108: }
109: }
110: if (resultChannel != null) {
111: return resultChannel;
112: } else {
113: throw new RssParseException(
114: "No channel element found in message");
115: }
116: }
117:
118: /**
119: * Parses the content of the given <code>channel</code> element, analyze
120: * it's content and generate a valid <code>RssChannel</code> object
121: */
122: protected RssChannel parseChannel(Element channelElement,
123: RssChannel channel) throws RssParseException {
124: List itemList = channel.getItemList();
125: if (itemList != null && itemList.size() > 0) {
126: itemList.clear();
127: }
128: NodeList subNodes = channelElement.getChildNodes();
129: for (int i = 0; i < subNodes.getLength(); i++) {
130: if (subNodes.item(i).getNodeType() == Node.ELEMENT_NODE) {
131: Element subElement = (Element) subNodes.item(i);
132: String elemName = subElement.getNodeName();
133: if (elemName.equalsIgnoreCase("title")) {
134: channel.setTitle(RssParseHelper
135: .parseContentChildren(subElement));
136: } else if (elemName.equalsIgnoreCase("link")) {
137: channel.setLink(RssParseHelper
138: .parseContentURL(subElement));
139: } else if (elemName.equalsIgnoreCase("description")) {
140: channel.setDescription(RssParseHelper
141: .parseContentChildren(subElement));
142: } else if (elemName.equalsIgnoreCase("copyright")) {
143: channel.setCopyright(RssParseHelper
144: .parseContentChildren(subElement));
145: } else if (elemName.equalsIgnoreCase("managingEditor")) {
146: channel.setManagingEditor(RssParseHelper
147: .parseContentChildren(subElement));
148: } else if (elemName.equalsIgnoreCase("webMaster")) {
149: channel.setWebmaster(RssParseHelper
150: .parseContentChildren(subElement));
151: } else if (elemName.equalsIgnoreCase("pubDate")) {
152: channel.setPubDate(RssParseHelper
153: .parseContentDate(subElement));
154: } else if (elemName.equalsIgnoreCase("lastBuildDate")) {
155: channel.setLastBuildDate(RssParseHelper
156: .parseContentDate(subElement));
157: } else if (elemName.equalsIgnoreCase("category")) {
158: channel.addCategory(RssParseHelper
159: .parseContentChildren(subElement));
160: } else if (elemName.equalsIgnoreCase("generator")) {
161: channel.setGenerator(RssParseHelper
162: .parseContentChildren(subElement));
163: } else if (elemName.equalsIgnoreCase("docs")) {
164: channel.setDocs(RssParseHelper
165: .parseContentURL(subElement));
166: } else if (elemName.equalsIgnoreCase("cloud")) {
167: channel.setCloud(this .parseCloud(subElement));
168: } else if (elemName.equalsIgnoreCase("ttl")) {
169: channel.setTtl(RssParseHelper
170: .parseContentInt(subElement));
171: } else if (elemName.equalsIgnoreCase("image")) {
172: channel.setImage(this .parseImage(subElement));
173: } else if (elemName.equalsIgnoreCase("rating")) {
174: channel.setRating(RssParseHelper
175: .parseContentChildren(subElement));
176: } else if (elemName.equalsIgnoreCase("textInput")) {
177: channel.setTextInput(this
178: .parseTextInput(subElement));
179: } else if (elemName.equalsIgnoreCase("skipHours")) {
180: channel.addSkipHour(RssParseHelper
181: .parseContentInt(subElement));
182: } else if (elemName.equalsIgnoreCase("skipDays")) {
183: channel.addSkipDay(RssParseHelper
184: .parseContentChildren(subElement));
185: } else if (elemName.equalsIgnoreCase("item")) {
186: RssItem item = this .parseItem(subElement);
187: item.setSource(channel);
188: channel.addItem(item);
189: }
190: }
191: }
192: return channel;
193: }
194:
195: /**
196: * Parses a <tt>cloud</tt> element
197: */
198: protected RssCloud parseCloud(Element cloudElement)
199: throws RssParseException {
200: RssCloud cloud = new RssCloud();
201: cloud.setDomain(cloudElement.getAttribute("domain"));
202: try {
203: cloud.setPort(Integer.parseInt(cloudElement
204: .getAttribute("port")));
205: } catch (NumberFormatException e) {
206: throw new RssParseException(
207: "Illegal port entered for cloud: "
208: + cloudElement.getAttribute("port"));
209: }
210: cloud.setPath(cloudElement.getAttribute("path"));
211: cloud.setRegisterProcedure(cloudElement
212: .getAttribute("registerProcedure"));
213: return cloud;
214: }
215:
216: /**
217: * Parses a <tt>textInput</tt> element
218: */
219: protected RssTextInput parseTextInput(Element textInputElement)
220: throws RssParseException {
221: RssTextInput textInput = new RssTextInput();
222: NodeList subNodes = textInputElement.getChildNodes();
223: for (int i = 0; i < subNodes.getLength(); i++) {
224: if (subNodes.item(i).getNodeType() == Node.ELEMENT_NODE) {
225: Element subElement = (Element) subNodes.item(i);
226: String elemName = subElement.getNodeName();
227: if (elemName.equalsIgnoreCase("title")) {
228: textInput.setTitle(RssParseHelper
229: .parseContentChildren(subElement));
230: } else if (elemName.equalsIgnoreCase("description")) {
231: textInput.setDescription(RssParseHelper
232: .parseContentChildren(subElement));
233: } else if (elemName.equalsIgnoreCase("name")) {
234: textInput.setName(RssParseHelper
235: .parseContentChildren(subElement));
236: } else if (elemName.equalsIgnoreCase("link")) {
237: textInput.setLink(RssParseHelper
238: .parseContentURL(subElement));
239: }
240: }
241: }
242: return textInput;
243: }
244:
245: /**
246: * Parses a <tt>image</tt> element
247: */
248: protected RssImage parseImage(Element textInputElement)
249: throws RssParseException {
250: RssImage image = new RssImage();
251: NodeList subNodes = textInputElement.getChildNodes();
252: for (int i = 0; i < subNodes.getLength(); i++) {
253: if (subNodes.item(i).getNodeType() == Node.ELEMENT_NODE) {
254: Element subElement = (Element) subNodes.item(i);
255: String elemName = subElement.getNodeName();
256: if (elemName.equalsIgnoreCase("title")) {
257: image.setTitle(RssParseHelper
258: .parseContentChildren(subElement));
259: } else if (elemName.equalsIgnoreCase("url")) {
260: image.setURL(RssParseHelper
261: .parseContentURL(subElement));
262: } else if (elemName.equalsIgnoreCase("title")) {
263: image.setTitle(RssParseHelper
264: .parseContentChildren(subElement));
265: } else if (elemName.equalsIgnoreCase("link")) {
266: image.setLink(RssParseHelper
267: .parseContentURL(subElement));
268: } else if (elemName.equalsIgnoreCase("description")) {
269: image.setDescription(RssParseHelper
270: .parseContentChildren(subElement));
271: } else if (elemName.equalsIgnoreCase("width")) {
272: image.setWidth(RssParseHelper.parseContentInt(
273: subElement,
274: RssConstants.DEFAULT_IMAGE_WIDTH));
275: } else if (elemName.equalsIgnoreCase("height")) {
276: image.setHeight(RssParseHelper.parseContentInt(
277: subElement,
278: RssConstants.DEFAULT_IMAGE_HEIGHT));
279: }
280: }
281: }
282: return image;
283: }
284:
285: /**
286: * Parses a <tt>item</tt> element
287: */
288: protected RssItem parseItem(Element itemElement)
289: throws RssParseException {
290: RssItem item = new RssItem();
291: NodeList subNodes = itemElement.getChildNodes();
292: for (int i = 0; i < subNodes.getLength(); i++) {
293: if (subNodes.item(i).getNodeType() == Node.ELEMENT_NODE) {
294: Element subElement = (Element) subNodes.item(i);
295: String elemName = subElement.getNodeName();
296: if (elemName.equalsIgnoreCase("title")) {
297: item.setTitle(RssParseHelper
298: .parseContentChildren(subElement));
299: } else if (elemName.equalsIgnoreCase("link")) {
300: item.setLink(RssParseHelper
301: .parseContentURL(subElement));
302: } else if (elemName.equalsIgnoreCase("description")) {
303: item.setDescription(RssParseHelper
304: .parseContentChildren(subElement));
305: } else if (elemName.equalsIgnoreCase("author")) {
306: item.setAuthor(RssParseHelper
307: .parseContentChildren(subElement));
308: } else if (elemName.equalsIgnoreCase("category")) {
309: item.addCategory(RssParseHelper
310: .parseContentChildren(subElement));
311: } else if (elemName.equalsIgnoreCase("comments")) {
312: item.setComments(RssParseHelper
313: .parseContentChildren(subElement));
314: } else if (elemName.equalsIgnoreCase("enclosure")) {
315: item.setEnclosure(this .parseEnclosure(subElement));
316: } else if (elemName.equalsIgnoreCase("guid")) {
317: item.setGuid(this .parseGuid(subElement));
318: } else if (elemName.equalsIgnoreCase("pubDate")) {
319: item.setPubDate(RssParseHelper
320: .parseContentDate(subElement));
321: }
322: }
323: }
324: return item;
325: }
326:
327: /**
328: * Parses a <tt>enclosure</tt> element
329: */
330: protected RssEnclosure parseEnclosure(Element enclosureElement)
331: throws RssParseException {
332: RssEnclosure enclosure = new RssEnclosure();
333: enclosure.setURL(RssParseHelper.parseURL(enclosureElement
334: .getAttribute("url")));
335: try {
336: enclosure.setLength(Long.parseLong(enclosureElement
337: .getAttribute("length")));
338: } catch (NumberFormatException e) {
339: throw new RssParseException(
340: "Illegal length entered for enclosure: "
341: + enclosureElement.getAttribute("length"));
342: }
343: enclosure.setType(enclosureElement.getAttribute("type"));
344: return enclosure;
345: }
346:
347: /**
348: * Parses a <tt>guid</tt> element
349: */
350: protected RssGuid parseGuid(Element guidElement)
351: throws RssParseException {
352: RssGuid guid = new RssGuid();
353: String permaLink = guidElement.getAttribute("isPermaLink");
354: if (permaLink != null) {
355: guid.setIsPermaLink(permaLink.equalsIgnoreCase("true"));
356: }
357: guid.setGuid(RssParseHelper.parseContentChildren(guidElement));
358: return guid;
359: }
360:
361: // --------------------------------------------------------------------------
362: // -- Property access methods ---------------------------------------------
363: // --------------------------------------------------------------------------
364:
365: /**
366: * Sets the <code>DocumentBuilder</code> used for XML parsing
367: */
368: protected void setDocumentBuilder(DocumentBuilder builder) {
369: this .myDocumentBuilder = builder;
370: }
371:
372: /**
373: * Gets the <code>DocumentBuilder</code> used for XML parsing
374: */
375: protected DocumentBuilder getDocumentBuilder() {
376: return this.myDocumentBuilder;
377: }
378:
379: }
|