001: //
002: // Informa -- RSS Library for Java
003: // Copyright (c) 2002 by Niko Schmuck
004: //
005: // Niko Schmuck
006: // http://sourceforge.net/projects/informa
007: // mailto:niko_schmuck@users.sourceforge.net
008: //
009: // This library is free software.
010: //
011: // You may redistribute it and/or modify it under the terms of the GNU
012: // Lesser General Public License as published by the Free Software Foundation.
013: //
014: // Version 2.1 of the license should be included with this distribution in
015: // the file LICENSE. If the license is not included with this distribution,
016: // you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
017: // or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
018: // MA 02139 USA.
019: //
020: // This library is distributed in the hope that it will be useful,
021: // but WITHOUT ANY WARRANTY; without even the implied warranty of
022: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
023: // Lesser General Public License for more details.
024: //
025:
026: package de.nava.informa.parsers;
027:
028: import java.net.URL;
029: import java.util.ArrayList;
030: import java.util.Date;
031: import java.util.Enumeration;
032: import java.util.Hashtable;
033: import java.util.Iterator;
034: import java.util.List;
035:
036: import org.apache.commons.logging.Log;
037: import org.apache.commons.logging.LogFactory;
038: import org.jdom.Attribute;
039: import org.jdom.Element;
040: import org.jdom.Namespace;
041:
042: import de.nava.informa.core.CategoryIF;
043: import de.nava.informa.core.ChannelBuilderIF;
044: import de.nava.informa.core.ChannelFormat;
045: import de.nava.informa.core.ChannelIF;
046: import de.nava.informa.core.ChannelParserIF;
047: import de.nava.informa.core.ImageIF;
048: import de.nava.informa.core.ItemEnclosureIF;
049: import de.nava.informa.core.ItemGuidIF;
050: import de.nava.informa.core.ItemIF;
051: import de.nava.informa.core.ItemSourceIF;
052: import de.nava.informa.core.ParseException;
053: import de.nava.informa.core.TextInputIF;
054: import de.nava.informa.impl.basic.ChannelBuilder;
055: import de.nava.informa.utils.ParserUtils;
056:
057: /**
058: * Parser which reads in document instances according to the RSS 2.0
059: * specification and generates a news channel object.
060: *
061: * @author Anthony Eden, Niko Schmuck
062: */
063: class RSS_2_0_Parser implements ChannelParserIF {
064:
065: private static Log logger = LogFactory.getLog(RSS_2_0_Parser.class);
066:
067: /**
068: * Private constructor suppresses generation of a (public) default constructor.
069: */
070: private RSS_2_0_Parser() {
071: }
072:
073: /**
074: * Holder of the RSS_2_0_Parser instance.
075: */
076: private static class RSS_2_0_ParserHolder {
077: private static RSS_2_0_Parser instance = new RSS_2_0_Parser();
078: }
079:
080: /**
081: * Get the RSS_2_0_Parser instance.
082: */
083: public static RSS_2_0_Parser getInstance() {
084: return RSS_2_0_ParserHolder.instance;
085: }
086:
087: private CategoryIF getCategoryList(CategoryIF parent, String title,
088: Hashtable children) {
089: // Assuming category hierarchy for each category element
090: // is already mapped out into Hashtable tree; Hense the children Hashtable
091:
092: // create channel builder to help create CategoryIF objects
093: ChannelBuilder builder = new ChannelBuilder();
094:
095: // create current CategoryIF object; Parent may be null if at top level
096: CategoryIF cat = builder.createCategory(parent, title);
097: // iterate off list of keys from children list
098: Enumeration itChild = children.keys();
099: while (itChild.hasMoreElements()) {
100: String childKey = (String) itChild.nextElement();
101: // don't need to keep track of return CategoryIF since it will be added as child of another instance
102: getCategoryList(cat, childKey, (Hashtable) children
103: .get(childKey));
104: }
105: return cat;
106: }
107:
108: /**
109: * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
110: */
111: public ChannelIF parse(ChannelBuilderIF cBuilder, Element root)
112: throws ParseException {
113: if (cBuilder == null) {
114: throw new RuntimeException(
115: "Without builder no channel can " + "be created.");
116: }
117: Date dateParsed = new Date();
118: logger.debug("start parsing.");
119:
120: Namespace defNS = ParserUtils.getDefaultNS(root);
121: if (defNS == null) {
122: defNS = Namespace.NO_NAMESPACE;
123: logger.info("No default namespace found.");
124: }
125: Namespace dcNS = ParserUtils.getNamespace(root, "dc");
126: // fall back to default name space
127: if (dcNS == null) {
128: dcNS = defNS;
129: }
130:
131: // Content namespace
132: Namespace contentNS = ParserUtils.getNamespace(root, "content");
133:
134: ParserUtils.matchCaseOfChildren(root, "channel");
135:
136: // Get the channel element (only one occurs)
137: Element channel = root.getChild("channel", defNS);
138: if (channel == null) {
139: logger
140: .warn("Channel element could not be retrieved from feed.");
141: throw new ParseException(
142: "No channel element found in feed.");
143: }
144:
145: // --- read in channel information
146:
147: ParserUtils.matchCaseOfChildren(channel, new String[] {
148: "title", "description", "link", "language", "item",
149: "image", "textinput", "copyright", "rating", "docs",
150: "generator", "pubDate", "lastBuildDate", "category",
151: "managingEditor", "webMaster", "cloud" });
152:
153: // 1 title element
154: ChannelIF chnl = cBuilder.createChannel(channel, channel
155: .getChildTextTrim("title", defNS));
156:
157: // set channel format
158: chnl.setFormat(ChannelFormat.RSS_2_0);
159:
160: // 1 description element
161: chnl.setDescription(channel.getChildTextTrim("description",
162: defNS));
163:
164: // 1 link element
165: chnl.setSite(ParserUtils.getURL(channel.getChildTextTrim(
166: "link", defNS)));
167:
168: // 1 language element
169: chnl.setLanguage(channel.getChildTextTrim("language", defNS));
170:
171: // 1..n item elements
172: List items = channel.getChildren("item", defNS);
173: Iterator i = items.iterator();
174: while (i.hasNext()) {
175: Element item = (Element) i.next();
176:
177: ParserUtils.matchCaseOfChildren(item, new String[] {
178: "title", "link", "encoded", "description",
179: "subject", "category", "pubDate", "date", "author",
180: "creator", "comments", "guid", "source",
181: "enclosure" });
182:
183: // get title element
184: Element elTitle = item.getChild("title", defNS);
185: String strTitle = "<No Title>";
186: if (elTitle != null) {
187: strTitle = elTitle.getTextTrim();
188: }
189: if (logger.isDebugEnabled()) {
190: logger.debug("Item element found (" + strTitle + ").");
191: }
192:
193: // get link element
194: Element elLink = item.getChild("link", defNS);
195: String strLink = "";
196: if (elLink != null) {
197: strLink = elLink.getTextTrim();
198: }
199:
200: // get description element
201: Element elDesc = item.getChild("encoded", contentNS);
202: if (elDesc == null) {
203: elDesc = item.getChild("description", defNS);
204: }
205: String strDesc = "";
206: if (elDesc != null) {
207: strDesc = elDesc.getTextTrim();
208: }
209:
210: // generate new RSS item (link to article)
211: ItemIF rssItem = cBuilder.createItem(item, chnl, strTitle,
212: strDesc, ParserUtils.getURL(strLink));
213:
214: // get subject element
215: Element elSubject = item.getChild("subject", defNS);
216: if (elSubject == null) {
217: // fallback mechanism: get dc:subject element
218: elSubject = item.getChild("subject", dcNS);
219: }
220: if (elSubject != null) {
221: rssItem.setSubject(elSubject.getTextTrim());
222: }
223:
224: // get category list
225: // get list of <category> elements
226: List listCategory = item.getChildren("category", defNS);
227: if (listCategory.size() < 1) {
228: // fallback mechanism: get dc:category element
229: listCategory = item.getChildren("category", dcNS);
230: }
231: if (listCategory.size() > 0) {
232: RecursiveHashtable<String> catTable = new RecursiveHashtable<String>();
233:
234: // for each category, parse hierarchy
235: Iterator itCat = listCategory.iterator();
236: while (itCat.hasNext()) {
237: RecursiveHashtable<String> currTable = catTable;
238: Element elCategory = (Element) itCat.next();
239: // get contents of category element
240: String[] titles = elCategory.getTextNormalize()
241: .split("/");
242: for (int x = 0; x < titles.length; x++) {
243: // tokenize category string to extract out hierarchy
244: if (currTable.containsKey(titles[x]) == false) {
245: // if token does not exist in current map, add it with child Hashtable
246: currTable.put(titles[x],
247: new RecursiveHashtable<String>());
248: }
249: // reset current Hashtable to child's Hashtable then iterate to next token
250: currTable = currTable.get(titles[x]);
251: }
252: }
253: ArrayList<CategoryIF> catList = new ArrayList<CategoryIF>();
254: // transform cat list & hierarchy into list of CategoryIF elements
255: Enumeration<String> enumCategories = catTable.keys();
256: while (enumCategories.hasMoreElements()) {
257: String key = enumCategories.nextElement();
258: // build category list: getCategoryList(parent, title, children)
259: CategoryIF cat = getCategoryList(null, key,
260: catTable.get(key));
261: catList.add(cat);
262: }
263: if (catList.size() > 0) {
264: // if categories were actually created, then add list to item node
265: rssItem.setCategories(catList);
266: }
267: }
268:
269: // get publication date
270: Element elDate = item.getChild("pubDate", defNS);
271: if (elDate == null) {
272: // fallback mechanism: get dc:date element
273: elDate = item.getChild("date", dcNS);
274: }
275: if (elDate != null) {
276: rssItem.setDate(ParserUtils.getDate(elDate
277: .getTextTrim()));
278: }
279:
280: rssItem.setFound(dateParsed);
281:
282: // get Author element
283: Element elAuthor = item.getChild("author", defNS);
284: if (elAuthor == null) {
285: // fallback mechanism: get dc:creator element
286: elAuthor = item.getChild("creator", dcNS);
287: }
288: if (elAuthor != null)
289: rssItem.setCreator(elAuthor.getTextTrim());
290:
291: // get Comments element
292: Element elComments = item.getChild("comments", defNS);
293: String strComments = "";
294: if (elComments != null) {
295: strComments = elComments.getTextTrim();
296: }
297: rssItem.setComments(ParserUtils.getURL(strComments));
298:
299: // get guid element
300: Element elGuid = item.getChild("guid", defNS);
301: if (elGuid != null) {
302: String guidUrl = elGuid.getTextTrim();
303: if (guidUrl != null) {
304: boolean permaLink = true;
305: Attribute permaLinkAttribute = elGuid.getAttribute(
306: "isPermaLink", defNS);
307: if (permaLinkAttribute != null) {
308: String permaLinkStr = permaLinkAttribute
309: .getValue();
310: if (permaLinkStr != null) {
311: permaLink = Boolean.valueOf(permaLinkStr)
312: .booleanValue();
313: }
314: }
315: ItemGuidIF itemGuid = cBuilder.createItemGuid(
316: rssItem, guidUrl, permaLink);
317: rssItem.setGuid(itemGuid);
318: }
319: }
320:
321: // get source element
322: Element elSource = item.getChild("source", defNS);
323: if (elSource != null) {
324: String sourceName = elSource.getTextTrim();
325: Attribute sourceAttribute = elSource.getAttribute(
326: "url", defNS);
327: if (sourceAttribute != null) {
328: String sourceLocation = sourceAttribute.getValue()
329: .trim();
330: ItemSourceIF itemSource = cBuilder
331: .createItemSource(rssItem, sourceName,
332: sourceLocation, null);
333: rssItem.setSource(itemSource);
334: }
335: }
336:
337: // get enclosure element
338: Element elEnclosure = item.getChild("enclosure", defNS);
339: if (elEnclosure != null) {
340: URL location = null;
341: String type = null;
342: int length = -1;
343: Attribute urlAttribute = elEnclosure.getAttribute(
344: "url", defNS);
345: if (urlAttribute != null) {
346: location = ParserUtils.getURL(urlAttribute
347: .getValue().trim());
348: }
349: Attribute typeAttribute = elEnclosure.getAttribute(
350: "type", defNS);
351: if (typeAttribute != null) {
352: type = typeAttribute.getValue().trim();
353: }
354: Attribute lengthAttribute = elEnclosure.getAttribute(
355: "length", defNS);
356: if (lengthAttribute != null) {
357: try {
358: length = Integer.parseInt(lengthAttribute
359: .getValue().trim());
360: } catch (NumberFormatException e) {
361: logger.warn(e);
362: }
363: }
364: ItemEnclosureIF itemEnclosure = cBuilder
365: .createItemEnclosure(rssItem, location, type,
366: length);
367: rssItem.setEnclosure(itemEnclosure);
368: }
369: }
370:
371: // 0..1 image element
372: Element image = channel.getChild("image", defNS);
373: if (image != null) {
374:
375: ParserUtils.matchCaseOfChildren(image, new String[] {
376: "title", "url", "link", "width", "height",
377: "description" });
378:
379: ImageIF rssImage = cBuilder.createImage(image
380: .getChildTextTrim("title", defNS), ParserUtils
381: .getURL(image.getChildTextTrim("url", defNS)),
382: ParserUtils.getURL(image.getChildTextTrim("link",
383: defNS)));
384: Element imgWidth = image.getChild("width", defNS);
385: if (imgWidth != null) {
386: try {
387: rssImage.setWidth(Integer.parseInt(imgWidth
388: .getTextTrim()));
389: } catch (NumberFormatException e) {
390: logger.warn("Error parsing width: "
391: + e.getMessage());
392: }
393: }
394: Element imgHeight = image.getChild("height", defNS);
395: if (imgHeight != null) {
396: try {
397: rssImage.setHeight(Integer.parseInt(imgHeight
398: .getTextTrim()));
399: } catch (NumberFormatException e) {
400: logger.warn("Error parsing height: "
401: + e.getMessage());
402: }
403: }
404: Element imgDescr = image.getChild("description", defNS);
405: if (imgDescr != null) {
406: rssImage.setDescription(imgDescr.getTextTrim());
407: }
408: chnl.setImage(rssImage);
409: }
410:
411: // 0..1 textinput element
412: Element txtinp = channel.getChild("textinput", defNS);
413: if (txtinp != null) {
414:
415: ParserUtils.matchCaseOfChildren(txtinp, new String[] {
416: "title", "description", "name", "link" });
417:
418: TextInputIF rssTextInput = cBuilder.createTextInput(txtinp
419: .getChildTextTrim("title", defNS), txtinp
420: .getChildTextTrim("description", defNS), txtinp
421: .getChildTextTrim("name", defNS), ParserUtils
422: .getURL(txtinp.getChildTextTrim("link", defNS)));
423: chnl.setTextInput(rssTextInput);
424: }
425:
426: // 0..1 copyright element
427: Element copyright = channel.getChild("copyright", defNS);
428: if (copyright != null) {
429: chnl.setCopyright(copyright.getTextTrim());
430: }
431:
432: // 0..1 Rating element
433: Element rating = channel.getChild("rating", defNS);
434: if (rating != null) {
435: chnl.setRating(rating.getTextTrim());
436: }
437:
438: // 0..1 Docs element
439: Element docs = channel.getChild("docs", defNS);
440: if (docs != null) {
441: chnl.setDocs(docs.getTextTrim());
442: }
443:
444: // 0..1 Generator element
445: Element generator = channel.getChild("generator", defNS);
446: if (generator != null) {
447: chnl.setGenerator(generator.getTextTrim());
448: }
449:
450: // 0..1 ttl element
451: Element ttl = channel.getChild("ttl", defNS);
452: if (ttl != null) {
453: String ttlValue = ttl.getTextTrim();
454: try {
455: chnl.setTtl(Integer.parseInt(ttlValue));
456: } catch (NumberFormatException e) {
457: logger.warn("Invalid TTL format: '" + ttlValue + "'");
458: }
459: }
460:
461: // 0..1 pubDate element
462: Element pubDate = channel.getChild("pubDate", defNS);
463: if (pubDate != null) {
464: chnl.setPubDate(ParserUtils.getDate(pubDate.getTextTrim()));
465: }
466:
467: // 0..1 lastBuildDate element
468: Element lastBuildDate = channel
469: .getChild("lastBuildDate", defNS);
470: if (lastBuildDate != null) {
471: chnl.setLastBuildDate(ParserUtils.getDate(lastBuildDate
472: .getTextTrim()));
473: }
474:
475: // get category list
476: // get list of <category> elements
477: List listCategory = channel.getChildren("category", defNS);
478: if (listCategory.size() < 1) {
479: // fallback mechanism: get dc:category element
480: listCategory = channel.getChildren("category", dcNS);
481: }
482: if (listCategory.size() > 0) {
483: RecursiveHashtable<String> catTable = new RecursiveHashtable<String>();
484: // for each category, parse hierarchy
485: Iterator itCat = listCategory.iterator();
486: while (itCat.hasNext()) {
487: RecursiveHashtable<String> currTable = catTable;
488: Element elCategory = (Element) itCat.next();
489: // get contents of category element
490: String[] titles = elCategory.getTextNormalize().split(
491: "/");
492: for (int x = 0; x < titles.length; x++) {
493: // tokenize category string to extract out hierarchy
494: if (currTable.containsKey(titles[x]) == false) {
495: // if token does not exist in current map, add it with child Hashtable
496: currTable.put(titles[x],
497: new RecursiveHashtable<String>());
498: }
499: // reset current Hashtable to child's Hashtable then iterate to next token
500: currTable = currTable.get(titles[x]);
501: }
502: }
503: ArrayList<CategoryIF> catList = new ArrayList<CategoryIF>();
504: // transform cat list & hierarchy into list of CategoryIF elements
505: Enumeration<String> enumCategories = catTable.keys();
506: while (enumCategories.hasMoreElements()) {
507: String key = enumCategories.nextElement();
508: // build category list: getCategoryList(parent, title, children)
509: CategoryIF cat = getCategoryList(null, key, catTable
510: .get(key));
511: catList.add(cat);
512: }
513: if (catList.size() > 0) {
514: // if categories were actually created, then add list to item node
515: chnl.setCategories(catList);
516: }
517: }
518:
519: // 0..1 managingEditor element
520: Element managingEditor = channel.getChild("managingEditor",
521: defNS);
522: if (managingEditor != null) {
523: chnl.setCreator(managingEditor.getTextTrim());
524: }
525:
526: // 0..1 webMaster element
527: Element webMaster = channel.getChild("webMaster", defNS);
528: if (webMaster != null) {
529: chnl.setPublisher(webMaster.getTextTrim());
530: }
531:
532: // 0..1 cloud element
533: Element cloud = channel.getChild("cloud", defNS);
534: if (cloud != null) {
535: String _port = cloud.getAttributeValue("port", defNS);
536: int port = -1;
537: if (_port != null) {
538: try {
539: port = Integer.parseInt(_port);
540: } catch (NumberFormatException e) {
541: logger.warn(e);
542: }
543: }
544: chnl.setCloud(cBuilder.createCloud(cloud.getAttributeValue(
545: "domain", defNS), port, cloud.getAttributeValue(
546: "path", defNS), cloud.getAttributeValue(
547: "registerProcedure", defNS), cloud
548: .getAttributeValue("protocol", defNS)));
549: }
550:
551: chnl.setLastUpdated(dateParsed);
552:
553: // 0..1 skipHours element
554: // 0..1 skipDays element
555:
556: return chnl;
557: }
558:
559: /**
560: * Implement type safety in a hashtable of hashtables.
561: * @author Italo Borssatto
562: */
563: private static class RecursiveHashtable<T> extends
564: Hashtable<T, RecursiveHashtable<T>> {
565: /**
566: * <code>serialVersionUID</code>
567: */
568: private static final long serialVersionUID = -3748524793347081535L;
569:
570: /**
571: * @see java.util.Hashtable#put(java.lang.Object, java.lang.Object)
572: */
573: @Override
574: public synchronized RecursiveHashtable<T> put(T key,
575: RecursiveHashtable<T> value) {
576: return super.put(key, value);
577: }
578: }
579:
580: }
|