001: /*
002: * Copyright 2004 Sun Microsystems, Inc.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.sun.syndication.io.impl;
018:
019: import java.util.ArrayList;
020: import java.util.Iterator;
021: import java.util.List;
022:
023: import org.jdom.Document;
024: import org.jdom.Element;
025: import org.jdom.Namespace;
026: import org.jdom.output.XMLOutputter;
027:
028: import com.sun.syndication.feed.WireFeed;
029: import com.sun.syndication.feed.atom.Category;
030: import com.sun.syndication.feed.atom.Content;
031: import com.sun.syndication.feed.atom.Entry;
032: import com.sun.syndication.feed.atom.Feed;
033: import com.sun.syndication.feed.atom.Generator;
034: import com.sun.syndication.feed.atom.Link;
035: import com.sun.syndication.feed.atom.Person;
036: import com.sun.syndication.io.FeedException;
037: import java.net.MalformedURLException;
038: import java.net.URL;
039: import java.util.regex.Pattern;
040: import org.jdom.Attribute;
041: import org.jdom.Parent;
042:
043: /**
044: * Parser for Atom 1.0
045: * @author Dave Johnson
046: */
047: public class Atom10Parser extends BaseWireFeedParser {
048: private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
049: Namespace ns = Namespace.getNamespace(ATOM_10_URI);
050:
051: public Atom10Parser() {
052: this ("atom_1.0");
053: }
054:
055: protected Atom10Parser(String type) {
056: super (type);
057: }
058:
059: protected Namespace getAtomNamespace() {
060: return ns;
061: }
062:
063: public boolean isMyType(Document document) {
064: Element rssRoot = document.getRootElement();
065: Namespace defaultNS = rssRoot.getNamespace();
066: return (defaultNS != null)
067: && defaultNS.equals(getAtomNamespace());
068: }
069:
070: public WireFeed parse(Document document, boolean validate)
071: throws IllegalArgumentException, FeedException {
072: if (validate) {
073: validateFeed(document);
074: }
075: Element rssRoot = document.getRootElement();
076: return parseFeed(rssRoot);
077: }
078:
079: protected void validateFeed(Document document) throws FeedException {
080: // TBD
081: // here we have to validate the Feed against a schema or whatever
082: // not sure how to do it
083: // one posibility would be to produce an ouput and attempt to parse it again
084: // with validation turned on.
085: // otherwise will have to check the document elements by hand.
086: }
087:
088: protected WireFeed parseFeed(Element eFeed) throws FeedException {
089:
090: com.sun.syndication.feed.atom.Feed feed = new com.sun.syndication.feed.atom.Feed(
091: getType());
092:
093: String baseURI = null;
094: try {
095: baseURI = findBaseURI(eFeed);
096: } catch (Exception e) {
097: throw new FeedException(
098: "ERROR while finding base URI of feed", e);
099: }
100:
101: String xmlBase = eFeed.getAttributeValue("base",
102: Namespace.XML_NAMESPACE);
103: if (xmlBase != null) {
104: feed.setXmlBase(xmlBase);
105: }
106:
107: Element e = eFeed.getChild("title", getAtomNamespace());
108: if (e != null) {
109: Content c = new Content();
110: c.setValue(parseTextConstructToString(e));
111: c.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
112: feed.setTitleEx(c);
113: }
114:
115: List eList = eFeed.getChildren("link", getAtomNamespace());
116: feed.setAlternateLinks(parseAlternateLinks(feed, null, baseURI,
117: eList));
118: feed.setOtherLinks(parseOtherLinks(feed, null, baseURI, eList));
119:
120: List cList = eFeed.getChildren("category", getAtomNamespace());
121: feed.setCategories(parseCategories(baseURI, cList));
122:
123: eList = eFeed.getChildren("author", getAtomNamespace());
124: if (eList.size() > 0) {
125: feed.setAuthors(parsePersons(baseURI, eList));
126: }
127:
128: eList = eFeed.getChildren("contributor", getAtomNamespace());
129: if (eList.size() > 0) {
130: feed.setContributors(parsePersons(baseURI, eList));
131: }
132:
133: e = eFeed.getChild("subtitle", getAtomNamespace());
134: if (e != null) {
135: Content subtitle = new Content();
136: subtitle.setValue(parseTextConstructToString(e));
137: subtitle.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
138: feed.setSubtitle(subtitle);
139: }
140:
141: e = eFeed.getChild("id", getAtomNamespace());
142: if (e != null) {
143: feed.setId(e.getText());
144: }
145:
146: e = eFeed.getChild("generator", getAtomNamespace());
147: if (e != null) {
148: Generator gen = new Generator();
149: gen.setValue(e.getText());
150: String att = e.getAttributeValue("uri");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
151: if (att != null) {
152: gen.setUrl(att);
153: }
154: att = e.getAttributeValue("version");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
155: if (att != null) {
156: gen.setVersion(att);
157: }
158: feed.setGenerator(gen);
159: }
160:
161: e = eFeed.getChild("rights", getAtomNamespace());
162: if (e != null) {
163: feed.setRights(parseTextConstructToString(e));
164: }
165:
166: e = eFeed.getChild("icon", getAtomNamespace());
167: if (e != null) {
168: feed.setIcon(e.getText());
169: }
170:
171: e = eFeed.getChild("logo", getAtomNamespace());
172: if (e != null) {
173: feed.setLogo(e.getText());
174: }
175:
176: e = eFeed.getChild("updated", getAtomNamespace());
177: if (e != null) {
178: feed.setUpdated(DateParser.parseDate(e.getText()));
179: }
180:
181: feed.setModules(parseFeedModules(eFeed));
182:
183: eList = eFeed.getChildren("entry", getAtomNamespace());
184: if (eList.size() > 0) {
185: feed.setEntries(parseEntries(feed, baseURI, eList));
186: }
187:
188: List foreignMarkup = extractForeignMarkup(eFeed, feed,
189: getAtomNamespace());
190: if (foreignMarkup.size() > 0) {
191: feed.setForeignMarkup(foreignMarkup);
192: }
193: return feed;
194: }
195:
196: private Link parseLink(Feed feed, Entry entry, String baseURI,
197: Element eLink) {
198: Link link = new Link();
199: String att = eLink.getAttributeValue("rel");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
200: if (att != null) {
201: link.setRel(att);
202: }
203: att = eLink.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
204: if (att != null) {
205: link.setType(att);
206: }
207: att = eLink.getAttributeValue("href");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
208: if (att != null) {
209: if (isRelativeURI(att)) {
210: link.setHref(resolveURI(baseURI, eLink, att));
211: } else {
212: link.setHref(att);
213: }
214: }
215: att = eLink.getAttributeValue("title");
216: if (att != null) {
217: link.setTitle(att);
218: }
219: att = eLink.getAttributeValue("hreflang");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
220: if (att != null) {
221: link.setHreflang(att);
222: }
223: att = eLink.getAttributeValue("length");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
224: if (att != null) {
225: link.setLength(Long.parseLong(att));
226: }
227: return link;
228: }
229:
230: // List(Elements) -> List(Link)
231: private List parseAlternateLinks(Feed feed, Entry entry,
232: String baseURI, List eLinks) {
233: List links = new ArrayList();
234: for (int i = 0; i < eLinks.size(); i++) {
235: Element eLink = (Element) eLinks.get(i);
236: Link link = parseLink(feed, entry, baseURI, eLink);
237: if (link.getRel() == null
238: || "".equals(link.getRel().trim())
239: || "alternate".equals(link.getRel())) {
240: links.add(link);
241: }
242: }
243: return (links.size() > 0) ? links : null;
244: }
245:
246: private List parseOtherLinks(Feed feed, Entry entry,
247: String baseURI, List eLinks) {
248: List links = new ArrayList();
249: for (int i = 0; i < eLinks.size(); i++) {
250: Element eLink = (Element) eLinks.get(i);
251: Link link = parseLink(feed, entry, baseURI, eLink);
252: if (!"alternate".equals(link.getRel())) {
253: links.add(link);
254: }
255: }
256: return (links.size() > 0) ? links : null;
257: }
258:
259: private Person parsePerson(String baseURI, Element ePerson) {
260: Person person = new Person();
261: Element e = ePerson.getChild("name", getAtomNamespace());
262: if (e != null) {
263: person.setName(e.getText());
264: }
265: e = ePerson.getChild("uri", getAtomNamespace());
266: if (e != null) {
267: person.setUri(resolveURI(baseURI, ePerson, e.getText()));
268: }
269: e = ePerson.getChild("email", getAtomNamespace());
270: if (e != null) {
271: person.setEmail(e.getText());
272: }
273: return person;
274: }
275:
276: // List(Elements) -> List(Persons)
277: private List parsePersons(String baseURI, List ePersons) {
278: List persons = new ArrayList();
279: for (int i = 0; i < ePersons.size(); i++) {
280: persons
281: .add(parsePerson(baseURI, (Element) ePersons.get(i)));
282: }
283: return (persons.size() > 0) ? persons : null;
284: }
285:
286: private Content parseContent(Element e) {
287: String value = parseTextConstructToString(e);
288: String src = e.getAttributeValue("src");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
289: String type = e.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
290: Content content = new Content();
291: content.setSrc(src);
292: content.setType(type);
293: content.setValue(value);
294: return content;
295: }
296:
297: private String parseTextConstructToString(Element e) {
298: String value = null;
299: String type = e.getAttributeValue("type");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
300: type = (type != null) ? type : Content.TEXT;
301: if (type.equals(Content.XHTML)) {
302: // XHTML content needs special handling
303: XMLOutputter outputter = new XMLOutputter();
304: List eContent = e.getContent();
305: Iterator i = eContent.iterator();
306: while (i.hasNext()) {
307: org.jdom.Content c = (org.jdom.Content) i.next();
308: if (c instanceof Element) {
309: Element eC = (Element) c;
310: if (eC.getNamespace().equals(getAtomNamespace())) {
311: ((Element) c)
312: .setNamespace(Namespace.NO_NAMESPACE);
313: }
314: }
315: }
316: value = outputter.outputString(eContent);
317: } else {
318: // Everything else comes in verbatim
319: value = e.getText();
320: }
321: return value;
322: }
323:
324: // List(Elements) -> List(Entries)
325: protected List parseEntries(Feed feed, String baseURI, List eEntries) {
326: List entries = new ArrayList();
327: for (int i = 0; i < eEntries.size(); i++) {
328: entries.add(parseEntry(feed, (Element) eEntries.get(i),
329: baseURI));
330: }
331: return (entries.size() > 0) ? entries : null;
332: }
333:
334: protected Entry parseEntry(Feed feed, Element eEntry, String baseURI) {
335: Entry entry = new Entry();
336:
337: String xmlBase = eEntry.getAttributeValue("base",
338: Namespace.XML_NAMESPACE);
339: if (xmlBase != null) {
340: entry.setXmlBase(xmlBase);
341: }
342:
343: Element e = eEntry.getChild("title", getAtomNamespace());
344: if (e != null) {
345: Content c = new Content();
346: c.setValue(parseTextConstructToString(e));
347: c.setType(e.getAttributeValue("type")); //, Namespace.XML_NAMESPACE));
348: entry.setTitleEx(c);
349: }
350:
351: List eList = eEntry.getChildren("link", getAtomNamespace());
352: entry.setAlternateLinks(parseAlternateLinks(feed, entry,
353: baseURI, eList));
354: entry
355: .setOtherLinks(parseOtherLinks(feed, entry, baseURI,
356: eList));
357:
358: eList = eEntry.getChildren("author", getAtomNamespace());
359: if (eList.size() > 0) {
360: entry.setAuthors(parsePersons(baseURI, eList));
361: }
362:
363: eList = eEntry.getChildren("contributor", getAtomNamespace());
364: if (eList.size() > 0) {
365: entry.setContributors(parsePersons(baseURI, eList));
366: }
367:
368: e = eEntry.getChild("id", getAtomNamespace());
369: if (e != null) {
370: entry.setId(e.getText());
371: }
372:
373: e = eEntry.getChild("updated", getAtomNamespace());
374: if (e != null) {
375: entry.setUpdated(DateParser.parseW3CDateTime(e.getText()));
376: }
377:
378: e = eEntry.getChild("published", getAtomNamespace());
379: if (e != null) {
380: entry
381: .setPublished(DateParser.parseW3CDateTime(e
382: .getText()));
383: }
384:
385: e = eEntry.getChild("summary", getAtomNamespace());
386: if (e != null) {
387: entry.setSummary(parseContent(e));
388: }
389:
390: e = eEntry.getChild("content", getAtomNamespace());
391: if (e != null) {
392: List contents = new ArrayList();
393: contents.add(parseContent(e));
394: entry.setContents(contents);
395: }
396:
397: e = eEntry.getChild("rights", getAtomNamespace());
398: if (e != null) {
399: entry.setRights(e.getText());
400: }
401:
402: List cList = eEntry.getChildren("category", getAtomNamespace());
403: entry.setCategories(parseCategories(baseURI, cList));
404:
405: // TODO: SHOULD handle Atom entry source element
406:
407: entry.setModules(parseItemModules(eEntry));
408:
409: List foreignMarkup = extractForeignMarkup(eEntry, entry,
410: getAtomNamespace());
411: if (foreignMarkup.size() > 0) {
412: entry.setForeignMarkup(foreignMarkup);
413: }
414: return entry;
415: }
416:
417: private List parseCategories(String baseURI, List eCategories) {
418: List cats = new ArrayList();
419: for (int i = 0; i < eCategories.size(); i++) {
420: Element eCategory = (Element) eCategories.get(i);
421: cats.add(parseCategory(baseURI, eCategory));
422: }
423: return (cats.size() > 0) ? cats : null;
424: }
425:
426: private Category parseCategory(String baseURI, Element eCategory) {
427: Category category = new Category();
428: String att = eCategory.getAttributeValue("term");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
429: if (att != null) {
430: category.setTerm(att);
431: }
432: att = eCategory.getAttributeValue("scheme");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
433: if (att != null) {
434: category.setScheme(resolveURI(baseURI, eCategory, att));
435: }
436: att = eCategory.getAttributeValue("label");//getAtomNamespace()); DONT KNOW WHY DOESN'T WORK
437: if (att != null) {
438: category.setLabel(att);
439: }
440: return category;
441:
442: }
443:
444: // Fix for issue #34 "valid IRI href attributes are stripped for atom:link"
445: // URI's that didn't start with http were being treated as relative URIs.
446: // So now consider an absolute URI to be any alpha-numeric string followed
447: // by a colon, followed by anything -- specified by this regex:
448: static Pattern absoluteURIPattern = Pattern
449: .compile("^[a-z0-9]*:.*$");
450:
451: private boolean isAbsoluteURI(String uri) {
452: return absoluteURIPattern.matcher(uri).find();
453: }
454:
455: private boolean isRelativeURI(String uri) {
456: return !isAbsoluteURI(uri);
457: }
458:
459: /**
460: * }
461: * Resolve URI based considering xml:base and baseURI.
462: * @param baseURI Base URI of feed
463: * @param parent Parent from which to consider xml:base
464: * @param url URL to be resolved
465: */
466: private String resolveURI(String baseURI, Parent parent, String url) {
467: if (isRelativeURI(url)) {
468: url = (!".".equals(url) && !"./".equals(url)) ? url : "";
469:
470: // Relative URI with parent
471: if (parent != null && parent instanceof Element) {
472:
473: // Do we have an xml:base?
474: String xmlbase = ((Element) parent).getAttributeValue(
475: "base", Namespace.XML_NAMESPACE);
476: if (xmlbase != null && xmlbase.trim().length() > 0) {
477: if (isAbsoluteURI(xmlbase)) {
478: // Absolute xml:base, so form URI right now
479: if (url.startsWith("/")) {
480: // Host relative URI
481: int slashslash = xmlbase.indexOf("//");
482: int nextslash = xmlbase.indexOf("/",
483: slashslash + 2);
484: if (nextslash != -1)
485: xmlbase = xmlbase.substring(0,
486: nextslash);
487: return formURI(xmlbase, url);
488: }
489: if (!xmlbase.endsWith("/")) {
490: // Base URI is filename, strip it off
491: xmlbase = xmlbase.substring(0, xmlbase
492: .lastIndexOf("/"));
493: }
494: return formURI(xmlbase, url);
495: } else {
496: // Relative xml:base, so walk up tree
497: return resolveURI(baseURI, parent.getParent(),
498: stripTrailingSlash(xmlbase) + "/"
499: + stripStartingSlash(url));
500: }
501: }
502: // No xml:base so walk up tree
503: return resolveURI(baseURI, parent.getParent(), url);
504:
505: // Relative URI with no parent (i.e. top of tree), so form URI right now
506: } else if (parent == null || parent instanceof Document) {
507: return formURI(baseURI, url);
508: }
509: }
510: return url;
511: }
512:
513: /**
514: * Find base URI of feed considering relative URIs.
515: * @param root Root element of feed.
516: */
517: private String findBaseURI(Element root)
518: throws MalformedURLException {
519: String ret = findAtomLink(root, "alternate");
520: if (ret != null && isRelativeURI(ret)) {
521: String self = findAtomLink(root, "self");
522: if (self != null) {
523: self = resolveURI(null, root, self);
524: self = self.substring(0, self.lastIndexOf("/"));
525: ret = resolveURI(self, root, ret);
526: }
527: }
528: return ret;
529: }
530:
531: /**
532: * Return URL string of Atom link element under parent element.
533: * Link with no rel attribute is considered to be rel="alternate"
534: * @param parent Consider only children of this parent element
535: * @param rel Consider only links with this relationship
536: */
537: private String findAtomLink(Element parent, String rel) {
538: String ret = null;
539: List linksList = parent.getChildren("link", ns);
540: if (linksList != null) {
541: for (Iterator links = linksList.iterator(); links.hasNext();) {
542: Element link = (Element) links.next();
543: Attribute relAtt = link.getAttribute("rel");
544: Attribute hrefAtt = link.getAttribute("href");
545: if ((relAtt == null && "alternate".equals(rel))
546: || (relAtt != null && relAtt.getValue().equals(
547: rel))) {
548: ret = hrefAtt.getValue();
549: break;
550: }
551: }
552: }
553: return ret;
554: }
555:
556: /**
557: * Form URI by combining base with append portion and giving
558: * special consideration to append portions that begin with ".."
559: * @param base Base of URI, may end with trailing slash
560: * @param append String to append, may begin with slash or ".."
561: */
562: private static String formURI(String base, String append) {
563: base = stripTrailingSlash(base);
564: append = stripStartingSlash(append);
565: if (append.startsWith("..")) {
566: String ret = null;
567: String[] parts = append.split("/");
568: for (int i = 0; i < parts.length; i++) {
569: if ("..".equals(parts[i])) {
570: int last = base.lastIndexOf("/");
571: if (last != -1) {
572: base = base.substring(0, last);
573: append = append.substring(3, append.length());
574: } else
575: break;
576: }
577: }
578: }
579: return base + "/" + append;
580: }
581:
582: /**
583: * Strip starting slash from beginning of string.
584: */
585: private static String stripStartingSlash(String s) {
586: if (s != null && s.startsWith("/")) {
587: s = s.substring(1, s.length());
588: }
589: return s;
590: }
591:
592: /**
593: * Strip trailing slash from end of string.
594: */
595: private static String stripTrailingSlash(String s) {
596: if (s != null && s.endsWith("/")) {
597: s = s.substring(0, s.length() - 1);
598: }
599: return s;
600: }
601: }
|