001: // rssReader.java
002: // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 16.07.2007 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.xml;
028:
029: import java.io.ByteArrayInputStream;
030: import java.io.IOException;
031: import java.io.InputStream;
032: import java.util.ArrayList;
033: import java.util.HashMap;
034: import java.util.HashSet;
035:
036: import javax.xml.parsers.SAXParser;
037: import javax.xml.parsers.SAXParserFactory;
038:
039: import org.xml.sax.Attributes;
040: import org.xml.sax.SAXException;
041: import org.xml.sax.helpers.DefaultHandler;
042:
043: import de.anomic.server.serverByteBuffer;
044: import de.anomic.server.logging.serverLog;
045:
046: public class rssReader extends DefaultHandler {
047:
048: // statics for item generation and automatic categorization
049: private static int guidcount = 0;
050: private static final String[] tagsDef = new String[] { "author", //
051: "copyright", //
052: "category", //
053: "title", //
054: "link", //
055: "referrer", //
056: "language", //
057: "description", //
058: "creator", //
059: "pubDate", //
060: "guid", //
061: "docs" //
062: };
063:
064: private static final HashSet<String> tags = new HashSet<String>();
065: static {
066: for (int i = 0; i < tagsDef.length; i++) {
067: tags.add(tagsDef[i]);
068: }
069: }
070:
071: // class variables
072: private Item channel, item;
073: private StringBuffer buffer;
074: private boolean parsingChannel, parsingImage, parsingItem;
075: private String imageURL;
076: private ArrayList<String> itemsGUID; // a list of GUIDs, so the items can be retrieved by a specific order
077: private HashMap<String, Item> items; // a guid:Item map
078:
079: public rssReader() {
080: itemsGUID = new ArrayList<String>();
081: items = new HashMap<String, Item>();
082: buffer = new StringBuffer();
083: item = null;
084: channel = null;
085: parsingChannel = false;
086: parsingImage = false;
087: parsingItem = false;
088: }
089:
090: public rssReader(String path) {
091: this ();
092: try {
093: SAXParserFactory factory = SAXParserFactory.newInstance();
094: SAXParser saxParser = factory.newSAXParser();
095: saxParser.parse(path, this );
096: } catch (Exception e) {
097: e.printStackTrace();
098: }
099: }
100:
101: public rssReader(InputStream stream) {
102: this ();
103: try {
104: SAXParserFactory factory = SAXParserFactory.newInstance();
105: SAXParser saxParser = factory.newSAXParser();
106: saxParser.parse(stream, this );
107: } catch (Exception e) {
108: e.printStackTrace();
109: }
110: }
111:
112: public static rssReader parse(byte[] a) {
113:
114: // check integrity of array
115: if ((a == null) || (a.length == 0)) {
116: serverLog.logWarning("rssReader", "response=null");
117: return null;
118: }
119: if (a.length < 100) {
120: serverLog.logWarning("rssReader", "response="
121: + new String(a));
122: return null;
123: }
124: if (!serverByteBuffer.equals(a, "<?xml".getBytes())) {
125: serverLog.logWarning("rssReader",
126: "response does not contain valid xml");
127: return null;
128: }
129: String end = new String(a, a.length - 10, 10);
130: if (end.indexOf("rss") < 0) {
131: serverLog.logWarning("rssReader", "response incomplete");
132: return null;
133: }
134:
135: // make input stream
136: ByteArrayInputStream bais = new ByteArrayInputStream(a);
137:
138: // parse stream
139: rssReader reader = null;
140: try {
141: reader = new rssReader(bais);
142: } catch (Exception e) {
143: serverLog.logWarning("rssReader", "parse exception: " + e);
144: return null;
145: }
146: try {
147: bais.close();
148: } catch (IOException e) {
149: }
150: return reader;
151: }
152:
153: public void startElement(String uri, String name, String tag,
154: Attributes atts) throws SAXException {
155: if ("channel".equals(tag)) {
156: channel = new Item();
157: parsingChannel = true;
158: } else if ("item".equals(tag)) {
159: item = new Item();
160: parsingItem = true;
161: } else if ("image".equals(tag)) {
162: parsingImage = true;
163: }
164: }
165:
166: public void endElement(String uri, String name, String tag) {
167: if (tag == null)
168: return;
169: if ("channel".equals(tag)) {
170: parsingChannel = false;
171: } else if ("item".equals(tag)) {
172: String guid = item.getGuid();
173: itemsGUID.add(guid);
174: items.put(guid, item);
175: parsingItem = false;
176: } else if ("image".equals(tag)) {
177: parsingImage = false;
178: } else if ((parsingImage) && (parsingChannel)) {
179: String value = buffer.toString().trim();
180: buffer.setLength(0);
181: if ("url".equals(tag))
182: imageURL = value;
183: } else if (parsingItem) {
184: String value = buffer.toString().trim();
185: buffer.setLength(0);
186: if (tags.contains(tag))
187: item.setValue(tag, value);
188: } else if (parsingChannel) {
189: String value = buffer.toString().trim();
190: buffer.setLength(0);
191: if (tags.contains(tag))
192: channel.setValue(tag, value);
193: }
194: }
195:
196: public void characters(char ch[], int start, int length) {
197: if (parsingItem || parsingChannel) {
198: buffer.append(ch, start, length);
199: }
200: }
201:
202: public Item getChannel() {
203: return channel;
204: }
205:
206: public Item getItem(int i) {
207: // retrieve item by order number
208: return getItem((String) itemsGUID.get(i));
209: }
210:
211: public Item getItem(String guid) {
212: // retrieve item by guid
213: return (Item) items.get(guid);
214: }
215:
216: public int items() {
217: return items.size();
218: }
219:
220: public String getImage() {
221: return this .imageURL;
222: }
223:
224: public static class Item {
225:
226: private HashMap<String, String> map;
227:
228: public Item() {
229: this .map = new HashMap<String, String>();
230: this .map.put("guid", Long.toHexString(System
231: .currentTimeMillis())
232: + ":" + guidcount++);
233: }
234:
235: public void setValue(String name, String value) {
236: map.put(name, value);
237: }
238:
239: public String getAuthor() {
240: return (String) map.get("author");
241: }
242:
243: public String getCopyright() {
244: return (String) map.get("copyright");
245: }
246:
247: public String getCategory() {
248: return (String) map.get("category");
249: }
250:
251: public String getTitle() {
252: return (String) map.get("title");
253: }
254:
255: public String getLink() {
256: return (String) map.get("link");
257: }
258:
259: public String getReferrer() {
260: return (String) map.get("referrer");
261: }
262:
263: public String getLanguage() {
264: return (String) map.get("language");
265: }
266:
267: public String getDescription() {
268: return (String) map.get("description");
269: }
270:
271: public String getCreator() {
272: return (String) map.get("creator");
273: }
274:
275: public String getPubDate() {
276: return (String) map.get("pubDate");
277: }
278:
279: public String getGuid() {
280: return (String) map.get("guid");
281: }
282:
283: public String getDocs() {
284: return (String) map.get("docs");
285: }
286: }
287: }
|