001: // crawlHandler.java
002: // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 24.07.2007 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.xml;
028:
029: import java.io.InputStream;
030: import java.util.ArrayList;
031: import java.util.HashMap;
032: import java.util.HashSet;
033:
034: import javax.xml.parsers.SAXParser;
035: import javax.xml.parsers.SAXParserFactory;
036:
037: import org.xml.sax.helpers.DefaultHandler;
038:
/**
 * SAX handler that reads an RSS-like document made of one {@code channel}
 * element containing {@code item} elements.
 *
 * <p>The channel and every item are stored as {@link Startpoint} objects. Only
 * the child tags listed in {@code startpointTags} are recorded; all other child
 * elements are ignored. Items are kept both in document order (by GUID) and in
 * a GUID-keyed map.
 *
 * <p>Parsing happens inside the constructor. Parse failures are printed to
 * standard error and are not rethrown; whatever was collected before the
 * failure stays available through the getters.
 */
public class crawlHandler extends DefaultHandler {

    // statics for item generation and automatic categorization
    private static int guidcount = 0;
    private static final String[] startpointTags = new String[] {
        "author", //
        "copyright", //
        "category", //
        "title", //
        "link", //
        "language", //
        "description", //
        "creator", //
        "pubDate", //
        "guid", //
        "docs" //
    };

    private static final HashSet<String> startpointTagsSet = new HashSet<String>();
    static {
        for (String knownTag : startpointTags) {
            startpointTagsSet.add(knownTag);
        }
    }

    // class variables
    private Startpoint channel, startpoint;
    private StringBuilder buffer;
    private boolean parsingAttributes, parsingStartpoint;
    private ArrayList<String> startpointsGUID; // a list of GUIDs, so the items can be retrieved by a specific order
    private HashMap<String, Startpoint> startpoints; // a guid:Item map

    /**
     * Parses the document identified by {@code path}.
     *
     * @param path URI (or file location) handed to {@code SAXParser.parse(String, DefaultHandler)}
     */
    public crawlHandler(String path) {
        init();
        parse(path);
    }

    /**
     * Parses the document read from {@code stream}.
     *
     * @param stream source of the XML document
     */
    public crawlHandler(InputStream stream) {
        init();
        parse(stream);
    }

    /** Resets all parser state to "nothing parsed yet". */
    private void init() {
        startpointsGUID = new ArrayList<String>();
        startpoints = new HashMap<String, Startpoint>();
        buffer = new StringBuilder();
        startpoint = null;
        channel = null;
        parsingAttributes = false;
        parsingStartpoint = false;
    }

    /**
     * Creates a SAX parser with the platform default configuration.
     *
     * NOTE(review): the parser is created with default features; if the input
     * can come from untrusted sources, consider disabling DTDs / external
     * entities (XXE) on the factory.
     */
    private static SAXParser newParser()
            throws javax.xml.parsers.ParserConfigurationException, org.xml.sax.SAXException {
        return SAXParserFactory.newInstance().newSAXParser();
    }

    /** Best-effort parse of the document at {@code path}; errors are printed, not thrown. */
    private void parse(String path) {
        try {
            newParser().parse(path, this);
        } catch (Exception e) {
            // deliberate best effort: keep whatever was parsed so far
            e.printStackTrace();
        }
    }

    /** Best-effort parse of {@code stream}; errors are printed, not thrown. */
    private void parse(InputStream stream) {
        try {
            newParser().parse(stream, this);
        } catch (Exception e) {
            // deliberate best effort: keep whatever was parsed so far
            e.printStackTrace();
        }
    }

    /**
     * SAX callback for an opening tag.
     *
     * <p>The attribute parameter is spelled with its fully qualified name on
     * purpose: the nested {@link Attributes} class of this handler shadows the
     * simple name, and using it here would turn this method into an overload
     * that the SAX parser never calls.
     *
     * @param uri  namespace URI (unused)
     * @param name local name (unused)
     * @param tag  qualified tag name; {@code channel} and {@code item} open a new record
     * @param atts XML attributes of the element (unused)
     */
    @Override
    public void startElement(String uri, String name, String tag,
            org.xml.sax.Attributes atts) {
        handleStart(tag);
    }

    /**
     * Legacy overload that takes this handler's nested {@link Attributes}
     * holder. It is never invoked by the SAX parser and is retained only so
     * that existing callers keep compiling.
     *
     * @deprecated use {@link #startElement(String, String, String, org.xml.sax.Attributes)}
     */
    @Deprecated
    public void startElement(String uri, String name, String tag,
            Attributes atts) {
        handleStart(tag);
    }

    /** Shared start-tag logic: opens a channel or item record and clears pending text. */
    private void handleStart(String tag) {
        // Text collected before this tag belongs to the parent element, not to
        // the element that is opening now; drop it so it cannot leak into the
        // next recorded value.
        buffer.setLength(0);
        if ("channel".equals(tag)) {
            channel = new Startpoint();
            parsingAttributes = true;
        } else if ("item".equals(tag)) {
            startpoint = new Startpoint();
            parsingStartpoint = true;
        }
    }

    /**
     * SAX callback for a closing tag. Closing an {@code item} registers it;
     * closing any other element while inside an item (or, otherwise, inside the
     * channel) stores the trimmed text under that tag if the tag is known.
     */
    @Override
    public void endElement(String uri, String name, String tag) {
        if (tag == null) {
            return;
        }
        if ("channel".equals(tag)) {
            parsingAttributes = false;
        } else if ("item".equals(tag)) {
            if (startpoint != null) {
                String guid = startpoint.getGuid();
                startpointsGUID.add(guid);
                startpoints.put(guid, startpoint);
            }
            parsingStartpoint = false;
        } else if (parsingStartpoint) {
            // item fields take precedence over channel fields while an item is open
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if (startpointTagsSet.contains(tag)) {
                startpoint.setValue(tag, value);
            }
        } else if (parsingAttributes) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if (startpointTagsSet.contains(tag)) {
                channel.setValue(tag, value);
            }
        }
    }

    /** SAX callback for character data; text is collected only inside a channel or item. */
    @Override
    public void characters(char ch[], int start, int length) {
        if (parsingStartpoint || parsingAttributes) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * @return the channel record, or {@code null} if no {@code channel} element was seen
     */
    public Startpoint getChannel() {
        return channel;
    }

    /**
     * Retrieves an item by its position in document order.
     *
     * @param i zero-based item index
     * @return the item registered at that position
     * @throws IndexOutOfBoundsException if {@code i} is outside the list of parsed items
     */
    public Startpoint getStartpoint(int i) {
        // retrieve item by order number
        return getStartpoint(startpointsGUID.get(i));
    }

    /**
     * Retrieves an item by its GUID.
     *
     * @param guid GUID as returned by {@link Startpoint#getGuid()}
     * @return the matching item, or {@code null} if none is registered under that GUID
     */
    public Startpoint getStartpoint(String guid) {
        // retrieve item by guid
        return startpoints.get(guid);
    }

    /**
     * @return the number of entries in the GUID-keyed item map
     */
    public int startpoints() {
        return startpoints.size();
    }

    /** Simple name/value holder with typed getters for the known tag names. */
    public static class Attributes {

        private HashMap<String, String> map;

        public Attributes() {
            this.map = new HashMap<String, String>();
        }

        /** Stores {@code value} under {@code name}, replacing any earlier value. */
        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return map.get("author");
        }

        public String getCopyright() {
            return map.get("copyright");
        }

        public String getCategory() {
            return map.get("category");
        }

        public String getTitle() {
            return map.get("title");
        }

        public String getLink() {
            return map.get("link");
        }

        public String getLanguage() {
            return map.get("language");
        }

        public String getDescription() {
            return map.get("description");
        }

        public String getCreator() {
            return map.get("creator");
        }

        public String getPubDate() {
            return map.get("pubDate");
        }

        public String getGuid() {
            return map.get("guid");
        }

        public String getDocs() {
            return map.get("docs");
        }
    }

    /**
     * One parsed record (the channel or a single item). Each getter returns
     * the stored text for its tag, or {@code null} if that tag was not set.
     */
    public static class Startpoint {

        private HashMap<String, String> map;

        /**
         * Creates a record with a generated default GUID of the form
         * {@code <hex millis>:<counter>}; a {@code guid} value set later
         * replaces it.
         */
        public Startpoint() {
            this.map = new HashMap<String, String>();
            this.map.put("guid", Long.toHexString(System
                    .currentTimeMillis())
                    + ":" + guidcount++);
        }

        /** Stores {@code value} under {@code name}, replacing any earlier value. */
        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return map.get("author");
        }

        public String getCopyright() {
            return map.get("copyright");
        }

        public String getCategory() {
            return map.get("category");
        }

        public String getTitle() {
            return map.get("title");
        }

        public String getLink() {
            return map.get("link");
        }

        public String getLanguage() {
            return map.get("language");
        }

        public String getDescription() {
            return map.get("description");
        }

        public String getCreator() {
            return map.get("creator");
        }

        public String getPubDate() {
            return map.get("pubDate");
        }

        public String getGuid() {
            return map.get("guid");
        }

        public String getDocs() {
            return map.get("docs");
        }
    }
}
|