001: //SitemapParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2007
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: $LastChangedDate$ by $LastChangedBy$
010: //Revision: $LastChangedRevision$
011: //
012: //This program is free software; you can redistribute it and/or modify
013: //it under the terms of the GNU General Public License as published by
014: //the Free Software Foundation; either version 2 of the License, or
015: //(at your option) any later version.
016: //
017: //This program is distributed in the hope that it will be useful,
018: //but WITHOUT ANY WARRANTY; without even the implied warranty of
019: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: //GNU General Public License for more details.
021: //
022: //You should have received a copy of the GNU General Public License
023: //along with this program; if not, write to the Free Software
024: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: //
026: //Using this software in any meaning (reading, learning, copying, compiling,
027: //running) means that you agree that the Author(s) is (are) not responsible
028: //for cost, loss of data or any harm that may be caused directly or indirectly
029: //by usage of this softare or this documentation. The usage of this software
030: //is on your own risk. The installation and usage (starting/running) of this
031: //software may allow other people or application to access your computer and
032: //any attached devices and is highly dependent on the configuration of the
033: //software which must be done by the user of the software; the author(s) is
034: //(are) also not responsible for proper configuration and usage of the
035: //software, even if provoked by documentation provided together with
036: //the software.
037: //
038: //Any changes to this file according to the GPL as documented in the file
039: //gpl.txt aside this file in the shipment you received can be done to the
040: //lines that follows this copyright notice here, but changes must not be
041: //done inside the copyright notive above. A re-distribution must contain
042: //the intact and unchanged copyright notice.
043: //Contributions and changes to the program code must be marked as such.
044:
045: package de.anomic.data;
046:
047: import java.io.InputStream;
048: import java.net.MalformedURLException;
049: import java.text.ParseException;
050: import java.util.Date;
051: import java.util.zip.GZIPInputStream;
052:
053: import javax.xml.parsers.SAXParser;
054: import javax.xml.parsers.SAXParserFactory;
055:
056: import org.xml.sax.Attributes;
057: import org.xml.sax.SAXException;
058: import org.xml.sax.helpers.DefaultHandler;
059:
060: import de.anomic.http.httpc;
061: import de.anomic.http.httpdByteCountInputStream;
062: import de.anomic.index.indexURLEntry;
063: import de.anomic.plasma.plasmaCrawlProfile;
064: import de.anomic.plasma.plasmaCrawlZURL;
065: import de.anomic.plasma.plasmaSwitchboard;
066: import de.anomic.server.serverDate;
067: import de.anomic.server.logging.serverLog;
068: import de.anomic.yacy.yacyCore;
069: import de.anomic.yacy.yacyURL;
070:
071: /**
072: * Class to parse a sitemap file.<br>
073: * An example sitemap file is depicted below:<br>
074: * <pre>
075: * <?xml version="1.0" encoding="UTF-8"?>
076: * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
077: * <url>
078: * <loc>http://www.example.com/</loc>
079: * <lastmod>2005-01-01</lastmod>
080: * <changefreq>monthly</changefreq>
081: * <priority>0.8</priority>
082: * </url>
083: * </urlset>
084: * </pre>
085: *
086: * A real example can be found here: http://www.xt-service.de/sitemap.xml
087: * An example robots.txt containing a sitemap URL: http://notepad.emaillink.de/robots.txt
088: *
089: * @see Protocol at sitemaps.org <a href="http://www.sitemaps.org/protocol.php">http://www.sitemaps.org/protocol.php</a>
090: * @see Protocol at google.com <a href="https://www.google.com/webmasters/tools/docs/en/protocol.html">https://www.google.com/webmasters/tools/docs/en/protocol.html</a>
091: */
092: public class SitemapParser extends DefaultHandler {
093: public static final String XMLNS_SITEMAPS_ORG = "http://www.sitemaps.org/schemas/sitemap/0.9";
094: public static final String XMLNS_SITEMAPS_GOOGLE = "http://www.google.com/schemas/sitemap/0.84";
095:
096: public static final String SITEMAP_XMLNS = "xmlns";
097: public static final String SITEMAP_URLSET = "urlset";
098: public static final String SITEMAP_URL = "url";
099: public static final String SITEMAP_URL_LOC = "loc";
100: public static final String SITEMAP_URL_LASTMOD = "lastmod";
101: public static final String SITEMAP_URL_CHANGEFREQ = "changefreq";
102: public static final String SITEMAP_URL_PRIORITY = "priority";
103:
104: /**
105: * The crawling profile used to parse the URLs contained in the sitemap file
106: */
107: private plasmaCrawlProfile.entry crawlingProfile = null;
108:
109: /**
110: * Reference to the plasmaswitchboard.
111: */
112: private plasmaSwitchboard switchboard = null;
113:
114: /**
115: * Name of the current XML element
116: */
117: private String currentElement = null;
118:
119: /**
120: * A special stream to count how many bytes were processed so far
121: */
122: private httpdByteCountInputStream counterStream;
123:
124: /**
125: * The total length of the sitemap file
126: */
127: private long contentLength;
128:
129: /**
130: * The amount of urls processes so far
131: */
132: private int urlCounter = 0;
133:
134: /**
135: * the logger
136: */
137: private serverLog logger = new serverLog("SITEMAP");
138:
139: /**
140: * The location of the sitemap file
141: */
142: private yacyURL siteMapURL = null;
143:
144: /**
145: * The next URL to enqueue
146: */
147: private String nextURL = null;
148:
149: /**
150: * last modification date of the {@link #nextURL}
151: */
152: private Date lastMod = null;
153:
154: public SitemapParser(plasmaSwitchboard sb, yacyURL sitemap,
155: plasmaCrawlProfile.entry theCrawlingProfile) {
156: if (sb == null)
157: throw new NullPointerException(
158: "The switchboard must not be null");
159: if (sitemap == null)
160: throw new NullPointerException(
161: "The sitemap URL must not be null");
162: this .switchboard = sb;
163: this .siteMapURL = sitemap;
164:
165: if (theCrawlingProfile == null) {
166: // create a new profile
167: this .crawlingProfile = createProfile(this .siteMapURL
168: .getHost(), this .siteMapURL);
169: } else {
170: // use an existing profile
171: this .crawlingProfile = theCrawlingProfile;
172: }
173: }
174:
175: /**
176: * Function to download and parse the sitemap file
177: */
178: public void parse() {
179: // download document
180: httpc remote = null;
181: try {
182: remote = new httpc(this .siteMapURL.getHost(),
183: this .siteMapURL.getHost(), this .siteMapURL
184: .getPort(), 5000, this .siteMapURL
185: .getProtocol().equalsIgnoreCase("https"),
186: switchboard.remoteProxyConfig, null, null);
187:
188: httpc.response res = remote.GET(this .siteMapURL.getFile(),
189: null);
190: if (res.statusCode != 200) {
191: this .logger
192: .logWarning("Unable to download the sitemap file "
193: + this .siteMapURL
194: + "\nServer returned status: "
195: + res.status);
196: return;
197: }
198:
199: // getting some metadata
200: String contentMimeType = res.responseHeader.mime();
201: this .contentLength = res.responseHeader.contentLength();
202:
203: InputStream contentStream = res.getContentInputStream();
204: if ((contentMimeType != null)
205: && (contentMimeType.equals("application/x-gzip") || contentMimeType
206: .equals("application/gzip"))) {
207: this .logger.logFine("Sitemap file has mimetype "
208: + contentMimeType);
209: contentStream = new GZIPInputStream(contentStream);
210: }
211:
212: this .counterStream = new httpdByteCountInputStream(
213: contentStream, null);
214:
215: // parse it
216: this .logger.logInfo("Start parsing sitemap file "
217: + this .siteMapURL + "\n\tMimeType: "
218: + contentMimeType + "\n\tLength: "
219: + this .contentLength);
220: SAXParser saxParser = SAXParserFactory.newInstance()
221: .newSAXParser();
222: saxParser.parse(this .counterStream, this );
223: remote.close();
224: } catch (Exception e) {
225: this .logger.logWarning("Unable to parse sitemap file "
226: + this .siteMapURL, e);
227: }
228: }
229:
230: /**
231: * @return the total length of the sitemap file in bytes or <code>-1</code> if the length is unknown
232: */
233: public long getTotalLength() {
234: return this .contentLength;
235: }
236:
237: /**
238: * @return the amount of bytes of the sitemap file that were downloaded so far
239: */
240: public long getProcessedLength() {
241: return (this .counterStream == null) ? 0 : this .counterStream
242: .getCount();
243: }
244:
245: /**
246: * @return the amount of URLs that were successfully enqueued so far
247: */
248: public long getUrlcount() {
249: return this .urlCounter;
250: }
251:
252: /**
253: * @param localName local name
254: * @param qName qualified name
255: * @see DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
256: */
257: public void startElement(String namespaceURI, String localName,
258: String qName, Attributes attrs) throws SAXException {
259: this .currentElement = qName;
260:
261: // testing if the namespace is known
262: if (qName.equalsIgnoreCase(SITEMAP_URLSET)) {
263: String namespace = attrs.getValue(SITEMAP_XMLNS);
264: if ((namespace == null)
265: || ((!namespace.equals(XMLNS_SITEMAPS_ORG)) && (!namespace
266: .equals(XMLNS_SITEMAPS_GOOGLE))))
267: throw new SAXException("Unknown sitemap namespace: "
268: + namespace);
269: }
270: }
271:
272: /**
273: * @param localName local name
274: * @param qName qualified name
275: * @throws SAXException
276: * @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
277: */
278: public void endElement(String namespaceURI, String localName,
279: String qName) throws SAXException {
280: this .currentElement = "";
281:
282: if (qName.equalsIgnoreCase(SITEMAP_URL)) {
283: if (this .nextURL == null)
284: return;
285:
286: // get the url hash
287: String nexturlhash = null;
288: yacyURL url = null;
289: try {
290: url = new yacyURL(this .nextURL, null);
291: nexturlhash = url.hash();
292: } catch (MalformedURLException e1) {
293: }
294:
295: // check if the url is known and needs to be recrawled
296: if (this .lastMod != null) {
297: String dbocc = this .switchboard.urlExists(nexturlhash);
298: if ((dbocc != null)
299: && (dbocc.equalsIgnoreCase("loaded"))) {
300: // the url was already loaded. we need to check the date
301: indexURLEntry oldEntry = this .switchboard.wordIndex.loadedURL
302: .load(nexturlhash, null, 0);
303: if (oldEntry != null) {
304: Date modDate = oldEntry.moddate();
305: // check if modDate is null
306: if (modDate.after(this .lastMod))
307: return;
308: }
309: }
310: }
311:
312: // URL needs to crawled
313: String error = null;
314: error = this .switchboard.crawlStacker.stackCrawl(url,
315: null, // this.siteMapURL.toString(),
316: yacyCore.seedDB.mySeed().hash, this .nextURL,
317: new Date(), 0, this .crawlingProfile);
318:
319: if (error != null) {
320: try {
321: this .logger.logInfo("The URL '" + this .nextURL
322: + "' can not be crawled. Reason: " + error);
323:
324: // insert URL into the error DB
325: plasmaCrawlZURL.Entry ee = this .switchboard.crawlQueues.errorURL
326: .newEntry(new yacyURL(this .nextURL, null),
327: error);
328: ee.store();
329: this .switchboard.crawlQueues.errorURL.push(ee);
330: } catch (MalformedURLException e) {/* ignore this */
331: }
332: } else {
333: this .logger.logInfo("New URL '" + this .nextURL
334: + "' added for crawling.");
335:
336: // count successfully added URLs
337: this .urlCounter++;
338: }
339: }
340: }
341:
342: public void characters(char[] buf, int offset, int len)
343: throws SAXException {
344: if (this .currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {
345: // TODO: we need to decode the URL here
346: this .nextURL = (new String(buf, offset, len)).trim();
347: if (!this .nextURL.startsWith("http")
348: && !this .nextURL.startsWith("https")) {
349: this .logger.logInfo("The url '" + this .nextURL
350: + "' has a wrong format. Ignore it.");
351: this .nextURL = null;
352: }
353: } else if (this .currentElement
354: .equalsIgnoreCase(SITEMAP_URL_LASTMOD)) {
355: String dateStr = new String(buf, offset, len);
356: try {
357: this .lastMod = serverDate.parseISO8601(dateStr);
358: } catch (ParseException e) {
359: this .logger.logInfo("Unable to parse datestring '"
360: + dateStr + "'");
361: }
362: }
363: }
364:
365: private plasmaCrawlProfile.entry createProfile(String domainName,
366: yacyURL sitemapURL) {
367: return this .switchboard.profilesActiveCrawls.newEntry(
368: domainName, sitemapURL,
369: // crawlingFilter
370: ".*", ".*",
371: // Depth
372: 0, 0,
373: // force recrawling
374: 0,
375: // disable Auto-Dom-Filter
376: -1, -1,
377: // allow crawling of dynamic URLs
378: true,
379: // index text + media
380: true, true,
381: // don't store downloaded pages to Web Cache
382: false,
383: // store to TX cache
384: true,
385: // remote Indexing disabled
386: false,
387: // exclude stop-words
388: true, true, true);
389: }
390: }
|