001: //robotsParser.java
002: //-------------------------------------
003: //part of YACY
004: //
005: //(C) 2005, 2006 by Alexander Schier
006: // Martin Thelian
007: //
008: //last change: $LastChangedDate: 2008-01-29 10:12:48 +0000 (Di, 29 Jan 2008) $ by $LastChangedBy: orbiter $
009: //Revision: $LastChangedRevision: 4414 $
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this software or this documentation. The usage of this software
//is at your own risk. The installation and usage (starting/running) of this
//software may allow other people or applications to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
//lines that follow this copyright notice here, but changes must not be
//done inside the copyright notice above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.data;
045:
046: import java.io.BufferedReader;
047: import java.io.ByteArrayInputStream;
048: import java.io.File;
049: import java.io.FileReader;
050: import java.io.IOException;
051: import java.io.InputStreamReader;
052: import java.net.MalformedURLException;
053: import java.net.URLDecoder;
054: import java.util.ArrayList;
055: import java.util.Date;
056:
057: import de.anomic.http.httpHeader;
058: import de.anomic.http.httpc;
059: import de.anomic.plasma.plasmaCrawlRobotsTxt;
060: import de.anomic.plasma.plasmaSwitchboard;
061: import de.anomic.server.serverByteBuffer;
062: import de.anomic.server.logging.serverLog;
063: import de.anomic.yacy.yacyURL;
064:
065: /*
066: * A class for Parsing robots.txt files.
067: * It only parses the Deny Part, yet.
068: *
069: * Robots RFC
070: * http://www.robotstxt.org/wc/norobots-rfc.html
071: *
072: * TODO:
073: * - On the request attempt resulted in temporary failure a robot
074: * should defer visits to the site until such time as the resource
075: * can be retrieved.
076: *
077: * - Extended Standard for Robot Exclusion
078: * See: http://www.conman.org/people/spc/robots2.html
079: *
080: * - Robot Exclusion Standard Revisited
081: * See: http://www.kollar.com/robots.html
082: */
083: public final class robotsParser {
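    // indices into the Object[] returned by downloadRobotsTxt()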
084: public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
085: public static final int DOWNLOAD_ROBOTS_TXT = 1;
086: public static final int DOWNLOAD_ETAG = 2;
087: public static final int DOWNLOAD_MODDATE = 3;
088:
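    // robots.txt directive prefixes, upper-cased once so they can be matched
    // case-insensitively against the upper-cased input line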
089: public static final String ROBOTS_USER_AGENT = "User-agent:"
090: .toUpperCase();
091: public static final String ROBOTS_DISALLOW = "Disallow:"
092: .toUpperCase();
093: public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
094: public static final String ROBOTS_COMMENT = "#";
095: public static final String ROBOTS_SITEMAP = "Sitemap:"
096: .toUpperCase();
097: public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:"
098: .toUpperCase();
099:
102: /*
103: * this parses the robots.txt.
104: * at the Moment it only creates a list of Deny Paths
105: */
106:
107: public static Object[] parse(File robotsFile) throws IOException {
108: BufferedReader reader = null;
109: try {
110: reader = new BufferedReader(new FileReader(robotsFile));
111: return parse(reader);
112: } finally {
113: if (reader != null)
114: try {
115: reader.close();
116: } catch (Exception e) {/* ignore this */
117: }
118: }
119: }
120:
    public static Object[] parse(byte[] robotsTxt) throws IOException {
        if ((robotsTxt == null) || (robotsTxt.length == 0))
            return new Object[] { new ArrayList<String>(0), null, null };
125: ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
126: BufferedReader reader = new BufferedReader(
127: new InputStreamReader(bin));
128: return parse(reader);
129: }
130:
131: public static Object[] parse(BufferedReader reader)
132: throws IOException {
133: ArrayList<String> deny4AllAgents = new ArrayList<String>();
134: ArrayList<String> deny4YaCyAgent = new ArrayList<String>();
135:
136: int pos;
137: String line = null, lineUpper = null, sitemap = null;
138: Integer crawlDelay = null;
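        // isRuleBlock4AllAgents: the current User-agent block applies to "*"
        // isRuleBlock4YaCyAgent: the current User-agent block names "yacy"
        // rule4YaCyFound: at least one block explicitly addressed the YaCy agent
        // inBlock: a Disallow/Allow rule of the current block has already been seen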
139: boolean isRuleBlock4AllAgents = false, isRuleBlock4YaCyAgent = false, rule4YaCyFound = false, inBlock = false;
140:
141: while ((line = reader.readLine()) != null) {
142: line = line.trim();
143: lineUpper = line.toUpperCase();
144:
145: if (line.length() == 0) {
                // empty lines are ignored (they no longer terminate the rule block)
150: } else if (line.startsWith(ROBOTS_COMMENT)) {
151: // we can ignore this. Just a comment line
152: } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
153: pos = line.indexOf(" ");
154: if (pos != -1) {
155: sitemap = line.substring(pos).trim();
156: }
157: } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
158:
159: if (inBlock) {
160: // we have detected the start of a new block
161: inBlock = false;
162: isRuleBlock4AllAgents = false;
163: isRuleBlock4YaCyAgent = false;
164: crawlDelay = null; // each block has a separate delay
165: }
166:
167: // cutting off comments at the line end
168: pos = line.indexOf(ROBOTS_COMMENT);
169: if (pos != -1)
170: line = line.substring(0, pos).trim();
171:
172: // replacing all tabs with spaces
173: line = line.replaceAll("\t", " ");
174:
175: // getting out the robots name
176: pos = line.indexOf(" ");
177: if (pos != -1) {
178: String userAgent = line.substring(pos).trim();
179: isRuleBlock4AllAgents |= userAgent.equals("*");
180: isRuleBlock4YaCyAgent |= userAgent.toLowerCase()
181: .indexOf("yacy") >= 0;
182: if (isRuleBlock4YaCyAgent)
183: rule4YaCyFound = true;
184: }
185: } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
186: pos = line.indexOf(" ");
187: if (pos != -1) {
188: try {
189: crawlDelay = Integer.valueOf(line
190: .substring(pos).trim());
191: } catch (NumberFormatException e) {
192: // invalid crawling delay
193: }
194: }
195: } else if (lineUpper.startsWith(ROBOTS_DISALLOW)
196: || lineUpper.startsWith(ROBOTS_ALLOW)) {
197: inBlock = true;
198: boolean isDisallowRule = lineUpper
199: .startsWith(ROBOTS_DISALLOW);
200:
201: if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
202: // cutting off comments at the line end
203: pos = line.indexOf(ROBOTS_COMMENT);
204: if (pos != -1)
205: line = line.substring(0, pos).trim();
206:
                    // cutting off trailing *
208: if (line.endsWith("*"))
209: line = line.substring(0, line.length() - 1);
210:
211: // replacing all tabs with spaces
212: line = line.replaceAll("\t", " ");
213:
214: // getting the path
215: pos = line.indexOf(" ");
216: if (pos != -1) {
217: // getting the path
218: String path = line.substring(pos).trim();
219:
                        // decoding all URL-encoded special chars
221: try {
222: path = URLDecoder.decode(path, "UTF-8");
223: } catch (Exception e) {
224: /*
225: * url decoding failed. E.g. because of
226: * "Incomplete trailing escape (%) pattern"
227: */
228: }
229:
                        // escaping all occurrences of ';' because this char is used as a path separator in the robots DB
231: path = path
232: .replaceAll(
233: plasmaCrawlRobotsTxt.ROBOTS_DB_PATH_SEPARATOR,
234: "%3B");
235:
                        // adding it to the path list (Allow rules are stored with a leading '!')
237: if (!isDisallowRule)
238: path = "!" + path;
239: if (isRuleBlock4AllAgents)
240: deny4AllAgents.add(path);
241: if (isRuleBlock4YaCyAgent)
242: deny4YaCyAgent.add(path);
243: }
244: }
245: }
246: }
247:
248: ArrayList<String> denyList = (rule4YaCyFound) ? deny4YaCyAgent
249: : deny4AllAgents;
250: return new Object[] { denyList, sitemap, crawlDelay };
251: }
252:
253: private static final int getPort(yacyURL theURL) {
254: int port = theURL.getPort();
255: if (port == -1) {
256: if (theURL.getProtocol().equalsIgnoreCase("http")) {
257: port = 80;
258: } else if (theURL.getProtocol().equalsIgnoreCase("https")) {
259: port = 443;
260: }
261:
262: }
263: return port;
264: }
265:
    private static final String getHostPort(yacyURL theURL) {
        int port = getPort(theURL);
        String urlHostPort = theURL.getHost() + ":" + port;
        return urlHostPort.toLowerCase().intern();
    }
274:
275: public static yacyURL getSitemapURL(yacyURL theURL) {
276: if (theURL == null)
277: throw new IllegalArgumentException();
278: yacyURL sitemapURL = null;
279:
        // generating the hostname:port string needed to do a DB lookup
281: String urlHostPort = getHostPort(theURL);
282:
283: plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
284: synchronized (urlHostPort) {
285: // doing a DB lookup to determine if the robots data is already available
286: robotsTxt4Host = plasmaSwitchboard.robots
287: .getEntry(urlHostPort);
288: }
289: if (robotsTxt4Host == null)
290: return null;
291:
292: try {
293: String sitemapUrlStr = robotsTxt4Host.getSitemap();
294: if (sitemapUrlStr != null)
295: sitemapURL = new yacyURL(sitemapUrlStr, null);
296: } catch (MalformedURLException e) {/* ignore this */
297: }
298:
299: return sitemapURL;
300: }
301:
302: public static Integer getCrawlDelay(yacyURL theURL) {
303: if (theURL == null)
304: throw new IllegalArgumentException();
305: Integer crawlDelay = null;
306:
        // generating the hostname:port string needed to do a DB lookup
308: String urlHostPort = getHostPort(theURL);
309:
310: plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
311: synchronized (urlHostPort) {
312: // doing a DB lookup to determine if the robots data is already available
313: robotsTxt4Host = plasmaSwitchboard.robots
314: .getEntry(urlHostPort);
315: }
316: if (robotsTxt4Host == null)
317: return null;
318:
319: try {
320: crawlDelay = robotsTxt4Host.getCrawlDelay();
321: } catch (NumberFormatException e) {/* ignore this */
322: }
323:
324: return crawlDelay;
325: }
326:
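    /*
     * Checks whether access to the given URL is forbidden by the host's robots.txt.
     * The robots.txt is taken from the robots DB and (re)downloaded if it is missing
     * or older than 7 days. Illustrative call (the URL is a made-up example):
     *
     *   boolean blocked = robotsParser.isDisallowed(new yacyURL("http://www.example.com/private/page.html", null));
     */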
327: @SuppressWarnings("unchecked")
328: public static boolean isDisallowed(yacyURL nexturl) {
329: if (nexturl == null)
330: throw new IllegalArgumentException();
331:
        // generating the hostname:port string needed to do a DB lookup
333: String urlHostPort = getHostPort(nexturl);
334:
335: plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
336: synchronized (urlHostPort) {
337: // doing a DB lookup to determine if the robots data is already available
338: robotsTxt4Host = plasmaSwitchboard.robots
339: .getEntry(urlHostPort);
340:
341: // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
342: if ((robotsTxt4Host == null)
343: || (robotsTxt4Host.getLoadedDate() == null)
344: || (System.currentTimeMillis()
345: - robotsTxt4Host.getLoadedDate().getTime() > 7
346: * 24 * 60 * 60 * 1000)) {
347: yacyURL robotsURL = null;
                // generating the proper URL to download the robots.txt
349: try {
350: robotsURL = new yacyURL(nexturl.getProtocol(),
351: nexturl.getHost(), getPort(nexturl),
352: "/robots.txt");
353: } catch (MalformedURLException e) {
354: serverLog.logSevere("ROBOTS",
355: "Unable to generate robots.txt URL for URL '"
356: + nexturl.toString() + "'.");
357: return false;
358: }
359:
360: Object[] result = null;
361: boolean accessCompletelyRestricted = false;
362: byte[] robotsTxt = null;
363: String eTag = null;
364: Date modDate = null;
365: try {
366: serverLog.logFine("ROBOTS",
367: "Trying to download the robots.txt file from URL '"
368: + robotsURL + "'.");
369: result = downloadRobotsTxt(robotsURL, 5,
370: robotsTxt4Host);
371:
372: if (result != null) {
373: accessCompletelyRestricted = ((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED])
374: .booleanValue();
375: robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
376: eTag = (String) result[DOWNLOAD_ETAG];
377: modDate = (Date) result[DOWNLOAD_MODDATE];
378: } else if (robotsTxt4Host != null) {
379: robotsTxt4Host.setLoadedDate(new Date());
380: plasmaSwitchboard.robots
381: .addEntry(robotsTxt4Host);
382: }
383: } catch (Exception e) {
384: serverLog.logSevere("ROBOTS",
385: "Unable to download the robots.txt file from URL '"
386: + robotsURL + "'. "
387: + e.getMessage());
388: }
389:
                if ((robotsTxt4Host == null) || (result != null)) {
392: ArrayList<String> denyPath = null;
393: String sitemap = null;
394: Integer crawlDelay = null;
395: if (accessCompletelyRestricted) {
396: denyPath = new ArrayList<String>();
397: denyPath.add("/");
398: } else {
                        // parsing the robots.txt data and converting it into an array list
400: try {
401: Object[] parserResult = robotsParser
402: .parse(robotsTxt);
403: denyPath = (ArrayList<String>) parserResult[0];
404: sitemap = (String) parserResult[1];
405: crawlDelay = (Integer) parserResult[2];
406: } catch (IOException e) {
407: serverLog.logSevere("ROBOTS",
408: "Unable to parse the robots.txt file from URL '"
409: + robotsURL + "'.");
410: }
411: }
412:
413: // storing the data into the robots DB
414: robotsTxt4Host = plasmaSwitchboard.robots.addEntry(
415: urlHostPort, denyPath, new Date(), modDate,
416: eTag, sitemap, crawlDelay);
417: }
418: }
419: }
420:
        return robotsTxt4Host.isDisallowed(nexturl.getFile());
425: }
426:
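    /*
     * Downloads a robots.txt file, following at most 'redirectionCount' redirects.
     * Returns null if the remote file was not modified (unchanged ETag or HTTP 304),
     * otherwise an Object[] of the form
     *   { Boolean accessCompletelyRestricted, byte[] robotsTxt, String eTag, Date lastModified },
     * indexed by the DOWNLOAD_* constants above.
     */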
427: static Object[] downloadRobotsTxt(yacyURL robotsURL,
428: int redirectionCount, plasmaCrawlRobotsTxt.Entry entry)
429: throws Exception {
430:
        if (redirectionCount < 0)
            return new Object[] { Boolean.FALSE, null, null, null };
433: redirectionCount--;
434:
435: boolean accessCompletelyRestricted = false;
436: byte[] robotsTxt = null;
437: httpc con = null;
438: long downloadStart, downloadEnd;
439: String eTag = null, oldEtag = null;
440: Date lastMod = null;
441: try {
442: downloadStart = System.currentTimeMillis();
443: plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
            // TODO: add traffic statistics for robots.txt downloads?
445: con = new httpc(robotsURL.getHost(), robotsURL.getHost(),
446: robotsURL.getPort(), 10000, robotsURL.getProtocol()
447: .equalsIgnoreCase("https"),
448: sb.remoteProxyConfig, null, null);
449:
450: // if we previously have downloaded this robots.txt then we can set the if-modified-since header
451: httpHeader reqHeaders = new httpHeader();
452:
453: // adding referer
454: reqHeaders.put(httpHeader.REFERER, (yacyURL.newURL(
455: robotsURL, "/")).toNormalform(true, true));
456:
            if (entry != null) {
                oldEtag = entry.getETag();
                Date modDate = entry.getModDate();
                if (modDate != null)
                    reqHeaders.put(httpHeader.IF_MODIFIED_SINCE, httpc
                            .dateString(modDate));
            }
466:
467: // sending the get request
468: httpc.response res = con.GET(robotsURL.getFile(),
469: reqHeaders);
470:
471: // check for interruption
472: if (Thread.currentThread().isInterrupted())
473: throw new InterruptedException("Shutdown in progress.");
474:
475: // check the response status
476: if (res.status.startsWith("2")) {
477: if (!res.responseHeader.mime().startsWith("text/plain")) {
478: robotsTxt = null;
479: serverLog.logFinest("ROBOTS",
480: "Robots.txt from URL '" + robotsURL
481: + "' has wrong mimetype '"
482: + res.responseHeader.mime() + "'.");
483: con.close();
484: } else {
485:
486: // getting some metadata
487: eTag = res.responseHeader
488: .containsKey(httpHeader.ETAG) ? ((String) res.responseHeader
489: .get(httpHeader.ETAG)).trim()
490: : null;
491: lastMod = res.responseHeader.lastModified();
492:
493: // if the robots.txt file was not changed we break here
494: if ((eTag != null) && (oldEtag != null)
495: && (eTag.equals(oldEtag))) {
496: serverLog
497: .logFinest(
498: "ROBOTS",
499: "Robots.txt from URL '"
500: + robotsURL
501: + "' was not modified. Abort downloading of new version.");
502: return null;
503: }
504:
505: // downloading the content
506: serverByteBuffer sbb = new serverByteBuffer();
507: res.writeContent(sbb, null);
508: robotsTxt = sbb.getBytes();
509: con.close();
510:
511: downloadEnd = System.currentTimeMillis();
512: serverLog.logFinest("ROBOTS",
513: "Robots.txt successfully loaded from URL '"
514: + robotsURL + "' in "
515: + (downloadEnd - downloadStart)
516: + " ms.");
517: }
518: } else if (res.status.startsWith("304")) {
519: con.close();
520: return null;
521: } else if (res.status.startsWith("3")) {
522: // getting redirection URL
523: String redirectionUrlString = (String) res.responseHeader
524: .get(httpHeader.LOCATION);
525: con.close();
                if (redirectionUrlString == null) {
                    serverLog
                            .logFinest(
                                    "ROBOTS",
                                    "robots.txt could not be downloaded from URL '"
                                            + robotsURL
                                            + "' because of missing redirection header. ["
                                            + res.status + "].");
                    robotsTxt = null;
                } else {
                    redirectionUrlString = redirectionUrlString.trim();

                    // generating the new URL object
                    yacyURL redirectionUrl = yacyURL.newURL(robotsURL,
                            redirectionUrlString);

                    // following the redirection
                    serverLog.logFinest("ROBOTS",
                            "Redirection detected for robots.txt with URL '"
                                    + robotsURL + "'."
                                    + "\nRedirecting request to: "
                                    + redirectionUrl);
                    return downloadRobotsTxt(redirectionUrl,
                            redirectionCount, entry);
                }
551:
552: } else if (res.status.startsWith("401")
553: || res.status.startsWith("403")) {
554: con.close();
555: accessCompletelyRestricted = true;
556: serverLog.logFinest("ROBOTS",
557: "Access to Robots.txt not allowed on URL '"
558: + robotsURL + "'.");
559: } else {
560: serverLog.logFinest("ROBOTS",
561: "robots.txt could not be downloaded from URL '"
562: + robotsURL + "'. [" + res.status
563: + "].");
564: con.close();
565: robotsTxt = null;
566: }
567: } catch (Exception e) {
568: throw e;
569: }
        return new Object[] { Boolean.valueOf(accessCompletelyRestricted),
                robotsTxt, eTag, lastMod };
572: }
573: }
|