001: /* Robots.java
002: *
003: * $Id: Robotstxt.java 4947 2007-03-01 04:47:24Z gojomo $
004: *
005: * Created Sep 1, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.datamodel;
026:
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
033:
034: /**
035: * Utility class for parsing 'robots.txt' format directives, into a list
036: * of named user-agents and map from user-agents to disallowed paths.
037: */
public class Robotstxt {
    /** Matches a 'User-agent:' directive line (ASCII case-insensitive). */
    private static final Pattern USER_AGENT_LINE =
        Pattern.compile("(?i)^User-agent:.*");

    /** Matches a 'Disallow:' directive line (ASCII case-insensitive). */
    private static final Pattern DISALLOW_LINE =
        Pattern.compile("(?i)Disallow:.*");

    /** Matches a 'Crawl-delay:' directive line (ASCII case-insensitive). */
    private static final Pattern CRAWL_DELAY_LINE =
        Pattern.compile("(?i)Crawl-delay:.*");

    /** Matches an 'Allow:' directive line (ASCII case-insensitive). */
    private static final Pattern ALLOW_LINE =
        Pattern.compile("(?i)Allow:.*");

    /** Number of characters to skip to reach the value of a User-agent line. */
    private static final int USER_AGENT_PREFIX_LENGTH = "User-agent:".length();

    /** Number of characters to skip to reach the value of a Disallow line. */
    private static final int DISALLOW_PREFIX_LENGTH = "Disallow:".length();

    /**
     * Parses 'robots.txt' content, filling in the supplied collections.
     *
     * <p>Blank lines and comments (anything from a '#' to end of line) are
     * ignored. Each 'User-agent' name is lower-cased (locale-independently)
     * and appended to <code>userAgents</code> in order of appearance; the
     * wildcard agent '*' is recorded as the empty string and is appended
     * once, after all named agents. Consecutive 'User-agent' lines with no
     * directive between them share the very same list instance in
     * <code>disallows</code>. A later 'User-agent' line for an already seen
     * name replaces that name's earlier mapping.
     *
     * <p>'Allow' and 'Crawl-delay' lines are recognized only so far as they
     * end a run of 'User-agent' lines; their values are not recorded.
     * Unrecognized lines are skipped silently.
     *
     * <p>The reader is always closed before this method returns or throws,
     * unless it is <code>null</code>.
     *
     * @param reader source of robots.txt lines; if <code>null</code> nothing
     * is parsed and <code>false</code> is returned
     * @param userAgents receives the user-agent names, in order
     * @param disallows receives a mapping from user-agent name to that
     * agent's disallowed path prefixes
     * @return <code>true</code> if a 'Disallow', 'Allow' or 'Crawl-delay'
     * line was found before any 'User-agent' line, otherwise
     * <code>false</code>
     * @throws IOException if reading from or closing the reader fails
     */
    public static boolean parse(BufferedReader reader,
            final LinkedList<String> userAgents,
            final Map<String, List<String>> disallows)
            throws IOException {
        if (reader == null) {
            return false;
        }
        boolean hasErrors = false;
        // current is the disallowed paths for the preceding User-Agent(s)
        List<String> current = null;
        // whether a non-'User-Agent' directive has been encountered
        boolean hasDirectivesYet = false;
        String catchall = null;
        try {
            String read;
            while ((read = reader.readLine()) != null) {
                read = read.trim();
                if (read.length() == 0 || read.startsWith("#")) {
                    // Skip comments & blanks
                    continue;
                }
                int commentIndex = read.indexOf('#');
                if (commentIndex > -1) {
                    // Strip trailing comment
                    read = read.substring(0, commentIndex).trim();
                }

                if (USER_AGENT_LINE.matcher(read).matches()) {
                    // Fixed locale: the default locale could mangle names
                    // (e.g. 'I' lower-cases to dotless 'i' under Turkish).
                    String ua = read.substring(USER_AGENT_PREFIX_LENGTH)
                        .trim().toLowerCase(Locale.ENGLISH);
                    if (current == null || hasDirectivesYet) {
                        // only create new rules-list if necessary
                        // otherwise share with previous user-agent
                        current = new ArrayList<String>();
                        hasDirectivesYet = false;
                    }
                    if (ua.equals("*")) {
                        // wildcard is keyed by the empty string and is
                        // appended to userAgents only after the loop
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    disallows.put(ua, current);
                    continue;
                }

                if (DISALLOW_LINE.matcher(read).matches()) {
                    if (current == null) {
                        // buggy robots.txt: directive before any User-agent
                        hasErrors = true;
                        continue;
                    }
                    current.add(
                        read.substring(DISALLOW_PREFIX_LENGTH).trim());
                    hasDirectivesYet = true;
                    continue;
                }

                if (CRAWL_DELAY_LINE.matcher(read).matches()
                        || ALLOW_LINE.matcher(read).matches()) {
                    if (current == null) {
                        // buggy robots.txt: directive before any User-agent
                        hasErrors = true;
                    } else {
                        // consider these directives, even though we don't
                        // yet understand them, as sufficient to end a
                        // grouping of User-Agent lines
                        // TODO: understand/save/respect 'Crawl-Delay'
                        // TODO: understand/save/respect 'Allow'
                        hasDirectivesYet = true;
                    }
                    continue;
                }
                // unknown line; do nothing for now
            }
        } finally {
            // close on every exit path, not only on clean end-of-input
            reader.close();
        }

        if (catchall != null) {
            userAgents.addLast(catchall);
        }
        return hasErrors;
    }

    /**
     * Command-line entry point; currently does nothing.
     *
     * @param args Command-line arguments (ignored).
     */
    public static void main(String[] args) {
        // intentionally empty
    }
}
|