/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import java.io.BufferedInputStream;
import java.io.PushbackInputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Vector;

public class RobotExclusion {

    // user agent (name) of this crawler, all lower-case,
    // or null for an anonymous robot
    String myUserAgent;

    // maps a web site ("host:port") to its disallow rules (String[])
    Hashtable entries = new Hashtable();

    /**
     * Make a RobotExclusion object.
     * @param userAgent name of the robot using this object, as shown
     * in the User-Agent header fields of its HTTP requests.  Use
     * null for anonymous robots.
     */
    public RobotExclusion(String userAgent) {
        // userAgent may be null (anonymous robot), so guard the toLowerCase() call
        myUserAgent = (userAgent != null) ? userAgent.toLowerCase() : null;
    }

    /**
     * Check whether a URL is disallowed by robots.txt.
     * @param url URL to test
     * @return true if url's Web site denies robot access to the url
     */
    public boolean disallowed(URL url) {
        if (!url.getProtocol().startsWith("http"))
            // only HTTP URLs are protected by robots.txt
            return false;

        String website = getWebSite(url);
        String[] rules = (String[]) entries.get(website);

        if (rules == null) {
            rules = getRobotsTxt(website, myUserAgent);
            entries.put(website, rules);
        }

        String path = url.getFile();
        for (int i = 0; i < rules.length; ++i) {
            if (path.startsWith(rules[i])) {
                //System.err.println ("disallowed by rule " + rules[i]);
                return true;
            }
            //System.err.println ("allowed by rule " + rules[i]);
        }
        return false;
    }
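
    // Usage sketch (the crawler name and URL below are illustrative,
    // not part of the toolkit):
    //
    //     RobotExclusion robots = new RobotExclusion("mycrawler");
    //     URL target = new URL("http://www.example.com/private/index.html");
    //     if (!robots.disallowed(target)) {
    //         // safe to fetch target
    //     }
    //
    // Rules are cached per web site, so repeated checks against the same
    // host download robots.txt only once (until clear() is called).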

    /**
     * Clear the cache of robots.txt entries.
     */
    public void clear() {
        entries.clear();
    }

    /*
     * Implementation
     *
     */

    String getWebSite(URL url) {
        String hostname = url.getHost();
        int port = url.getPort();
        return port != -1 ? hostname + ":" + port : hostname;
    }

    // reusable scratch buffer for accumulating Disallow rules
    Vector rulebuf = new Vector();

    String[] getRobotsTxt(String website, String userAgent) {
        try {
            URL robotstxtURL = new URL("http://" + website + "/robots.txt");
            URLConnection uc = Access.getAccess().openConnection(robotstxtURL);
            PushbackInputStream in = new PushbackInputStream(
                new BufferedInputStream(uc.getInputStream()));

            rulebuf.setSize(0);

            boolean relevant = false, specific = false;
            String lastFieldName = null;
            while (readField(in)) {
                //System.err.println (fieldName + ":" + fieldValue);

                if (fieldName == null) { // end of record
                    if (specific)
                        break; // while loop
                    relevant = false;
                } else if (fieldName.equals("user-agent")) {
                    if (lastFieldName != null && lastFieldName.equals("disallow")) {
                        // end of record
                        if (specific)
                            break; // while loop
                        relevant = false;
                    }

                    if (userAgent != null
                            && userAgent.indexOf(fieldValue.toLowerCase()) != -1) {
                        relevant = true;
                        specific = true;
                        rulebuf.setSize(0);
                    } else if (fieldValue.equals("*")) {
                        relevant = true;
                        rulebuf.setSize(0);
                    }
                } else if (relevant && fieldName.equals("disallow")) {
                    // an empty Disallow value means everything is allowed,
                    // so only record non-empty path prefixes
                    if (fieldValue.length() > 0)
                        rulebuf.addElement(fieldValue);
                } else { // end of record
                    if (specific)
                        break; // while loop
                    relevant = false;
                }
                lastFieldName = fieldName;
            }

            in.close();

            String[] rules = new String[rulebuf.size()];
            rulebuf.copyInto(rules);
            return rules;
        } catch (Exception e) {
            // on any error, fall back to "no restrictions"
            //System.err.println ("RobotExclusion: error while retrieving " + website + "/robots.txt:");
            //e.printStackTrace ();
            return new String[0];
        }
    }
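
    // Worked example (values are illustrative): given userAgent "mycrawler"
    // and a robots.txt containing
    //
    //     User-agent: *
    //     Disallow: /tmp/
    //
    //     User-agent: mycrawler
    //     Disallow: /private/
    //     Disallow: /cgi-bin/
    //
    // the "*" record is read first, then discarded when the record naming
    // "mycrawler" matches, so getRobotsTxt returns { "/private/", "/cgi-bin/" }.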

    String fieldName, fieldValue;
    static final int MAX_LINE_LENGTH = 1024;
    StringBuffer linebuf = new StringBuffer();

    // Reads one line from the input stream, parsing it into
    // fieldName and fieldValue.  The field name is lower-cased and
    // whitespace is stripped from both ends of the name and value;
    // e.g., "User-agent: Webcrawler" is parsed into
    // fieldName="user-agent" and fieldValue="Webcrawler".
    // Lines without a colon are parsed as fieldName=null and fieldValue=null.
    // Returns true if a line was read, false on end-of-file.
    boolean readField(PushbackInputStream in) throws Exception {
        fieldName = null;
        fieldValue = null;
        linebuf.setLength(0);

        int c;
        int n = 0;
        boolean saw_eoln = false;
        while (true) {
            c = in.read();
            if (c == -1)
                break;
            else if (c == '\r' || c == '\n')
                saw_eoln = true;
            else if (saw_eoln) {
                // first character of the next line -- push it back and stop
                in.unread(c);
                break;
            } else {
                linebuf.append((char) c);
            }

            ++n;
            if (n == MAX_LINE_LENGTH)
                break;
        }

        //System.err.println (linebuf);

        if (n == 0)
            return false;

        // extract fields from line and return
        String line = linebuf.toString();
        int colon = line.indexOf(':');
        if (colon == -1) {
            fieldName = null;
            fieldValue = null;
        } else {
            fieldName = line.substring(0, colon).trim().toLowerCase();
            fieldValue = line.substring(colon + 1).trim();
        }
        return true;
    }

    public static void main(String argv[]) throws Exception {
        if (argv.length == 0) {
            System.err.println("usage: RobotExclusion <user-agent> <url>...");
            return;
        }

        RobotExclusion robot = new RobotExclusion(argv[0]);

        for (int i = 1; i < argv.length; ++i) {
            System.out.println(argv[i] + ": "
                + (!robot.disallowed(new URL(argv[i])) ? "OK" : "disallowed"));
        }
        System.in.read(); // wait for a keypress before exiting
    }
}
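
// Example invocation of the test harness above (the user-agent name and
// URL are illustrative):
//
//     java websphinx.RobotExclusion mycrawler http://www.example.com/
//
// prints "http://www.example.com/: OK" or "...: disallowed" for each URL
// argument, then waits for a keypress.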