// NoRobots - implements the Robot Exclusion Standard
//
// Copyright (C)1996,1998 by Jef Poskanzer <jef@acme.com>.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.

/**
 * Moved to the net.matuschek.spider package by Daniel Matuschek,
 * with some minimal modifications to use HttpTool for the retrieval
 * of robots.txt.
 */
package net.matuschek.spider;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.log4j.Category;

import net.matuschek.http.*;

/**
 * Implements the Robot Exclusion Standard.
 * <p>
 * The basic idea of the Robot Exclusion Standard is that each web server
 * can set up a single file called "/robots.txt" which contains pathnames
 * that robots should not look at.
 * See <a href="http://www.robotstxt.org/wc/norobots.html">the full spec</a>
 * for details.
 * <p>
 * Using this class is very simple: you create the object using your robot's
 * name and the HttpTool used to retrieve the data, and then you call
 * {@link #ok} on each URL. For efficiency, the class caches entries for
 * servers you have visited recently.
 * <p>
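 * A minimal usage sketch (the robot name and URL here are placeholders;
 * any further HttpTool configuration is assumed to happen elsewhere):
 * <pre>
 *     HttpTool tool = new HttpTool();
 *     tool.setAgentName("MyRobot");
 *     NoRobots robots = new NoRobots("MyRobot", tool);
 *     URL url = new URL("http://example.com/some/page.html");
 *     if (robots.ok(url)) {
 *         // allowed to fetch url
 *     }
 * </pre>
 *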
 * @author cn
 * @version 0.1
 */
public class NoRobots {

    Category log = Category.getInstance(getClass().getName());

    // The file with the robot rules in it.
    private static final String robotFile = "/robots.txt";

    // The name of this robot.
    private String robotName;

    // A table of all the servers we have visited recently.
    private Hashtable servers = new net.matuschek.util.LruHashtable(500);

    // Tool to get /robots.txt.
    private HttpTool httpTool;

    // If true, robots.txt is not checked at all.
    private boolean ignore = false;
    /**
     * Constructor.
     * @param robotName the name of the robot
     * @param inhttpTool the HttpTool instance for downloading the robot file
     */
    public NoRobots(String robotName, HttpTool inhttpTool) {
        this.robotName = robotName;
        this.httpTool = inhttpTool;
        /*
        this.httpTool = new HttpTool();
        httpTool.setAgentName(inhttpTool.getAgentName());
        try {
            httpTool.setProxy(inhttpTool.getProxy());
        } catch (HttpException e) {
            // ignore
        }
        */
    }

    /**
     * Checks whether it is okay for this robot to fetch the given URL.
     * Reads the information in the robots.txt file on the URL's host. If a
     * robots.txt file is there and it disallows this robot from retrieving
     * the requested URL, the method returns false.
     * @param url the URL we want to retrieve
     * @return boolean true if allowed to retrieve the URL, false otherwise
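     * <p>
     * For example (hypothetical host and rules, for illustration only):
     * if http://example.com/robots.txt contains
     * <pre>
     *     User-agent: *
     *     Disallow: /private
     * </pre>
     * then ok() returns false for http://example.com/private/a.html
     * and true for http://example.com/index.html.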
     */
    public boolean ok(URL url) {
        // If ignore is set to true, this check always returns true.
        if (ignore) {
            return true;
        }

        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port == -1) {
            // No explicit port given; assume the standard HTTP port.
            port = 80;
        }

        String file = url.getFile();

        // The URL is disallowed if its path starts with any of the
        // Disallow prefixes recorded for this server.
        Vector disallows = getDisallows(protocol, host, port);
        Enumeration en = disallows.elements();
        while (en.hasMoreElements()) {
            String pattern = (String) en.nextElement();
            if (file.startsWith(pattern))
                return false;
        }
        return true;
    }

    /**
     * Method getDisallows.
     * Gets the disallow list for the given server. If it is not already in
     * the servers hash table, we fetch it, parse it, and save it.
     * @param protocol the protocol of the server (e.g. "http")
     * @param host the hostname of the server
     * @param port the port of the server
     * @return Vector the disallowed path prefixes for this robot
     */
    private Vector getDisallows(String protocol, String host, int port) {
        String key = protocol + "://" + host + ":" + port;
        Vector disallows = (Vector) servers.get(key);
        if (disallows != null)
            return disallows;

        disallows = new Vector();
        try {
            URL robotUrl = new URL(protocol, host, port, robotFile);
            try {

                // Get the document.
                log.debug("Retrieving robot file '" + robotUrl + "'.");
                httpTool.setReferer("-");
                String robotsFile = "";
                try {
                    HttpDoc doc = httpTool.retrieveDocument(robotUrl,
                            HttpConstants.GET, "");
                    // doc may be null if the document could not be retrieved
                    if (doc != null && doc.isOk()) {
                        robotsFile = new String(doc.getContent());
                    }
                } catch (HttpException e) {
                    // Ignore HTTP errors; an unreadable robots.txt
                    // means no restrictions.
                    log.info("Cannot read robots.txt: " + e.getMessage());
                }
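
                // Parse the file record by record. A robots.txt file
                // typically looks like this (example rules, for
                // illustration only):
                //
                //     User-agent: *      # record applying to all robots
                //     Disallow: /cgi-bin
                //     Disallow: /tmp
                //
                // Records are separated by blank lines, and '#' starts a
                // comment. Only records whose User-agent line matches this
                // robot's name contribute Disallow entries.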
                BufferedReader robotReader = new BufferedReader(
                        new StringReader(robotsFile));
                boolean userAgentIsMe = false;
                while (true) {
                    String line = robotReader.readLine();
                    if (line == null)
                        break;
                    line = line.trim();

                    // Completely ignore lines that are just a comment - they
                    // don't even separate records.
                    if (line.startsWith("#"))
                        continue;

                    // Trim off any other comments.
                    int cmt = line.indexOf('#');
                    if (cmt != -1)
                        line = line.substring(0, cmt).trim();

                    if (line.length() == 0)
                        // A blank line ends the current record.
                        userAgentIsMe = false;
                    else if (line.toLowerCase().startsWith("user-agent:")) {
                        if (!userAgentIsMe) {
                            String value = line.substring(11).trim();
                            if (match(value, robotName))
                                userAgentIsMe = true;
                        }
                    } else if (line.toLowerCase().startsWith("disallow:")) {
                        if (userAgentIsMe) {
                            String value = line.substring(9).trim();
                            // An empty Disallow value means "allow
                            // everything", so only record non-empty
                            // path prefixes.
                            if (value.length() > 0)
                                disallows.addElement(value);
                        }
                    }
                }
            } catch (IOException ignore) {
                // Reading from an in-memory string should not fail;
                // treat any error as "no restrictions".
            }
        } catch (MalformedURLException ignore) {
            // An unbuildable robots.txt URL means no restrictions.
        }

        servers.put(key, disallows);
        return disallows;
    }

    /**
     * Method match.
     * Checks whether a string matches a given wildcard pattern.
     * Only handles ? and *, plus multiple patterns separated by |.
     * @param pattern the wildcard pattern
     * @param string the string to check against the pattern
     * @return boolean true if the string matches the pattern
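     * <p>
     * A few illustrative calls (hypothetical inputs):
     * <pre>
     *     match("web*", "webcrawler")        // true
     *     match("spider?|web*", "spider2")   // true
     *     match("spider", "webcrawler")      // false
     * </pre>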
     */
    protected static boolean match(String pattern, String string) {
        for (int p = 0;; ++p) {
            for (int s = 0;; ++p, ++s) {
                boolean sEnd = (s >= string.length());
                boolean pEnd = (p >= pattern.length()
                        || pattern.charAt(p) == '|');
                if (sEnd && pEnd)
                    return true;
                if (sEnd || pEnd)
                    break;
                if (pattern.charAt(p) == '?')
                    continue;
                if (pattern.charAt(p) == '*') {
                    int i;
                    ++p;
                    for (i = string.length(); i >= s; --i)
                        if (match(pattern.substring(p),
                                string.substring(i))) /* not quite right */
                            return true;
                    break;
                }
                if (pattern.charAt(p) != string.charAt(s))
                    break;
            }
            p = pattern.indexOf('|', p);
            if (p == -1)
                return false;
        }
    }

    /**
     * Method getIgnore.
     * Tells whether the robot exclusion standard is ignored.
     * @return boolean true if the check on robots.txt is not done
     */
    public boolean getIgnore() {
        return ignore;
    }

    /**
     * Method setIgnore.
     * Enables or disables checking of the robot exclusion standard.
     * @param ignore if ignore is true then the robot exclusion standard is
     * ignored
     */
    public void setIgnore(boolean ignore) {
        this.ignore = ignore;
    }

    /**
     * Finishes the underlying HttpTool.
     */
    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
    }
}