001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * RobotsExclusionPolicy.java
020: * Created on Apr 17, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.datamodel;
025:
026: import java.io.BufferedReader;
027: import java.io.IOException;
028: import java.io.ObjectInputStream;
029: import java.io.ObjectOutputStream;
030: import java.io.Serializable;
031: import java.util.ArrayList;
032: import java.util.HashMap;
033: import java.util.Iterator;
034: import java.util.LinkedList;
035: import java.util.List;
036: import java.util.logging.Level;
037: import java.util.logging.Logger;
038:
039: import org.apache.commons.httpclient.URIException;
040: import org.archive.crawler.settings.CrawlerSettings;
041:
042: /**
 * RobotsExclusionPolicy represents the actual policy adopted with
 * respect to a specific remote server, usually constructed by
 * consulting the robots.txt, if any, that the server provided.
046: *
047: * (The similarly named RobotsHonoringPolicy, on the other hand,
048: * describes the strategy used by the crawler to determine to what
049: * extent it respects exclusion rules.)
050: *
051: * The expiration of policies after a suitable amount of time has
052: * elapsed since last fetch is handled outside this class, in
053: * CrawlServer itself.
054: *
055: * @author gojomo
056: *
057: */
058: public class RobotsExclusionPolicy implements Serializable {
059:
060: private static final long serialVersionUID = 6323907991237383113L;
061:
062: private static final Logger logger = Logger
063: .getLogger(RobotsExclusionPolicy.class.getName());
064:
065: private final static int NORMAL_TYPE = 0;
066: private final static int ALLOWALL_TYPE = 1;
067: private final static int DENYALL_TYPE = 2;
068: private transient int type = NORMAL_TYPE;
069:
070: public static RobotsExclusionPolicy ALLOWALL = new RobotsExclusionPolicy(
071: ALLOWALL_TYPE);
072: public static RobotsExclusionPolicy DENYALL = new RobotsExclusionPolicy(
073: DENYALL_TYPE);
074:
075: private LinkedList<String> userAgents = null;
076: private HashMap<String, List<String>> disallows = null;
077: transient RobotsHonoringPolicy honoringPolicy = null;
078:
079: private String lastUsedUserAgent = null;
080: private List<String> userAgentsToTest = null;
081:
082: /**
083: * @param settings
084: * @param reader
085: * @param honoringPolicy
086: * @return Robot exclusion policy.
087: * @throws IOException
088: */
089: public static RobotsExclusionPolicy policyFor(
090: CrawlerSettings settings, BufferedReader reader,
091: RobotsHonoringPolicy honoringPolicy) throws IOException {
092: LinkedList<String> userAgents = new LinkedList<String>();
093: HashMap<String, List<String>> disallows = new HashMap<String, List<String>>();
094: Robotstxt.parse(reader, userAgents, disallows);
095: return (disallows.isEmpty()) ? ALLOWALL
096: : new RobotsExclusionPolicy(settings, userAgents,
097: disallows, honoringPolicy);
098: }
099:
100: /**
101: * @param settings
102: * @param u
103: * @param d
104: * @param honoringPolicy
105: */
106: public RobotsExclusionPolicy(CrawlerSettings settings,
107: LinkedList<String> u, HashMap<String, List<String>> d,
108: RobotsHonoringPolicy honoringPolicy) {
109: userAgents = u;
110: disallows = d;
111: this .honoringPolicy = honoringPolicy;
112:
113: if (honoringPolicy == null)
114: return;
115:
116: // If honoring policy is most favored user agent, all rules should be checked
117: if (honoringPolicy.isType(settings,
118: RobotsHonoringPolicy.MOST_FAVORED)) {
119: userAgentsToTest = userAgents;
120:
121: // IF honoring policy is most favored of set, then make a list with only the set as members
122: } else if (honoringPolicy.isType(settings,
123: RobotsHonoringPolicy.MOST_FAVORED_SET)) {
124: userAgentsToTest = new ArrayList<String>();
125: Iterator userAgentSet = honoringPolicy.getUserAgents(
126: settings).iterator();
127: while (userAgentSet.hasNext()) {
128: String userAgent = (String) userAgentSet.next();
129:
130: Iterator iter = userAgents.iterator();
131: while (iter.hasNext()) {
132: String ua = (String) iter.next();
133: if (userAgent.indexOf(ua) > -1) {
134: userAgentsToTest.add(ua);
135: break;
136: }
137: }
138: }
139: }
140: }
141:
142: public RobotsExclusionPolicy(int type) {
143: this (null, null, null, null);
144: this .type = type;
145: }
146:
147: public boolean disallows(CrawlURI curi, String userAgent) {
148: if (this == ALLOWALL)
149: return false;
150: if (this == DENYALL)
151: return true;
152:
153: // In the common case with policy=Classic, the useragent is remembered from uri to uri on
154: // the same server
155: if ((honoringPolicy.isType(curi, RobotsHonoringPolicy.CLASSIC) || honoringPolicy
156: .isType(curi, RobotsHonoringPolicy.CUSTOM))
157: && (lastUsedUserAgent == null || !lastUsedUserAgent
158: .equals(userAgent))) {
159:
160: lastUsedUserAgent = userAgent;
161: userAgentsToTest = new ArrayList<String>();
162: Iterator iter = userAgents.iterator();
163: String lowerCaseUserAgent = userAgent.toLowerCase();
164: while (iter.hasNext()) {
165: String ua = (String) iter.next();
166: // ua in below is already lowercase. See Robotstxt.java line 60.
167: if (lowerCaseUserAgent.indexOf(ua) > -1) {
168: userAgentsToTest.add(ua);
169: break; // consider no more sections
170: }
171: }
172: }
173:
174: boolean disallow = false;
175: boolean examined = false;
176: String ua = null;
177:
178: // Go thru list of all user agents we might act as
179: Iterator uas = userAgentsToTest.iterator();
180: while (uas.hasNext() && examined == false) {
181: disallow = false;
182: ua = (String) uas.next();
183: Iterator dis = ((List) disallows.get(ua)).iterator();
184:
185: // Check if the current user agent is allowed to crawl
186: while (dis.hasNext() && examined == false
187: && disallow == false) {
188: String disallowedPath = (String) dis.next();
189: if (disallowedPath.length() == 0) {
190: // blanket allow
191: examined = true;
192: disallow = false;
193: break;
194: }
195: try {
196: String p = curi.getUURI().getPathQuery();
197: if (p != null && p.startsWith(disallowedPath)) {
198: // the user agent tested isn't allowed to get this uri
199: disallow = true;
200: }
201: } catch (URIException e) {
202: logger.log(Level.SEVERE,
203: "Failed getPathQuery from " + curi, e);
204: }
205: }
206: if (disallow == false) {
207: // the user agent tested is allowed
208: examined = true;
209: }
210: }
211:
212: // Are we supposed to masquerade as the user agent to which restrictions
213: // we follow?
214: if (honoringPolicy.shouldMasquerade(curi) && ua != null
215: && !ua.equals("")) {
216: curi.setUserAgent(ua);
217: }
218: return disallow;
219: }
220:
221: // Methods for object serialization.
222:
223: /** If object is DENYALL or ALLOWALL, only the object identity and type
224: * is written in the serialization stream.
225: *
226: * @param stream the serialization stream.
227: * @throws IOException
228: */
229: private void writeObject(ObjectOutputStream stream)
230: throws IOException {
231: stream.writeInt(type);
232: if (type == NORMAL_TYPE) {
233: stream.defaultWriteObject();
234: }
235: }
236:
237: /** If object is DENYALL or ALLOWALL, only the object identity and type
238: * is read from the serialization stream.
239: *
240: * @param stream the serialization stream.
241: * @throws IOException
242: * @throws ClassNotFoundException
243: */
244: private void readObject(ObjectInputStream stream)
245: throws IOException, ClassNotFoundException {
246: type = stream.readInt();
247: if (type == NORMAL_TYPE) {
248: stream.defaultReadObject();
249: }
250: }
251:
252: /** If object is DENYALL or ALLOWALL, the object is replaced by constants
253: * so that check for object equality works.
254: * @return Object.
255: */
256: private Object readResolve() {
257: if (type == NORMAL_TYPE) {
258: return this;
259: } else if (type == ALLOWALL_TYPE) {
260: return ALLOWALL;
261: } else if (type == DENYALL_TYPE) {
262: return DENYALL;
263: }
264: return null;
265: }
266:
267: }
|