/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlServer.java
 * Created on Apr 17, 2003
 *
 * $Header$
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Checksum;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Checkpointer;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.io.ReplayInputStream;
import org.archive.net.UURIFactory;
/**
 * Represents a single remote "server".
 *
 * A server is a service on a host; a single host may offer several such
 * services, differentiated by port number.
 *
 * @author gojomo
 */
public class CrawlServer implements Serializable,
        CrawlSubstats.HasCrawlSubstats {

    private static final long serialVersionUID = -989714570750970369L;

    public static final long ROBOTS_NOT_FETCHED = -1;
    /** Minimum number of fetch attempts before a failed robots.txt
     * fetch is treated as final; below this, the honoring-policy
     * decision is deferred (see {@link #updateRobots(CrawlURI)}). */
    public static final long MIN_ROBOTS_RETRIES = 2;

    private final String server; // actually, host+port in the https case
    private int port;
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // how many consecutive connection errors have been encountered;
    // used to drive exponentially increasing retry timeout or decision
    // to 'freeze' entire class (queue) of URIs
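    //
    // A caller-side backoff derived from this counter might look like
    // the following sketch (baseDelayMs is a hypothetical configured
    // value, not Heritrix's actual retry policy):
    //
    //   long delayMs = baseDelayMs
    //       * (1L << Math.min(consecutiveConnectionErrors, 16));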
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credential avatars.
     */
    private transient Set<CredentialAvatar> avatars = null;

    /**
     * Creates a new CrawlServer object.
     *
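     * <p>A trailing {@code :port} in the host string is parsed eagerly.
     * A minimal sketch of the expected behavior (hypothetical hosts):
     * <pre>{@code
     * new CrawlServer("example.com").getPort();      // -1, no port given
     * new CrawlServer("example.com:8443").getPort(); // 8443
     * new CrawlServer("example.com:abc").getPort();  // -1, unparseable
     * }</pre>
     *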
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int colonIndex = server.lastIndexOf(":");
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server
                        .substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }

    /** Get the robots exclusion policy for this server.
     *
     * @return the robots exclusion policy for this server.
     */
    public RobotsExclusionPolicy getRobots() {
        return robots;
    }

    /** Set the robots exclusion policy for this server.
     *
     * @param policy the policy to set.
     */
    public void setRobots(RobotsExclusionPolicy policy) {
        robots = policy;
    }

    @Override
    public String toString() {
        return "CrawlServer(" + server + ")";
    }

    /** Update the robots exclusion policy.
     *
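     * <p>Intended to be called once a fetch attempt for this server's
     * {@code robots.txt} has completed. A sketch of a call site
     * (hypothetical wiring, not necessarily how Heritrix invokes it):
     * <pre>{@code
     * server.updateRobots(robotsCuri);
     * if (server.isValidRobots()) {
     *     RobotsExclusionPolicy policy = server.getRobots();
     * }
     * }</pre>
     *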
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy = settingsHandler
                .getOrder().getRobotsHonoringPolicy();

        robotsFetched = System.currentTimeMillis();

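        // Any positive fetch status over an HTTP transaction counts as a
        // usable response, even if not a 2xx (non-2xx is handled below).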
        boolean gotSomething = curi.getFetchStatus() > 0
                && curi.isHttpTransaction();
        if (!gotSomething
                && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // robots.txt lookup failed, no reason to consider IGNORE yet
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        if (!gotSomething) {
            // robots.txt lookup failed and policy not IGNORE
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Any status code outside the 2xx range (not-found or
            // otherwise) is treated as granting access to all of a
            // site's content. This mirrors the prevailing practice of
            // Google: 4xx responses on robots.txt usually indicate a
            // misconfiguration or a blanket block, not an intentional
            // partial-blocking directive.
            // TODO: consider handling server errors, redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
                    reader = new BufferedReader(new StringReader(
                            honoringPolicy.getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder()
                            .getRecordedInput()
                            .getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    reader = new BufferedReader(new InputStreamReader(
                            contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings,
                        reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e,
                    "robots.txt parsing IOException");
        }
    }

    /**
     * @return Time when robots.txt was last fetched, in milliseconds
     * since the epoch, or {@link #ROBOTS_NOT_FETCHED} if never fetched.
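     * <p>A caller might use this for a staleness check (sketch;
     * {@code validityMs} is a hypothetical configured duration):
     * <pre>{@code
     * long fetched = server.getRobotsFetchedTime();
     * boolean stale = fetched == CrawlServer.ROBOTS_NOT_FETCHED
     *     || System.currentTimeMillis() - fetched > validityMs;
     * }</pre>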
     */
    public long getRobotsFetchedTime() {
        return robotsFetched;
    }

    /**
     * @return The server string which might include a port number.
     */
    public String getName() {
        return server;
    }

    /** Get the port number for this server.
     *
     * @return the port number or -1 if not known (uses default for protocol)
     */
    public int getPort() {
        return port;
    }

    /**
     * Called when object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to settings handler and robots honoring
     * policy.
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     * cannot be found.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Thread t = Thread.currentThread();
        if (t instanceof Checkpointer.CheckpointingThread) {
            settingsHandler = ((Checkpointer.CheckpointingThread) t)
                    .getController().getSettingsHandler();
        } else if (t instanceof ToeThread) {
            settingsHandler = ((ToeThread) t)
                    .getController().getSettingsHandler();
        } else {
            // TODO: log differently? (without the throw here an
            // NPE would be inevitable later)
            throw new RuntimeException("CrawlServer must deserialize "
                    + "in a ToeThread or CheckpointingThread");
        }
        postDeserialize();
    }

    private void postDeserialize() {
        if (this.robots != null) {
            RobotsHonoringPolicy honoringPolicy = settingsHandler
                    .getOrder().getRobotsHonoringPolicy();
            this.robots.honoringPolicy = honoringPolicy;
        }
    }

    /** Get the settings handler.
     *
     * @return the settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /** Get the settings object in effect for this server.
     *
     * @param curi the URI whose scope selects the settings.
     * @return the settings object in effect for this server, or null if
     * the URI's referenced host cannot be extracted.
     */
    private CrawlerSettings getSettings(CandidateURI curi) {
        try {
            return this.settingsHandler.getSettings(curi.getUURI()
                    .getReferencedHost(), curi.getUURI());
        } catch (URIException e) {
            return null;
        }
    }

    /** Set the settings handler to be used by this server.
     *
     * @param settingsHandler the settings handler to be used by this server.
     */
    public void setSettingsHandler(SettingsHandler settingsHandler) {
        this.settingsHandler = settingsHandler;
    }

    public void incrementConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors++;
    }

    public void resetConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors = 0;
    }

    /**
     * @return Credential avatars for this server. Returns null if none.
     */
    public Set<CredentialAvatar> getCredentialAvatars() {
        return this.avatars;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentialAvatars() {
        return this.avatars != null && !this.avatars.isEmpty();
    }

    /**
     * Add an avatar.
     *
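     * <p>Usage sketch ({@code avatar} is a hypothetical, previously
     * constructed {@link CredentialAvatar}):
     * <pre>{@code
     * if (!server.hasCredentialAvatars()) {
     *     server.addCredentialAvatar(avatar);
     * }
     * }</pre>
     *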
     * @param ca Credential avatar to add to set of avatars.
     */
    public void addCredentialAvatar(CredentialAvatar ca) {
        if (this.avatars == null) {
            this.avatars = new HashSet<CredentialAvatar>();
        }
        this.avatars.add(ca);
    }

    /**
     * If true, valid robots.txt information has been retrieved. If false,
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return whether valid robots.txt information is available.
     */
    public boolean isValidRobots() {
        return validRobots;
    }

    /**
     * Get key to use doing lookup on server instances.
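     *
     * <p>A sketch of the expected keys, assuming the default https port
     * is 443 (hypothetical URIs):
     * <pre>{@code
     * // http://example.com/index.html -> "example.com"
     * // http://example.com:8080/      -> "example.com:8080"
     * // https://example.com/          -> "example.com:443"
     * }</pre>
     *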
     * @param cauri CandidateURI we're to get server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(CandidateURI cauri)
            throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = cauri.getUURI().getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails (e.g. 'dns:'
            // URIs: DNS UURIs carry the 'domain' in the 'path' part, not
            // in the authority).
            key = cauri.getUURI().getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Contains characters other than word chars, dots,
                // colons, dashes, and underscores; discard it.
                key = null;
            }
        }
        if (key != null
                && cauri.getUURI().getScheme()
                        .equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add the default https port
            // to distinguish the https server from an http server on the
            // same host without an explicit port.
            if (!key.matches(".+:[0-9]+")) {
                key += ":" + UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
     */
    public CrawlSubstats getSubstats() {
        return substats;
    }
}