/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * PreconditionEnforcer.java
 * Created on May 22, 2003
 *
 * $Header$
 */
package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;

/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
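 *
 * <p>Checks run in order: DNS resolution, scheme support (only http and
 * https proceed), robots.txt policy, then any configured login
 * credentials. The first unmet precondition defers the URI behind a
 * prerequisite or skips it to the postprocessor chain.
 *
 * <p>A sketch of where this processor typically sits in a crawl's
 * processor chains (illustrative order, not a literal configuration):
 * <pre>
 *   Preselector -&gt; PreconditionEnforcer -&gt; FetchDNS -&gt; FetchHTTP -&gt; ...
 * </pre>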
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor implements
        CoreAttributeConstants, FetchStatusCodes {

    private static final long serialVersionUID = 4636474153589079615L;

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60 * 60 * 6); // six hours
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60 * 60 * 24); // one day

    /** seconds to keep IP information for */
    public final static String ATTR_IP_VALIDITY_DURATION =
        "ip-validity-duration-seconds";
    /** seconds to cache robots info */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION =
        "robot-validity-duration-seconds";

    /** whether to calculate robots exclusion without applying */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY =
        "calculate-robots-only";

    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;

        e = addElementToDefinition(new SimpleType(
                ATTR_IP_VALIDITY_DURATION,
                "The minimum interval for which a dns-record will be considered "
                        + "valid (in seconds). "
                        + "If the record's DNS TTL is larger, that will be used instead.",
                DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
                ATTR_ROBOTS_VALIDITY_DURATION,
                "The time in seconds that fetched robots.txt information is "
                        + "considered to be valid. "
                        + "If the value is set to '0', then the robots.txt information"
                        + " will never expire.",
                DEFAULT_ROBOTS_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
                ATTR_CALCULATE_ROBOTS_ONLY,
                "Whether to only calculate the robots status of a URI, "
                        + "without actually applying any exclusions found. If true, "
                        + "excluded URIs will only be annotated in the crawl.log, but "
                        + "still fetched. Default is false.",
                DEFAULT_CALCULATE_ROBOTS_ONLY));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {

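        // Each consider*()/credentialPrecondition() call below returns
        // true when it has taken over handling of the curi (deferred it
        // behind a prerequisite or marked it failed); in that case no
        // further checks apply here.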
        if (considerDnsPreconditions(curi)) {
            return;
        }

        // make sure we only process schemes we understand (i.e. not dns)
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PreconditionEnforcer doesn't understand URIs of scheme "
                    + scheme + " (ignoring)");
            return;
        }

        if (considerRobotsPreconditions(curi)) {
            return;
        }

        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return;
        }

        // OK, it's allowed

        // For all curis that will in fact be fetched, set appropriate delays.
        // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));

        return;
    }

    /**
     * Consider the robots precondition.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition or
     *         processing should be terminated for some other reason. False
     *         if we can proceed to process this URI.
     */
    private boolean considerRobotsPreconditions(CrawlURI curi) {
        // treat /robots.txt fetches specially
        UURI uuri = curi.getUURI();
        try {
            if (uuri != null && uuri.getPath() != null
                    && curi.getUURI().getPath().equals("/robots.txt")) {
                // allow processing to continue
                curi.setPrerequisite(true);
                return false;
            }
        } catch (URIException e) {
            logger.severe("Failed getting path for " + curi);
        }
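        // (A /robots.txt fetch is itself the prerequisite, so above it is
        // let through rather than blocked waiting on robots information.)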
        // require /robots.txt if not present
        if (isRobotsExpired(curi)) {
            // Need to get robots
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("No valid robots for "
                        + getController().getServerCache().getServerFor(curi)
                        + "; deferring " + curi);
            }

            // Robots expired - should be refetched even though it's
            // already crawled.
            try {
                String prereq = curi.getUURI().resolve("/robots.txt")
                        .toString();
                curi.markPrerequisite(prereq,
                        getController().getPostprocessorChain());
            } catch (URIException e1) {
                logger.severe("Failed to resolve /robots.txt against " + curi);
                throw new RuntimeException(e1); // shouldn't ever happen
            }
            return true;
        }
        // test against robots.txt if available
        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs.isValidRobots()) {
            String ua = getController().getOrder().getUserAgent(curi);
            if (cs.getRobots().disallows(curi, ua)) {
                if (((Boolean) getUncheckedAttribute(curi,
                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue()) {
                    // annotate URI as excluded, but continue to process normally
                    curi.addAnnotation("robotExcluded");
                    return false;
                }
                // mark as precluded; in FetchHTTP, this will
                // prevent fetching and cause a skip to the end
                // of processing (unless an intervening processor
                // overrules)
                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
                curi.putString("error", "robots.txt exclusion");
                logger.fine("robots.txt precluded " + curi);
                return true;
            }
            return false;
        }
        // No valid robots found => attempt to get robots.txt failed
        curi.skipToProcessorChain(getController().getPostprocessorChain());
        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
        curi.putString("error", "robots.txt prerequisite failed");
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("robots.txt prerequisite failed " + curi);
        }
        return true;
    }

    /**
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
            // DNS URIs never have a DNS precondition
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController()
                    .getPostprocessorChain());
            return true;
        }

        // If we've done a dns lookup and it didn't resolve a host,
        // cancel further fetch-processing of this URI, because
        // the domain is unresolvable.
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || (ch.hasBeenLookedUp() && ch.getIP() == null)) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch
                        + " cancelling processing for CrawlURI "
                        + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController()
                    .getPostprocessorChain());
            return true;
        }

        // If we haven't done a dns lookup (and this isn't itself a dns
        // uri), queue one as a prerequisite and defer further processing
        // of this URI.
        if (isIpExpired(curi)
                && !curi.getUURI().getScheme().equals("dns")) {
            logger.fine("Deferring processing of CrawlURI "
                    + curi.toString() + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                        getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS preconditions OK
        return false;
    }

    /**
     * Get the minimum time a dns-record is considered valid.
     *
     * @param curi the URI this time is valid for.
     * @return the configured minimum validity of a dns-record, in seconds;
     *         zero means never expire, while negative values (a legacy
     *         default) are replaced with the current default minimum in
     *         {@link #isIpExpired(CrawlURI)}.
     */
    public long getIPValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            d = DEFAULT_IP_VALIDITY_DURATION;
        }

        return d.longValue();
    }

    /** Return true if ip should be looked up.
     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
            // IP has not been looked up yet.
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP)
            return false;
        }

        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
            // Never expire ip if duration is zero (either set by the
            // operator or, more likely, set to zero by FetchDNS after a
            // failed lookup attempt).
            return false;
        }

        // catch old "default" -1 settings that are now problematic,
        // convert to new minimum
        if (duration <= 0) {
            duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
        }

        long ttl = host.getIpTTL();
        if (ttl > duration) {
            // Use the larger of the operator-set minimum duration
            // or the DNS record TTL
            duration = ttl;
        }
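        // Example: with the six-hour default minimum and a record TTL of
        // 24 hours, the 24-hour TTL governs; with a TTL of five minutes,
        // the six-hour minimum governs.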

        // Duration and ttl are in seconds. Convert to millis.
        if (duration > 0) {
            duration *= 1000;
        }

        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }

    /** Get the maximum time a fetched robots.txt is considered valid.
     *
     * @param curi the URI whose settings are consulted.
     * @return the time a robots.txt is valid, in milliseconds (note that
     *         unlike {@link #getIPValidityDuration(CrawlURI)} this method
     *         returns milliseconds, not seconds).
     */
    public long getRobotsValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            // This should never happen, but if it does, return default.
            logger.severe(e.getLocalizedMessage());
            d = DEFAULT_ROBOTS_VALIDITY_DURATION;
        }
        // convert from seconds to milliseconds
        return d.longValue() * 1000;
    }

    /**
     * Is the robots policy expired?
     *
     * This method will also return true if we haven't tried to get the
     * robots.txt for this server.
     *
     * @param curi the URI being processed.
     * @return true if the robots policy is expired.
     */
    public boolean isRobotsExpired(CrawlURI curi) {
        CrawlServer server = getController().getServerCache()
                .getServerFor(curi);
        long robotsFetched = server.getRobotsFetchedTime();
        if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
            // Have not attempted to fetch robots
            return true;
        }
        long duration = getRobotsValidityDuration(curi);
        if (duration == 0) {
            // When zero, robots should be valid forever
            return false;
        }
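        // Example: with the one-day default, robots fetched at time t
        // (in milliseconds) count as expired once the current time
        // exceeds t + 86,400,000 ms.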
        // Expired when the fetch time plus the validity interval
        // precedes the current time.
        return robotsFetched + duration < System.currentTimeMillis();
    }

    /**
     * Consider credential preconditions.
     *
     * Looks to see if there are any credential preconditions (e.g. html
     * form login credentials) for this <code>CrawlServer</code>. If there
     * are, have they been run already? If not, make the running of these
     * logins a precondition of accessing any other url on this
     * <code>CrawlServer</code>.
     *
     * <p>
     * Someday, optimize to avoid running the bulk of the code below on
     * every call. The argument for running it every time is that overrides
     * and refinements may change what comes back from the credential store.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition that needs
     *         to be met before we can proceed. False if we can proceed to
     *         process this URI.
     */
    private boolean credentialPrecondition(final CrawlURI curi) {

        boolean result = false;

        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return result;
        }

        Iterator i = cs.iterator(curi);
        if (i == null) {
            return result;
        }

        while (i.hasNext()) {
            Credential c = (Credential) i.next();

            if (c.isPrerequisite(curi)) {
                // This credential has a prereq. and this curi is it. Let it
                // through. Add its avatar to the curi as a mark. Also, does
                // this curi need to be posted? Note, we test whether it is
                // a prereq BEFORE we check that the curi is of the
                // credential's domain, because sites such as Yahoo have you
                // go to another domain altogether to log in.
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                break;
            }

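            // Otherwise, only credentials whose root URI covers this curi
            // and which declare a prerequisite are candidates for queueing
            // a login.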
            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }

            if (!c.hasPrerequisite(curi)) {
                continue;
            }

            if (!authenticated(c, curi)) {
                // Hasn't been authenticated. Queue it and move on (the
                // assumption is that we can do one authentication at a
                // time -- usually one html form).
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.length() <= 0) {
                    CrawlServer server = getController().getServerCache()
                            .getServerFor(curi);
                    logger.severe(server.getName()
                            + " has credential(s) of type " + c
                            + " but prereq is null.");
                } else {
                    try {
                        curi.markPrerequisite(prereq,
                                getController().getPostprocessorChain());
                    } catch (URIException e) {
                        logger.severe("Unable to set credentials prerequisite "
                                + prereq);
                        getController().logUriError(e, curi.getUURI(), prereq);
                        return false;
                    }
                    result = true;
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Queueing prereq " + prereq
                                + " of type " + c + " for " + curi);
                    }
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Has the passed credential already been authenticated?
     *
     * @param credential Credential to test.
     * @param curi CrawlURI.
     * @return True if already run.
     */
    private boolean authenticated(final Credential credential,
            final CrawlURI curi) {
        boolean result = false;
        CrawlServer server = getController().getServerCache()
                .getServerFor(curi);
        if (!server.hasCredentialAvatars()) {
            return result;
        }
        Set avatars = server.getCredentialAvatars();
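        // A matching avatar on the server marks that this credential's
        // login prerequisite has already been run against it.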
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar) i.next();
            String key = null;
            try {
                key = credential.getKey(curi);
            } catch (AttributeNotFoundException e) {
                logger.severe("Failed getting key for " + credential
                        + " for " + curi);
                continue;
            }
            if (ca.match(credential.getClass(), key)) {
                result = true;
                break;
            }
        }
        return result;
    }
}
|