/* AdaptiveRevisitFrontier.java
 *
 * Created on Sep 13, 2004
 *
 * Copyright (C) 2004 Kristinn Sigurðsson.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.frontier;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.crawler.util.BdbUriUniqFilter;
import org.archive.net.UURI;
import org.archive.queue.MemQueue;
import org.archive.queue.Queue;
import org.archive.util.ArchiveUtils;

/**
 * A Frontier that will repeatedly visit all encountered URIs.
 * <p>
 * Wait time between visits is configurable and varies based on observed
 * changes of documents.
 * <p>
 * The Frontier borrows many things from HostQueuesFrontier, but implements
 * an entirely different strategy in issuing URIs and consequently in keeping a
 * record of discovered URIs.
 *
 * @author Kristinn Sigurdsson
 */
public class AdaptiveRevisitFrontier extends ModuleType implements
        Frontier, FetchStatusCodes, CoreAttributeConstants,
        AdaptiveRevisitAttributeConstants, CrawlStatusListener,
        HasUriReceiver {

    private static final long serialVersionUID = -8666872690438543671L;

    private static final Logger logger = Logger
            .getLogger(AdaptiveRevisitFrontier.class.getName());

    /** How many multiples of last fetch elapsed time to wait before
     * recontacting the same server */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /** Always wait this long after one completion before recontacting
     * the same server, regardless of multiple */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    // 2 seconds
    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);

    /** Never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    // 30 seconds
    private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** Maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    /** For retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    // 15 minutes
    private final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** Maximum simultaneous requests in process to a host (queue) */
    public final static String ATTR_HOST_VALENCE = "host-valence";
    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1);

    /** Number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS = "preference-embed-hops";
    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS = new Integer(0);

    /** Queue assignment to force on CrawlURIs. Intended to be used
     * via overrides. */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";
    /** Acceptable characters in forced queue names.
     * Word chars, dash, period, comma, colon */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** Should the queue assignment ignore www in hostnames, effectively
     * stripping them away.
     */
    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW = new Boolean(false);

    /** Should the Frontier use a separate 'already included' datastructure
     * or rely on the queues'.
     */
    public final static String ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter";
    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER = new Boolean(false);

    private CrawlController controller;

    private AdaptiveRevisitQueueList hostQueues;

    private UriUniqFilter alreadyIncluded;

    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();

    /** Policy for assigning CrawlURIs to named queues */
    private QueueAssignmentPolicy queueAssignmentPolicy = null;

    // top-level stats
    private long succeededFetchCount = 0;
    private long failedFetchCount = 0;
    // URIs that are disregarded (for example because of robots.txt rules)
    private long disregardedUriCount = 0;

    private long totalProcessedBytes = 0;

    // Flags indicating operator-specified crawl pause/end
    private boolean shouldPause = false;
    private boolean shouldTerminate = false;

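    /**
     * Create an AdaptiveRevisitFrontier with the default description.
     *
     * @param name Name of this Frontier module.
     */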
    public AdaptiveRevisitFrontier(String name) {
        this(name,
            "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that "
                + "will repeatedly visit all encountered URIs. Wait time "
                + "between visits is configurable and is determined by "
                + "separate Processor(s); see WaitEvaluators. "
                + "See documentation for ARFrontier limitations.");
    }

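    /**
     * Create an AdaptiveRevisitFrontier, registering its configurable
     * attributes with the settings framework.
     *
     * @param name Name of this Frontier module.
     * @param description Description to display in the settings UI.
     */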
    public AdaptiveRevisitFrontier(String name, String description) {
        super(Frontier.ATTR_NAME, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before "
                + "recontacting the same server",
            DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long, regardless of multiple",
            DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting "
                + "the same server, regardless of multiple",
            DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
            "How many times to retry fetching a URI that failed to be "
                + "retrieved.\nIf zero, failed URIs are never retried.",
            DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
            "How long to wait by default until we retry fetching a"
                + " URI that failed to be retrieved (seconds).",
            DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which "
                + "a URI has higher priority scheduling. For example, if set "
                + "to 1, items such as inline images (1-hop "
                + "embedded resources) will be scheduled ahead of all regular "
                + "links (or many-hop resources, like nested frames). If set "
                + "to zero (the default), no preferencing will occur, and "
                + "embeds/redirects are scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
            "Maximum number of simultaneous requests to a single host.",
            DEFAULT_HOST_VALENCE));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,
            "If true then documents from x.com, www.x.com and any "
                + "www[0-9]+.x.com will be assigned to the same queue.",
            DEFAULT_QUEUE_IGNORE_WWW));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should "
                + "be left blank at global level. Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(
            ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or "
                + "underscore."));
        t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
            "If true then the Frontier will use a separate "
                + "datastructure to detect and eliminate duplicates.\n"
                + "This is required for Canonicalization rules to work.",
            DEFAULT_USE_URI_UNIQ_FILTER));
        t.setExpertSetting(true);
        t.setOverrideable(false);

        // Register persistent CrawlURI items
        CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
    }

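    /**
     * Initialize the Frontier: hook into the CrawlController, set up the
     * host queues (and, if configured, the 'already included' filter) and
     * load the seeds.
     *
     * @param c The CrawlController running this crawl.
     * @throws FatalConfigurationException
     * @throws IOException
     */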
    public synchronized void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        controller = c;
        controller.addCrawlStatusListener(this);

        queueAssignmentPolicy = new HostnameQueueAssignmentPolicy();

        hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
            c.getBdbEnvironment().getClassCatalog());

        if (((Boolean) getUncheckedAttribute(null,
                ATTR_USE_URI_UNIQ_FILTER)).booleanValue()) {
            alreadyIncluded = createAlreadyIncluded();
        } else {
            alreadyIncluded = null;
        }

        loadSeeds();
    }

    /**
     * Create a UriUniqFilter that will serve as a record
     * of already seen URIs.
     *
     * @return A UriUniqFilter that will serve as a record of already seen
     * URIs
     * @throws IOException
     */
    protected UriUniqFilter createAlreadyIncluded() throws IOException {
        UriUniqFilter uuf = new BdbUriUniqFilter(
            this.controller.getBdbEnvironment());
        uuf.setDestination(this);
        return uuf;
    }

    /**
     * Loads the seeds.
     * <p>
     * This method is called by initialize() and kickUpdate().
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        // Get the seeds to refresh.
        Iterator iter = this.controller.getScope()
            .seedsIterator(ignoredWriter);
        while (iter.hasNext()) {
            CandidateURI caUri = CandidateURI
                .createSeedCandidateURI((UURI) iter.next());
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            schedule(caUri);
        }
        batchFlush();
        // Save ignored items (if any) where they can be consulted later.
        AbstractFrontier.saveIgnoredItems(ignoredWriter.toString(),
            controller.getDisk());
    }

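    /**
     * Derive the queue (class) key for a URI: either the operator-forced
     * queue name or, in the typical case, the key produced by the queue
     * assignment policy (optionally with any leading www prefix stripped).
     *
     * @param cauri The URI to derive a key for.
     * @return The queue key for <code>cauri</code>.
     */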
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String) getUncheckedAttribute(cauri,
            ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // Typical case, barring overrides
            queueKey = queueAssignmentPolicy.getClassKey(controller, cauri);
            // The queueAssignmentPolicy is always based on hostnames.
            // We may need to remove any www[0-9]* prefixes from the
            // hostnames.
            if (((Boolean) getUncheckedAttribute(cauri,
                    ATTR_QUEUE_IGNORE_WWW)).booleanValue()) {
                queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.", "");
            }
        }
        return queueKey;
    }

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function was encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in
     * operation for a particular URI -- it's not so easy; each CandidateURI
     * would need a reference to the settings system. That's awkward to pass
     * in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at
     * the CandidateURI context, possibly overriding any canonicalization
     * effect if it could make us miss content. If canonicalization produces
     * a URL that was 'alreadyseen', but the entry in the 'alreadyseen'
     * database did nothing but redirect to the current URL, we won't get
     * the current URL; we'll think we've already seen it. Examples would be
     * archive.org redirecting to www.archive.org or the inverse,
     * www.netarkivet.net redirecting to netarkivet.net (assuming the
     * stripWWW rule is enabled).
     * <p>Note, this method under certain circumstances sets the forceFetch
     * flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If the via is not the same as where we're being redirected
            // (i.e. we're not being redirected back to the same page), AND
            // the canonicalization of the via equals that of the current
            // cauri, THEN forcefetch (forcefetch so there is no chance of
            // missing content because the alreadyseen check thinks it has
            // seen the URL before).
            // An example of a URL that redirects to itself is:
            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
            // An example of a URL whose canonicalization equals its via's
            // canonicalization, and we want to fetch content at the
            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
            if (!cauri.toString().equals(cauri.getVia().toString())
                    && canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * Schedule the given URI into its host queue, creating a CrawlURI for
     * it if necessary.
     *
     * @param caUri The URI to schedule.
     */
    protected void innerSchedule(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI) caUri;
        } else {
            curi = CrawlURI.from(caUri, System.currentTimeMillis());
            // New CrawlURIs get 'current time' as the time of next
            // processing.
            curi.putLong(A_TIME_OF_NEXT_PROCESSING,
                System.currentTimeMillis());
        }

        if (curi.getClassKey() == null) {
            curi.setClassKey(getClassKey(curi));
        }

        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect. Add it to the seeds list.
            //
            // This is a feature. This is handling for case where a seed
            // gets immediately redirected to another page. What we're doing
            // is treating the immediate redirect target as a seed.
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            curi.setSchedulingDirective(CandidateURI.MEDIUM);
        }

        // Optionally preference embeds up to MEDIUM
        int prefHops = ((Integer) getUncheckedAttribute(curi,
            ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        boolean prefEmbed = false;
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // number of embed hops falls within the preferenced range,
                // and uri is not already MEDIUM -- so promote it
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
                prefEmbed = true;
            }
        }

        // Finally, allow curi to be fetched right now
        // (while not overriding overdue items)
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis());

        try {
            logger.finest("scheduling " + curi.toString());
            AdaptiveRevisitHostQueue hq = getHQ(curi);
            hq.add(curi, prefEmbed);
        } catch (IOException e) {
            // TODO Handle IOExceptions
            e.printStackTrace();
        }
    }

    /**
     * Get the AdaptiveRevisitHostQueue for the given CrawlURI, creating
     * it if necessary.
     *
     * @param curi CrawlURI for which to get a queue
     * @return AdaptiveRevisitHostQueue for given CrawlURI
     * @throws IOException
     */
    protected AdaptiveRevisitHostQueue getHQ(CrawlURI curi)
            throws IOException {
        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        if (hq == null) {
            // Need to create it.
            int valence = DEFAULT_HOST_VALENCE.intValue();
            try {
                valence = ((Integer) getAttribute(curi,
                    ATTR_HOST_VALENCE)).intValue();
            } catch (AttributeNotFoundException e2) {
                logger.severe("Unable to load valence.");
            }
            hq = hostQueues.createHQ(curi.getClassKey(), valence);
        }
        return hq;
    }

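    /**
     * Add the URI to the current thread's batch; it will not enter a host
     * queue until the batch is flushed.
     *
     * @param caUri The URI to batch for later scheduling.
     */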
    protected void batchSchedule(CandidateURI caUri) {
        threadWaiting.getQueue().enqueue(caUri);
    }

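    /** Flush the current thread's batch of scheduled URIs into the queues. */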
    protected void batchFlush() {
        innerBatchFlush();
    }

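    /**
     * Drain the current thread's batch, routing each URI either through the
     * 'already included' filter (when one is in use) or straight into its
     * host queue.
     */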
    private void innerBatchFlush() {
        Queue q = threadWaiting.getQueue();
        while (!q.isEmpty()) {
            CandidateURI caUri = (CandidateURI) q.dequeue();
            if (alreadyIncluded != null) {
                String canon = canonicalize(caUri);
                logger.finest("Canonicalization of " + caUri + " is "
                    + canon);
                if (caUri.forceFetch()) {
                    alreadyIncluded.addForce(canon, caUri);
                } else {
                    alreadyIncluded.add(canon, caUri);
                }
            } else {
                innerSchedule(caUri);
            }
        }
    }

    /**
     * @param curi
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#next()
     */
    public synchronized CrawlURI next() throws InterruptedException,
            EndedException {
        controller.checkFinish();

        while (shouldPause) {
            controller.toePaused();
            wait();
        }

        if (shouldTerminate) {
            throw new EndedException("terminated");
        }

        AdaptiveRevisitHostQueue hq = hostQueues.getTopHQ();

        while (hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_READY) {
            // Ok, so we don't have a ready queue; wait until the top one
            // becomes available.
            long waitTime = hq.getNextReadyTime()
                - System.currentTimeMillis();
            if (waitTime > 0) {
                wait(waitTime);
            }
            // The top HQ may have changed, so get it again.
            hq = hostQueues.getTopHQ();
        }

        if (shouldTerminate) {
            // May have been terminated while thread was waiting for IO
            throw new EndedException("terminated");
        }

        try {
            CrawlURI curi = hq.next();
            // Populate CURI with 'transient' variables such as server.
            logger.fine("Issuing " + curi.toString());
            long temp = curi.getLong(A_TIME_OF_NEXT_PROCESSING);
            long currT = System.currentTimeMillis();
            long overdue = (currT - temp);
            if (logger.isLoggable(Level.FINER)) {
                String waitI = "not set";
                if (curi.containsKey(A_WAIT_INTERVAL)) {
                    waitI = ArchiveUtils.formatMillisecondsToConventional(
                        curi.getLong(A_WAIT_INTERVAL));
                }
                logger.finer("Wait interval: " + waitI
                    + ", Time of next proc: " + temp
                    + ", Current time: " + currT + ", Overdue by: "
                    + overdue + "ms");
            }
            if (overdue < 0) {
                // This should never happen.
                logger.severe("Time overdue for " + curi.toString()
                    + " is negative (" + overdue + ")!");
            }
            curi.putLong(A_FETCH_OVERDUE, overdue);
            return curi;
        } catch (IOException e) {
            // TODO: Need to handle this in an intelligent manner.
            // Is probably fatal?
            e.printStackTrace();
        }

        return null;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#isEmpty()
     */
    public boolean isEmpty() {
        // Technically, the Frontier should never become empty since URIs
        // are only discarded under exceptional circumstances.
        return hostQueues.getSize() == 0;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#schedule(org.archive.crawler.datamodel.CandidateURI)
     */
    public void schedule(CandidateURI caURI) {
        batchSchedule(caURI);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#finished(org.archive.crawler.datamodel.CrawlURI)
     */
    public synchronized void finished(CrawlURI curi) {
        logger.fine(curi.toString() + " "
            + CrawlURI.fetchStatusCodesToString(curi.getFetchStatus()));
        curi.incrementFetchAttempts();
        logLocalizedErrors(curi);

        innerFinished(curi);
    }

    protected synchronized void innerFinished(CrawlURI curi) {
        try {
            innerBatchFlush();

            if (curi.isSuccess()) {
                successDisposition(curi);
            } else if (needsPromptRetry(curi)) {
                // Consider statuses which allow nearly-immediate retry
                // (like deferred to allow precondition to be fetched)
                reschedule(curi, false);
            } else if (needsRetrying(curi)) {
                // Consider errors which can be retried
                reschedule(curi, true);
                controller.fireCrawledURINeedRetryEvent(curi);
            } else if (isDisregarded(curi)) {
                // Check for codes that mean that while the crawler did
                // manage to get it, it must be disregarded for some reason.
                disregardDisposition(curi);
            } else {
                // In that case FAILURE, note & log
                failureDisposition(curi);
            }

            // New items might be available, let waiting threads know.
            // More than one queue might have become available due to
            // scheduling of items outside the parent URI's host, so we
            // wake all waiting threads.
            notifyAll();
        } catch (RuntimeException e) {
            curi.setFetchStatus(S_RUNTIME_EXCEPTION);
            // store exception temporarily for logging
            logger.warning("RTE in innerFinished() " + e.getMessage());
            e.printStackTrace();
            curi.putObject(A_RUNTIME_EXCEPTION, e);
            failureDisposition(curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
    }

    /**
     * Take note of any processor-local errors that have
     * been entered into the CrawlURI.
     * @param curi CrawlURI with errors.
     */
    private void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List) curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = { curi, iter.next() };
                controller.localErrors.log(Level.WARNING,
                    curi.getUURI().toString(), array);
            }
            // once logged, discard
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * The CrawlURI has been successfully crawled.
     *
     * @param curi The CrawlURI
     */
    protected void successDisposition(CrawlURI curi) {
        curi.aboutToLog();

        long waitInterval = 0;

        if (curi.containsKey(A_WAIT_INTERVAL)) {
            waitInterval = curi.getLong(A_WAIT_INTERVAL);
            curi.addAnnotation("wt:"
                + ArchiveUtils.formatMillisecondsToConventional(
                    waitInterval));
        } else {
            logger.severe("Missing wait interval for " + curi.toString()
                + ". WaitEvaluator may be missing.");
        }
        if (curi.containsKey(A_NUMBER_OF_VISITS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");
        }
        if (curi.containsKey(A_NUMBER_OF_VERSIONS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");
        }
        if (curi.containsKey(A_FETCH_OVERDUE)) {
            curi.addAnnotation("ov:"
                + ArchiveUtils.formatMillisecondsToConventional(
                    curi.getLong(A_FETCH_OVERDUE)));
        }

        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO,
            curi.getUURI().toString(), array);

        succeededFetchCount++;
        totalProcessedBytes += curi.getContentSize();

        // Let everyone know in case they want to do something before we
        // strip the curi.
        controller.fireCrawledURISuccessfulEvent(curi);

        curi.setSchedulingDirective(CandidateURI.NORMAL);

        // Set time of next processing
        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
            System.currentTimeMillis() + waitInterval);

        /* Update HQ */
        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());

        // Wake up time is based on the time when a fetch was completed plus
        // the calculated snooze time for politeness. If the fetch
        // completion time is missing, we'll use the current time.
        long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)
                ? curi.getLong(A_FETCH_COMPLETED_TIME)
                : (new Date()).getTime())
            + calculateSnoozeTime(curi);

        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            hq.update(curi, true, wakeupTime);
        } catch (IOException e) {
            logger.severe("An IOException occurred when updating "
                + curi.toString() + "\n" + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Put near top of relevant hostQueue (but behind anything recently
     * scheduled 'high').
     *
     * @param curi CrawlURI to reschedule. Its time of next processing is
     * not modified.
     * @param errorWait signals if there should be a wait before retrying.
     * @throws AttributeNotFoundException
     */
    protected void reschedule(CrawlURI curi, boolean errorWait)
            throws AttributeNotFoundException {
        long delay = 0;
        if (errorWait) {
            if (curi.containsKey(A_RETRY_DELAY)) {
                delay = curi.getLong(A_RETRY_DELAY);
            } else {
                // use ARFrontier default
                delay = ((Long) getAttribute(ATTR_RETRY_DELAY, curi))
                    .longValue();
            }
        }

        long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME)
                ? curi.getLong(A_FETCH_COMPLETED_TIME)
                : (new Date()).getTime())
            + delay;

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        if (errorWait) {
            curi.resetDeferrals(); // Deferrals only refer to immediate
                                   // retries.
        }
        try {
            hq.update(curi, errorWait, retryTime);
        } catch (IOException e) {
            // TODO Handle IOException
            e.printStackTrace();
        }
    }

    /**
     * The CrawlURI has encountered a problem, and will not
     * be retried.
     *
     * @param curi The CrawlURI
     */
    protected void failureDisposition(CrawlURI curi) {
        // Let interested listeners know of failed disposition.
        this.controller.fireCrawledURIFailureEvent(curi);

        // send to basic log
        curi.aboutToLog();
        Object array[] = { curi };
        this.controller.uriProcessing.log(Level.INFO,
            curi.getUURI().toString(), array);

        // if exception, also send to crawlErrors
        if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
            this.controller.runtimeErrors.log(Level.WARNING,
                curi.getUURI().toString(), array);
        }
        failedFetchCount++;

        // Put the failed URI at the very back of the queue.
        curi.setSchedulingDirective(CandidateURI.NORMAL);
        // TODO: reconsider this
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for serialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            // No wait on failure. No contact was made with the server.
            boolean shouldForget = shouldBeForgotten(curi);
            if (shouldForget && alreadyIncluded != null) {
                alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
            }
            hq.update(curi, false, 0, shouldForget);
        } catch (IOException e) {
            // TODO Handle IOException
            e.printStackTrace();
        }
    }

    protected void disregardDisposition(CrawlURI curi) {
        // Let interested listeners know of disregard disposition.
        controller.fireCrawledURIDisregardEvent(curi);

        // send to basic log
        curi.aboutToLog();
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO,
            curi.getUURI().toString(), array);

        disregardedUriCount++;

        // TODO: consider timeout before retrying disregarded elements.
        // Possibly add a setting to the WaitEvaluators?
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);
        curi.setSchedulingDirective(CandidateURI.NORMAL);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            // No politeness wait on disregard. No contact was made with
            // the server.
            hq.update(curi, false, 0, shouldBeForgotten(curi));
        } catch (IOException e) {
            // TODO Handle IOException
            e.printStackTrace();
        }
    }

    /**
     * Some URIs, if they recur, deserve another
     * chance at consideration: they might not be too
     * many hops away via another path, or the scope
     * may have been updated to allow them passage.
     *
     * @param curi
     * @return True if curi should be forgotten.
     */
    protected boolean shouldBeForgotten(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_OUT_OF_SCOPE:
        case S_TOO_MANY_EMBED_HOPS:
        case S_TOO_MANY_LINK_HOPS:
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish
     * successfully needs to be retried immediately (processed again as
     * soon as politeness allows).
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry promptly.
     * @throws AttributeNotFoundException If problems occur trying to read
     * the maximum number of retries from the settings framework.
     */
    protected boolean needsPromptRetry(CrawlURI curi)
            throws AttributeNotFoundException {
        if (curi.getFetchAttempts() >= ((Integer) getAttribute(
                ATTR_MAX_RETRIES, curi)).intValue()) {
            return false;
        }

        switch (curi.getFetchStatus()) {
        case S_DEFERRED:
            return true;

        case HttpStatus.SC_UNAUTHORIZED:
            // We can get here even though a positive status code usually
            // means success. We get here if there is rfc2617 credential
            // data loaded and we're supposed to go around again. See if
            // any rfc2617 credential is present and, if so, assume it got
            // loaded in FetchHTTP on the expectation that we're to go
            // around again. If no rfc2617 credential is loaded, we should
            // not be here.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded) {
                logger.severe("Have 401 but no creds loaded " + curi);
            }
            return loaded;

        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish
     * successfully needs to be retried (processed again after some time
     * elapses).
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry.
     * @throws AttributeNotFoundException If problems occur trying to read
     * the maximum number of retries from the settings framework.
     */
    protected boolean needsRetrying(CrawlURI curi)
            throws AttributeNotFoundException {
        // Check to see if maximum number of retries has been exceeded.
        if (curi.getFetchAttempts() >= ((Integer) getAttribute(
                ATTR_MAX_RETRIES, curi)).intValue()) {
            return false;
        } else {
            // Check if FetchStatus indicates that a delayed retry is
            // needed.
            switch (curi.getFetchStatus()) {
            case S_CONNECT_FAILED:
            case S_CONNECT_LOST:
            case S_DOMAIN_UNRESOLVABLE:
                // these are all worth a retry
                // TODO: consider if any others (S_TIMEOUT in some cases?)
                // deserve retry
                return true;
            default:
                return false;
            }
        }
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Calculates how long a host queue needs to be snoozed following the
     * crawling of a URI.
     *
     * @param curi The CrawlURI
     * @return How long to snooze, in milliseconds.
     */
    protected long calculateSnoozeTime(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
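            // Snooze time is delay-factor * fetch duration, clamped to
            // [min-delay-ms, max-delay-ms].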

            try {
                long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
                long durationTaken =
                    completeTime - curi.getLong(A_FETCH_BEGAN_TIME);

                durationToWait = (long) (((Float) getAttribute(
                    ATTR_DELAY_FACTOR, curi)).floatValue() * durationTaken);

                long minDelay = ((Integer) getAttribute(ATTR_MIN_DELAY,
                    curi)).longValue();

                if (minDelay > durationToWait) {
                    // wait at least the minimum
                    durationToWait = minDelay;
                }

                long maxDelay = ((Integer) getAttribute(ATTR_MAX_DELAY,
                    curi)).longValue();
                if (durationToWait > maxDelay) {
                    // wait no more than the maximum
                    durationToWait = maxDelay;
                }
            } catch (AttributeNotFoundException e) {
                logger.severe("Unable to find attribute. "
                    + curi.toString());
                // Wait for max interval.
                durationToWait = DEFAULT_MAX_DELAY.longValue();
            }
        }
        long ret = durationToWait > DEFAULT_MIN_DELAY.longValue()
            ? durationToWait : DEFAULT_MIN_DELAY.longValue();
        logger.finest("Snooze time for " + curi.toString() + " = " + ret);
        return ret;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public synchronized long discoveredUriCount() {
        return (this.alreadyIncluded != null)
            ? this.alreadyIncluded.count() : hostQueues.getSize();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public synchronized long queuedUriCount() {
        return hostQueues.getSize();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return succeededFetchCount + failedFetchCount + disregardedUriCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        return succeededFetchCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        return failedFetchCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#disregardedUriCount()
     */
    public long disregardedUriCount() {
        return disregardedUriCount;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#totalBytesWritten()
     */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Method is not supported by this Frontier implementation.
     * @param pathToLog
     * @throws IOException
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new IOException("Unsupported by this frontier.");
    }

    public synchronized FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#getURIsList(org.archive.crawler.framework.FrontierMarker, int, boolean)
     */
    public synchronized ArrayList getURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
            throws InvalidFrontierMarkerException {
        // TODO Auto-generated method stub
        return null;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#deleteURIs(java.lang.String)
     */
    public synchronized long deleteURIs(String match) {
        // TODO Auto-generated method stub
        return 0;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#deleted(org.archive.crawler.datamodel.CrawlURI)
     */
    public synchronized void deleted(CrawlURI curi) {
        // TODO Auto-generated method stub
    }

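    /**
     * Note: in this implementation, marking a URI as included also
     * schedules it, so the passed URI will be crawled.
     *
     * @param u The URI to consider included.
     */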
    public void considerIncluded(UURI u) {
        // This will cause the URI to be crawled!!!
        CrawlURI curi = new CrawlURI(u);
        innerSchedule(curi);
    }

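    /** Reloads the seeds when an update is kicked off by the operator. */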
    public void kickUpdate() {
        loadSeeds();
    }

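    /** Start the Frontier by unpausing it. */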
    public void start() {
        unpause();
    }

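    /** Request a pause; worker threads will block in next() until
     * unpaused. */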
    synchronized public void pause() {
        shouldPause = true;
        notifyAll();
    }

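    /** Resume issuing URIs, waking any threads blocked in next(). */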
    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

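    /** Flag the Frontier for termination; subsequent calls to next() will
     * throw an EndedException. */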
    synchronized public void terminate() {
        shouldTerminate = true;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#getFrontierJournal()
     */
    public FrontierJournal getFrontierJournal() {
        return null;
    }

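    /**
     * Holds a per-thread queue of 'batched' CandidateURIs awaiting a flush
     * into the Frontier proper.
     */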
    private static class ThreadLocalQueue extends
            ThreadLocal<Queue<CandidateURI>> implements Serializable {

        private static final long serialVersionUID = 8268977225156462059L;

        protected Queue<CandidateURI> initialValue() {
            return new MemQueue<CandidateURI>();
        }

        /**
         * @return Queue of 'batched' items
         */
        public Queue<CandidateURI> getQueue() {
            return get();
        }
    }

    /**
     * This method is not supported by this Frontier implementation.
     * @param pathToLog
     * @param retainFailures
     * @throws IOException
     */
    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        throw new IOException("Unsupported");
    }

    //
    // Reporter implementation
    //

    public String[] getReports() {
        // none but default for now
        return new String[] {};
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineReport()
     */
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
     */
    public void reportTo(PrintWriter writer) throws IOException {
        reportTo(null, writer);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#oneLineReport()
     */
    public synchronized void singleLineReportTo(PrintWriter w)
            throws IOException {
        hostQueues.singleLineReportTo(w);
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineLegend()
     */
    public String singleLineLegend() {
        return hostQueues.singleLineLegend();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#report()
     */
    public synchronized void reportTo(String name, PrintWriter writer) {
        // ignore name; only one report for now
        hostQueues.reportTo(name, writer);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlStarted(String message) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
     */
    public void crawlEnding(String sExitMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
     */
    public void crawlEnded(String sExitMessage) {
        // Cleanup!
        if (this.alreadyIncluded != null) {
            this.alreadyIncluded.close();
            this.alreadyIncluded = null;
        }
        hostQueues.close();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
     */
    public void crawlPausing(String statusMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
     */
    public void crawlPaused(String statusMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
     */
    public void crawlResuming(String statusMessage) {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Not interested
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
     */
    public void receive(CandidateURI item) {
        logger.finest("Received " + item);
        innerSchedule(item);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#getGroup(org.archive.crawler.datamodel.CrawlURI)
     */
    public FrontierGroup getGroup(CrawlURI curi) {
        try {
            return getHQ(curi);
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

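    /** @return Average depth of the host queues. */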
    public long averageDepth() {
        return hostQueues.getAverageDepth();
    }

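    /** @return Congestion ratio, as computed by the host queue list. */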
    public float congestionRatio() {
        return hostQueues.getCongestionRatio();
    }

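    /** @return Size of the deepest host queue. */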
    public long deepestUri() {
        return hostQueues.getDeepestQueueSize();
    }
}
|