001: /* RuntimeLimitEnforcer
002: *
003: * Created on July 7, 2006
004: *
005: * Copyright (C) 2005 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.prefetch;
024:
025: import java.util.logging.Level;
026: import java.util.logging.Logger;
027:
028: import org.archive.crawler.admin.CrawlJob;
029: import org.archive.crawler.datamodel.CrawlURI;
030: import org.archive.crawler.datamodel.FetchStatusCodes;
031: import org.archive.crawler.framework.Processor;
032: import org.archive.crawler.settings.SimpleType;
033: import org.archive.crawler.settings.Type;
034:
035: /**
036: * A processor to enforce runtime limits on crawls.
037: * <p>
038: * This processor extends and improves on the 'max-time' capability of Heritrix.
039: * Essentially, the 'Terminate job' option functions the same way as 'max-time'.
040: * The processor however also enables pausing when the runtime is exceeded and
041: * the blocking of all URIs.
042: * <p>
043: * <ol>
044: * <li>Pause job - Pauses the crawl. A change (increase) to the
045: * runtime duration will make it pausible to resume the crawl.
046: * Attempts to resume the crawl without modifying the run time
047: * will cause it to be immediately paused again.</li>
048: * <li>Terminate job - Terminates the job. Equivalent
049: * to using the max-time setting on the CrawlController.</li>
050: * <li>Block URIs - Blocks each URI with an -5002
051: * (blocked by custom processor) fetch status code. This will
052: * cause all the URIs queued to wind up in the crawl.log.</li>
053: * <ol>
054: * <p>
055: * The processor allows variable runtime based on host (or other
056: * override/refinement criteria) however using such overrides only makes sense
057: * when using 'Block URIs' as pause and terminate will have global impact once
058: * encountered anywhere.
059: *
060: * @author Kristinn Sigurðsson
061: */
062: public class RuntimeLimitEnforcer extends Processor implements
063: FetchStatusCodes {
064:
065: private static final long serialVersionUID = 1L;
066:
067: protected Logger logger = Logger
068: .getLogger(RuntimeLimitEnforcer.class.getName());
069:
070: public static final String ATTR_RUNTIME_SECONDS = "runtime-sec"
071: .intern();
072: protected static final long DEFAULT_RUNTIME_SECONDS = 86400; // 1 day
073:
074: public static final String ATTR_END_OPERATION = "end-operation"
075: .intern();
076: protected static final String OP_PAUSE = "Pause job".intern();
077: protected static final String OP_TERMINATE = "Terminate job"
078: .intern();
079: protected static final String OP_BLOCK_URIS = "Block URIs".intern();
080: protected static final String DEFAULT_END_OPERATION = OP_PAUSE;
081: protected static final String[] AVAILABLE_END_OPERATIONS = {
082: OP_PAUSE, OP_TERMINATE, OP_BLOCK_URIS };
083:
084: public RuntimeLimitEnforcer(String name) {
085: super (
086: name,
087: "A processor that halts further progress once a fixed "
088: + "amount of time has elapsed since the start of a crawl. "
089: + "It is possible to configure this processor per host, but "
090: + "it should be noted that Heritrix does not track runtime "
091: + "per host seperately. Especially when using facilities "
092: + "like the BdbFrontier's hold-queues, the actual amount of "
093: + "time spent crawling a host may have little relevance to "
094: + "total elapsed time. Note however that using overrides "
095: + "and/or refinements only makes sense when using the "
096: + "'Block URIs' end operation. The pause and terminate "
097: + "operations have global impact once encountered.");
098: Type t = new SimpleType(
099: ATTR_RUNTIME_SECONDS,
100: "The amount of time, in seconds, that the crawl will be "
101: + "allowed to run before this processor performs it's 'end "
102: + "operation.'", DEFAULT_RUNTIME_SECONDS);
103: addElementToDefinition(t);
104: t = new SimpleType(
105: ATTR_END_OPERATION,
106: "The action that the processor takes once the runtime has "
107: + "elapsed.\n "
108: + "Operation: Pause job - Pauses the crawl. A change "
109: + "(increase) to the runtime duration will "
110: + "make it pausible to resume the crawl. Attempts to resume "
111: + "the crawl without modifying the run time will cause it to "
112: + "be immediately paused again.\n "
113: + "Operation: Terminate job - Terminates the job. Equivalent "
114: + "to using the max-time setting on the CrawlController.\n "
115: + "Operation: Block URIs - Blocks each URI with an -5002 "
116: + "(blocked by custom processor) fetch status code. This will "
117: + "cause all the URIs queued to wind up in the crawl.log.",
118: DEFAULT_END_OPERATION, AVAILABLE_END_OPERATIONS);
119: addElementToDefinition(t);
120: }
121:
122: protected void innerProcess(CrawlURI curi)
123: throws InterruptedException {
124: long allowedRuntime = getRuntime(curi);
125: long currentRuntime = getController().getStatistics()
126: .crawlDuration();
127: if (currentRuntime > allowedRuntime) {
128: String op = (String) getUncheckedAttribute(curi,
129: ATTR_END_OPERATION);
130: if (op != null) {
131: if (op.equals(OP_PAUSE)) {
132: getController().requestCrawlPause();
133: } else if (op.equals(OP_TERMINATE)) {
134: getController().requestCrawlStop(
135: CrawlJob.STATUS_FINISHED_TIME_LIMIT);
136: } else if (op.equals(OP_BLOCK_URIS)) {
137: curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
138: curi.addAnnotation("Runtime exceeded "
139: + allowedRuntime + "ms");
140: curi.skipToProcessorChain(getController()
141: .getPostprocessorChain());
142: }
143: } else {
144: logger.log(Level.SEVERE, "Null value for "
145: + ATTR_END_OPERATION + " when processing "
146: + curi.toString());
147: }
148: }
149: }
150:
151: /**
152: * Returns the amount of time to allow the crawl to run before this
153: * processor interrupts.
154: * @return the amount of time in milliseconds.
155: */
156: protected long getRuntime(CrawlURI curi) {
157: Object o = getUncheckedAttribute(curi, ATTR_RUNTIME_SECONDS);
158: if (o == null) {
159: logger.log(Level.SEVERE, "Null value for "
160: + ATTR_RUNTIME_SECONDS + " when processing "
161: + curi.toString());
162: return Long.MAX_VALUE;
163: }
164: return ((Long) o).longValue() * 1000; //extract value and convert to ms.
165: }
166:
167: }
|