001: /* WaitEvaluator
002: *
003: * $Id: WaitEvaluator.java 4654 2006-09-25 20:19:54Z paul_jack $
004: *
005: * Created on 26.11.2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.postprocessor;
026:
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import javax.management.AttributeNotFoundException;
031:
032: import org.archive.crawler.datamodel.CrawlURI;
033: import org.archive.crawler.framework.Processor;
034: import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
035: import org.archive.crawler.settings.SimpleType;
036:
037: /**
038: * A processor that determines when a URI should be revisited next. Does
039: * <b>not</b> account for DNS and robots.txt expiration. That should be
040: * handled seperately by the Frontiers.
041: *
042: * @author Kristinn Sigurdsson
043: */
044: public class WaitEvaluator extends Processor implements
045: AdaptiveRevisitAttributeConstants {
046:
047: private static final long serialVersionUID = 7452762726125458413L;
048:
049: Logger logger = Logger.getLogger(WaitEvaluator.class.getName());
050:
051: /** Default wait time after initial visit. */
052: public final static String ATTR_INITIAL_WAIT_INTERVAL = "initial-wait-interval-seconds";
053: protected final static Long DEFAULT_INITIAL_WAIT_INTERVAL = new Long(
054: 86400); // 1 day
055: /** Maximum wait between visits */
056: public final static String ATTR_MAX_WAIT_INTERVAL = "max-wait-interval-seconds";
057: protected final static Long DEFAULT_MAX_WAIT_INTERVAL = new Long(
058: 2419200); // 4 weeks
059: /** Minimum wait between visits */
060: public final static String ATTR_MIN_WAIT_INTERVAL = "min-wait-interval-seconds";
061: protected final static Long DEFAULT_MIN_WAIT_INTERVAL = new Long(
062: 3600); // 1 hour
063: /** Factor increase on wait when unchanged */
064: public final static String ATTR_UNCHANGED_FACTOR = "unchanged-factor";
065: protected final static Double DEFAULT_UNCHANGED_FACTOR = new Double(
066: 1.5);
067: /** Factor decrease on wait when changed */
068: public final static String ATTR_CHANGED_FACTOR = "changed-factor";
069: protected final static Double DEFAULT_CHANGED_FACTOR = new Double(
070: 1.5);
071: /** Fixed wait time for 'unknown' change status. I.e. wait time for URIs
072: * whose content change detection is not available. */
073: public final static String ATTR_DEFAULT_WAIT_INTERVAL = "default-wait-interval-seconds";
074: protected final static Long DEFAULT_DEFAULT_WAIT_INTERVAL = new Long(
075: 259200); // 3 days
076: /** Indicates if the amount of time the URI was overdue should be added
077: * to the wait time before the new wait time is calculated. */
078: public final static String ATTR_USE_OVERDUE_TIME = "use-overdue-time";
079: protected final static Boolean DEFAULT_USE_OVERDUE_TIME = new Boolean(
080: false);
081:
082: /**
083: * Constructor
084: *
085: * @param name The name of the module
086: */
087: public WaitEvaluator(String name) {
088: this (
089: name,
090: "Evaluates how long to wait before fetching a URI again. "
091: + "Typically, this processor should be in the post processing "
092: + "chain. It will pass if another wait evaluator has already "
093: + "processed the CrawlURI.",
094: DEFAULT_INITIAL_WAIT_INTERVAL,
095: DEFAULT_MAX_WAIT_INTERVAL, DEFAULT_MIN_WAIT_INTERVAL,
096: DEFAULT_UNCHANGED_FACTOR, DEFAULT_CHANGED_FACTOR);
097: }
098:
099: /**
100: * Constructor
101: *
102: * @param name The name of the module
103: * @param description Description of the module
104: * @param default_inital_wait_interval The default value for initial wait
105: * time
106: * @param default_max_wait_interval The maximum value for wait time
107: * @param default_min_wait_interval The minimum value for wait time
108: * @param default_unchanged_factor The factor for changing wait times of
109: * unchanged documents (will be multiplied by this value)
110: * @param default_changed_factor The factor for changing wait times of
111: * changed documents (will be divided by this value)
112: */
113: public WaitEvaluator(String name, String description,
114: Long default_inital_wait_interval,
115: Long default_max_wait_interval,
116: Long default_min_wait_interval,
117: Double default_unchanged_factor,
118: Double default_changed_factor) {
119: super (name, description);
120:
121: addElementToDefinition(new SimpleType(
122: ATTR_INITIAL_WAIT_INTERVAL,
123: "The initial wait time between revisits. Will then be "
124: + "updated according to crawler experiance. I.e. shorter "
125: + "wait, visit more often, if document has changed between "
126: + "visits, and vica versa.",
127: default_inital_wait_interval));
128: addElementToDefinition(new SimpleType(
129: ATTR_MAX_WAIT_INTERVAL,
130: "The maximum settable wait time between revisits. Once a "
131: + "URIs wait time reaches this value, it will not grow "
132: + "further, regardless of subsequent visits that discover "
133: + "no changes. Note that this does not ensure that the URI "
134: + "does not wait any longer, since the crawler might be "
135: + "'behind,' forcing a URI to wait until other URIs, "
136: + "scheduled for earlier are completed..",
137: default_max_wait_interval));
138: addElementToDefinition(new SimpleType(
139: ATTR_MIN_WAIT_INTERVAL,
140: "The minum settable wait time between revisits. Once a "
141: + "URIs wait time reaches this value, it will not be shortened "
142: + "further, regardlesss of subsequent visits that discover "
143: + "changes.", default_min_wait_interval));
144: addElementToDefinition(new SimpleType(
145: ATTR_DEFAULT_WAIT_INTERVAL,
146: "Fixed wait time for 'unknown' change status. I.e. wait time "
147: + "for URIs whose content change detection is not available.",
148: DEFAULT_DEFAULT_WAIT_INTERVAL));
149: addElementToDefinition(new SimpleType(
150: ATTR_UNCHANGED_FACTOR,
151: "The factor by which a URIs wait time is increased when a "
152: + "revisit reveals an unchanged document. A value of 1 will "
153: + "leave it unchanged, a value of 2 will double it etc.",
154: default_unchanged_factor));
155: addElementToDefinition(new SimpleType(
156: ATTR_CHANGED_FACTOR,
157: "The factor by which a URIs wait time is decreased when a "
158: + "revisit reveals a changed document. A value of 1 will leave "
159: + "it unchanged, a value of two will half it etc.",
160: default_changed_factor));
161: addElementToDefinition(new SimpleType(
162: ATTR_USE_OVERDUE_TIME,
163: "Indicates if the amount of time the URI was overdue should "
164: + "be added to the wait time before the new wait time is "
165: + "calculated.", DEFAULT_USE_OVERDUE_TIME));
166:
167: // Register persistent CrawlURI items
168: CrawlURI.addAlistPersistentMember(A_WAIT_INTERVAL);
169: }
170:
171: protected void innerProcess(CrawlURI curi)
172: throws InterruptedException {
173:
174: if (curi.isSuccess() == false) {
175: // If the URI was not crawled successfully, we can not reevaluate
176: // the wait interval.
177: return;
178: }
179:
180: if (curi.containsKey(A_WAIT_REEVALUATED)
181: && ((Boolean) curi.getObject(A_WAIT_REEVALUATED))
182: .booleanValue()) {
183: // This CrawlURIs wait interval has already been reevaluted during
184: // this processing round.
185: return;
186: }
187:
188: long min;
189: try {
190: min = ((Long) getAttribute(curi, ATTR_MIN_WAIT_INTERVAL))
191: .longValue() * 1000;
192: } catch (AttributeNotFoundException e1) {
193: min = DEFAULT_MIN_WAIT_INTERVAL.longValue();
194: logger.fine("Unable to load minimum wait interval for "
195: + curi.toString());
196: }
197:
198: long max;
199: try {
200: max = ((Long) getAttribute(curi, ATTR_MAX_WAIT_INTERVAL))
201: .longValue() * 1000;
202: } catch (AttributeNotFoundException e1) {
203: max = DEFAULT_MAX_WAIT_INTERVAL.longValue();
204: logger.fine("Unable to load maximum wait interval for "
205: + curi.toString());
206: }
207:
208: long waitInterval;
209: if (!curi.containsKey(A_CONTENT_STATE_KEY)
210: || curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNKNOWN) {
211: try {
212: waitInterval = ((Long) getAttribute(curi,
213: ATTR_DEFAULT_WAIT_INTERVAL)).longValue() * 1000;
214: } catch (AttributeNotFoundException e1) {
215: waitInterval = DEFAULT_DEFAULT_WAIT_INTERVAL
216: .longValue();
217: logger.fine("Unable to load default wait interval for "
218: + curi.toString());
219: }
220: } else {
221: /* Calculate curi's time of next processing */
222: waitInterval = DEFAULT_INITIAL_WAIT_INTERVAL.longValue() * 1000;
223:
224: // Retrieve wait interval
225: if (curi.containsKey(A_WAIT_INTERVAL)) {
226: waitInterval = curi.getLong(A_WAIT_INTERVAL);
227:
228: // Should override time be taken into account?
229: boolean useOverrideTime = DEFAULT_USE_OVERDUE_TIME
230: .booleanValue();
231: try {
232: useOverrideTime = ((Boolean) getAttribute(curi,
233: ATTR_USE_OVERDUE_TIME)).booleanValue();
234: } catch (AttributeNotFoundException e1) {
235: useOverrideTime = DEFAULT_USE_OVERDUE_TIME
236: .booleanValue();
237: logger.fine("Unable to load use-overdue-time for "
238: + curi.toString());
239: }
240:
241: if (useOverrideTime) {
242: waitInterval += curi.getLong(A_FETCH_OVERDUE);
243: }
244:
245: // Revise the wait interval
246: if (curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
247: // Had changed. Decrease wait interval time.
248: double factor;
249: try {
250: factor = ((Double) getAttribute(curi,
251: ATTR_CHANGED_FACTOR)).doubleValue();
252: } catch (AttributeNotFoundException e2) {
253: factor = DEFAULT_CHANGED_FACTOR.doubleValue();
254: logger
255: .fine("Unable to load changed factor for "
256: + curi.toString());
257: }
258: waitInterval = (long) (waitInterval / factor);
259: } else if (curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNCHANGED) {
260: // Had not changed. Increase wait interval time
261: double factor;
262: try {
263: factor = ((Double) getAttribute(curi,
264: ATTR_UNCHANGED_FACTOR)).doubleValue();
265: } catch (AttributeNotFoundException e2) {
266: factor = DEFAULT_UNCHANGED_FACTOR.doubleValue();
267: logger
268: .fine("Unable to load unchanged factor for "
269: + curi.toString());
270: }
271: waitInterval = (long) (waitInterval * factor);
272: }
273: } else {
274: // If wait element not found, use initial wait interval
275: try {
276: waitInterval = ((Long) getAttribute(curi,
277: ATTR_INITIAL_WAIT_INTERVAL)).longValue() * 1000;
278: } catch (AttributeNotFoundException e1) {
279: // If this fails use default (already set) and log error.
280: logger
281: .fine("Unable to load initial wait interval for "
282: + curi.toString());
283: }
284: }
285: }
286:
287: if (waitInterval < min) {
288: waitInterval = min;
289: } else if (waitInterval > max) {
290: waitInterval = max;
291: }
292:
293: if (logger.isLoggable(Level.FINE)) {
294: logger.fine("URI " + curi.toString() + ", change: "
295: + curi.getInt(A_CONTENT_STATE_KEY)
296: + " new wait interval: " + waitInterval);
297: }
298: // Update wait interval
299: curi.putLong(A_WAIT_INTERVAL, waitInterval);
300: curi.putObject(A_WAIT_REEVALUATED, new Boolean(true));
301: }
302: }
|