001: /* FetchHistoryProcessor
002: *
003: * Created on Feb 12, 2005
004: *
005: * Copyright (C) 2007 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.processor.recrawl;
024:
025: import org.apache.commons.httpclient.Header;
026: import org.apache.commons.httpclient.HttpMethodBase;
027: import org.apache.commons.httpclient.HttpState;
028: import org.apache.commons.httpclient.HttpStatus;
029: import org.archive.crawler.datamodel.CoreAttributeConstants;
030: import org.archive.crawler.datamodel.CrawlURI;
031: import org.archive.crawler.framework.Processor;
032: import org.archive.crawler.settings.SimpleType;
033:
034: import st.ata.util.AList;
035: import st.ata.util.HashtableAList;
036:
037: /**
038: * Maintain a history of fetch information inside the CrawlURI's attributes.
039: *
040: * @author gojomo
041: * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
042: */
043: public class FetchHistoryProcessor extends Processor implements
044: CoreAttributeConstants {
045: private static final long serialVersionUID = 8476621038669163983L;
046:
047: /** setting for desired history array length */
048: public static final String ATTR_HISTORY_LENGTH = "history-length";
049: /** default history array length */
050: public static final Integer DEFAULT_HISTORY_LENGTH = 2;
051:
052: /**
053: * Usual constructor
054: *
055: * @param name
056: */
057: public FetchHistoryProcessor(String name) {
058: super (
059: name,
060: "FetchHistoryProcessor. Maintain a history of fetch "
061: + "information inside the CrawlURI's attributes..");
062:
063: addElementToDefinition(new SimpleType(
064: ATTR_HISTORY_LENGTH,
065: "Number of previous fetch entries to retain in the URI "
066: + "history. The current fetch becomes a history entry at "
067: + "this Processor step, so the smallest useful value is "
068: + "'2' (including the current fetch). Default is '2'.",
069: DEFAULT_HISTORY_LENGTH));
070: }
071:
072: @Override
073: protected void innerProcess(CrawlURI curi)
074: throws InterruptedException {
075: AList latestFetch = new HashtableAList();
076:
077: // save status
078: latestFetch.putInt(A_STATUS, curi.getFetchStatus());
079: // save fetch start time
080: latestFetch.putLong(A_FETCH_BEGAN_TIME, curi
081: .getLong(A_FETCH_BEGAN_TIME));
082: // save digest
083: String digest = curi.getContentDigestSchemeString();
084: if (digest != null) {
085: latestFetch.putString(A_CONTENT_DIGEST, digest);
086: }
087: // save relevant HTTP headers, if available
088: if (curi.containsKey(A_HTTP_TRANSACTION)) {
089: HttpMethodBase method = (HttpMethodBase) curi
090: .getObject(A_HTTP_TRANSACTION);
091: saveHeader(A_ETAG_HEADER, method, latestFetch);
092: saveHeader(A_LAST_MODIFIED_HEADER, method, latestFetch);
093: // save reference length (real or virtual)
094: long referenceLength;
095: if (curi.containsKey(A_REFERENCE_LENGTH)) {
096: // reuse previous length if available (see FetchHTTP#setSizes).
097: referenceLength = curi.getLong(A_REFERENCE_LENGTH);
098: } else {
099: // normally, use content-length
100: referenceLength = curi.getContentLength();
101: }
102: latestFetch.putLong(A_REFERENCE_LENGTH, referenceLength);
103: }
104:
105: // get or create proper-sized history array
106: int targetHistoryLength = (Integer) getUncheckedAttribute(curi,
107: ATTR_HISTORY_LENGTH);
108: AList[] history = curi.getAList().containsKey(A_FETCH_HISTORY) ? curi
109: .getAList().getAListArray(A_FETCH_HISTORY)
110: : new AList[targetHistoryLength];
111: if (history.length != targetHistoryLength) {
112: AList[] newHistory = new AList[targetHistoryLength];
113: System.arraycopy(history, 0, newHistory, 0, Math.min(
114: history.length, newHistory.length));
115: history = newHistory;
116: }
117:
118: // rotate all history entries up one slot, insert new at [0]
119: for (int i = history.length - 1; i > 0; i--) {
120: history[i] = history[i - 1];
121: }
122: history[0] = latestFetch;
123:
124: curi.getAList().putAListArray(A_FETCH_HISTORY, history);
125: }
126:
127: /**
128: * Save a header from the given HTTP operation into the AList.
129: *
130: * @param name header name to save into history AList
131: * @param method http operation containing headers
132: * @param latestFetch AList to get header
133: */
134: protected void saveHeader(String name, HttpMethodBase method,
135: AList latestFetch) {
136: Header header = method.getResponseHeader(name);
137: if (header != null) {
138: latestFetch.putString(name, header.getValue());
139: }
140: }
141:
142: @Override
143: protected void initialTasks() {
144: // ensure history info persists across enqueues and recrawls
145: CrawlURI.addAlistPersistentMember(A_FETCH_HISTORY);
146: }
147:
148: }
|