001: /* HTTPMidFetchUnchangedFilter
002: *
003: * $Id: HTTPMidFetchUnchangedFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
004: *
005: * Created on 4.2.2005
006: *
007: * Copyright (C) 2005 Kristinn Sigurðsson
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.filter;
026:
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import org.apache.commons.httpclient.HttpMethod;
031: import org.archive.crawler.datamodel.CrawlURI;
032: import org.archive.crawler.framework.Filter;
033: import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
034:
035: /**
036: * A mid fetch filter for HTTP fetcher processors. It will evaluate the HTTP
037: * header to try and predict if the document has changed since it last passed
038: * through this filter. It does this by comparing the last-modified and etag
039: * values with the same values stored during the last processing of the URI.
040: * <p>
041: * If both values are present, they must agree on predicting no change,
042: * otherwise a change is predicted (return true).
043: * <p>
044: * If only one of the values is present, it alone is used to predict if a
045: * change has occured.
046: * <p>
047: * If neither value is present the filter will return true (predict change)
048: *
049: * @author Kristinn Sigurdsson
050: */
051: public class HTTPMidFetchUnchangedFilter extends Filter implements
052: AdaptiveRevisitAttributeConstants {
053:
054: private static final long serialVersionUID = -7416477243375196980L;
055:
056: private static final Logger logger = Logger
057: .getLogger(HTTPMidFetchUnchangedFilter.class.getName());
058:
059: // Header predictor state constants
060: public static final int HEADER_PREDICTS_MISSING = -1;
061: public static final int HEADER_PREDICTS_UNCHANGED = 0;
062: public static final int HEADER_PREDICTS_CHANGED = 1;
063:
064: /**
065: * Constructor
066: *
067: * @param name Module name
068: */
069: public HTTPMidFetchUnchangedFilter(String name) {
070: this (
071: name,
072: "Filters out unchanged documents. "
073: + "Examines HTTP Header timestamp and etags. "
074: + "This filter should"
075: + "only be used in the 'midfetch-filters' on the FetchHTTP "
076: + "processor. Earlier then that, the headers are not available "
077: + "and later, the entire document is available and examining "
078: + "this will usually give better results then relying on HTTP "
079: + "headers. See documentation for further details.");
080:
081: // Register persistent CrawlURI items
082: CrawlURI.addAlistPersistentMember(A_LAST_DATESTAMP);
083: CrawlURI.addAlistPersistentMember(A_LAST_ETAG);
084: }
085:
086: /**
087: * Constructor
088: *
089: * @param name Module name
090: * @param description A description of the modules functions
091: */
092: public HTTPMidFetchUnchangedFilter(String name, String description) {
093: super (name, description);
094: }
095:
096: protected boolean innerAccepts(Object o) {
097: // Return FALSE when the document has NOT changed!
098: // Return TRUE if the document has changed or we can't tell
099: if (o instanceof CrawlURI == false) {
100: // Only handles CrawlURIs
101: if (logger.isLoggable(Level.INFO)) {
102: logger
103: .info("Error: Object passed for evaluation was not a "
104: + "CrawlURI. " + o.toString());
105: }
106: return true;
107: }
108:
109: CrawlURI curi = (CrawlURI) o;
110:
111: if (curi.isHttpTransaction() == false) {
112: // Only handles HTTP
113: if (logger.isLoggable(Level.INFO)) {
114: logger
115: .info("Error: Non HTTP CrawlURI was passed for evalution. "
116: + curi.toString());
117: }
118: return true;
119: }
120:
121: if (curi.containsKey(A_HTTP_TRANSACTION) == false) {
122: // Missing header info, can't do anything.
123: if (logger.isLoggable(Level.INFO)) {
124: logger
125: .info("Error: Missing HttpMethod object in CrawlURI. "
126: + curi.toString());
127: }
128: return true;
129: }
130:
131: // Intially assume header info is missing
132: int datestamp = HEADER_PREDICTS_MISSING;
133: int etag = HEADER_PREDICTS_MISSING;
134: HttpMethod method = (HttpMethod) curi
135: .getObject(A_HTTP_TRANSACTION);
136:
137: // Compare datestamps (last-modified)
138: String newDatestamp = null;
139: if (method.getResponseHeader("last-modified") != null) {
140: newDatestamp = method.getResponseHeader("last-modified")
141: .getValue();
142: }
143:
144: if (newDatestamp != null && newDatestamp.length() > 0) {
145: datestamp = HEADER_PREDICTS_CHANGED; // Not missing, assume change
146: if (curi.containsKey(A_LAST_DATESTAMP)) {
147: if (newDatestamp.equals(curi
148: .getString(A_LAST_DATESTAMP))) {
149: // Both new and old are present and equal, datestamp
150: // predicts no change
151: datestamp = HEADER_PREDICTS_UNCHANGED;
152: }
153: }
154: curi.putString(A_LAST_DATESTAMP, newDatestamp);
155: }
156:
157: // Compare ETags
158: String newETag = null;
159: if (method.getResponseHeader("last-etag") != null) {
160: newETag = method.getResponseHeader("last-etag").getValue();
161: }
162:
163: if (newETag != null && newETag.length() > 0) {
164: etag = HEADER_PREDICTS_CHANGED; // Not missing, assume change
165: if (curi.containsKey(A_LAST_ETAG)) {
166: if (newETag.equals(curi.getString(A_LAST_ETAG))) {
167: // Both new and old are present and equal, etag
168: // predicts no change
169: etag = HEADER_PREDICTS_UNCHANGED;
170: }
171: }
172: curi.putString(A_LAST_ETAG, newETag);
173: }
174:
175: // If both are present, predict no change only if both agree
176: if (datestamp == HEADER_PREDICTS_UNCHANGED
177: && etag == HEADER_PREDICTS_UNCHANGED) {
178: // Have both and they agree, no change
179: curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
180: return false;
181: }
182: // If one or the other is missing, trust the one that is present
183: if (datestamp == HEADER_PREDICTS_MISSING
184: && etag == HEADER_PREDICTS_UNCHANGED) {
185: // Only have etag, and it predicts no change
186: curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
187: return false;
188: }
189: if (datestamp == HEADER_PREDICTS_UNCHANGED
190: && etag == HEADER_PREDICTS_MISSING) {
191: // Only have last-modified, and it predicts no change
192: curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
193: return false;
194: }
195: return true; // Default, assume change.
196: }
197: }
|