001: /* ChangeEvaluator
002: *
003: * $Id: ChangeEvaluator.java 4654 2006-09-25 20:19:54Z paul_jack $
004: *
005: * Created on 11.11.2004
006: *
007: * Copyright (C) 2004 Kristinn Sigurdsson.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.extractor;
026:
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import org.archive.crawler.datamodel.CrawlURI;
031: import org.archive.crawler.framework.Processor;
032: import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
033: import org.archive.util.Base32;
034:
035: /**
036: * This processor compares the CrawlURI's current
037: * {@link org.archive.crawler.datamodel.CrawlURI#getContentDigest() content digest}
038: * with the one from a previous crawl. If they are equal, then further
039: * processing is skipped (going straight to the post processor chain) and the
040: * CrawlURI is marked appropriately.
041: *
042: * @author Kristinn Sigurdsson
043: */
044: public class ChangeEvaluator extends Processor implements
045: AdaptiveRevisitAttributeConstants {
046:
047: private static final long serialVersionUID = 5547590621493534632L;
048: private static final Logger logger = Logger
049: .getLogger(ChangeEvaluator.class.getName());
050:
051: /**
052: * Constructor
053: * @param name The name of the module
054: */
055: public ChangeEvaluator(String name) {
056: super (
057: name,
058: "Compares CrawlURI's current "
059: + "content digest with digest from previous crawl. If "
060: + "equal, further processing is skipped (going "
061: + "straight to the post processor chain) and the CrawlURI is "
062: + "marked appropriately. Should be located at the start of "
063: + "the Extractor chain.");
064:
065: // Register persistent CrawlURI items
066: CrawlURI.addAlistPersistentMember(A_LAST_CONTENT_DIGEST);
067: CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VISITS);
068: CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VERSIONS);
069: }
070:
071: protected void innerProcess(CrawlURI curi)
072: throws InterruptedException {
073: if (curi.isSuccess() == false) {
074: // Early return. No point in doing comparison on failed downloads.
075: if (logger.isLoggable(Level.FINEST)) {
076: logger.finest("Not handling " + curi.toString()
077: + ", did not " + "succeed.");
078: }
079: return;
080: }
081:
082: // If a mid fetch filter aborts the HTTP fetch because the headers
083: // predict no change, we can skip the whole comparing hashes.
084: if (!curi.containsKey(A_CONTENT_STATE_KEY)
085: || curi.getInt(A_CONTENT_STATE_KEY) != CONTENT_UNCHANGED) {
086: String currentDigest = null;
087: Object digest = curi.getContentDigest();
088: if (digest != null) {
089: currentDigest = Base32.encode((byte[]) digest);
090: }
091:
092: String oldDigest = null;
093: if (curi.containsKey(A_LAST_CONTENT_DIGEST)) {
094: oldDigest = curi.getString(A_LAST_CONTENT_DIGEST);
095: }
096:
097: // Compare the String representation of the byte arrays.
098: if (currentDigest == null && oldDigest == null) {
099: // Both are null, can't do a thing
100: if (logger.isLoggable(Level.FINER)) {
101: logger.finer("On " + curi.toString()
102: + " both digest are null");
103: }
104: // NOTE! RETURN!
105: return;
106: }
107:
108: if (currentDigest != null && oldDigest != null
109: && currentDigest.equals(oldDigest)) {
110: // If equal, we have just downloaded a duplicate.
111: if (logger.isLoggable(Level.FINER)) {
112: logger.finer("On " + curi.toString()
113: + " both digest are " + "equal. Old: "
114: + oldDigest + ", new: " + currentDigest);
115: }
116: curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
117: // TODO: In the future processors should take note of the content
118: // state, removing the need for the following 'skip'
119: curi.skipToProcessorChain(getController()
120: .getPostprocessorChain());
121: // Make not in log
122: curi.addAnnotation("unchanged");
123: // Set content size to zero, we are not going to 'write it to disk'
124: curi.setContentSize(0);
125: } else {
126: // Document has changed
127: if (logger.isLoggable(Level.FINER)) {
128: logger.finer("On "
129: + curi.toString()
130: + " digest are not "
131: + "equal. Old: "
132: + (oldDigest == null ? "null" : oldDigest)
133: + ", new: "
134: + (currentDigest == null ? "null"
135: : currentDigest));
136: }
137: // currentDigest may be null, that probably means a failed download
138: curi.putInt(A_CONTENT_STATE_KEY, CONTENT_CHANGED);
139: curi.putString(A_LAST_CONTENT_DIGEST, currentDigest);
140: }
141: } else {
142: if (logger.isLoggable(Level.FINER)) {
143: logger.finer("On " + curi.toString()
144: + " content state was "
145: + "already set as UNCHANGED.");
146: }
147: }
148:
149: // Update visit and version counters
150: int visits = 1;
151: if (curi.containsKey(A_NUMBER_OF_VISITS)) {
152: visits = curi.getInt(A_NUMBER_OF_VISITS) + 1;
153: }
154: curi.putInt(A_NUMBER_OF_VISITS, visits);
155:
156: // Update versions.
157: if (curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
158: int versions = 1;
159: if (curi.containsKey(A_NUMBER_OF_VERSIONS)) {
160: versions = curi.getInt(A_NUMBER_OF_VERSIONS) + 1;
161: }
162: curi.putInt(A_NUMBER_OF_VERSIONS, versions);
163: }
164: }
165: }
|