001: /* IdenticalDigestDecideRule
002: *
003: * $Id: HopsPathMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55 +0000 (Mon, 25 Sep 2006) paul_jack $
004: *
005: * Created on Feb 17, 2007
006: *
007: * Copyright (C) 2007 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules.recrawl;
026:
027: import org.archive.crawler.datamodel.CoreAttributeConstants;
028: import org.archive.crawler.datamodel.CrawlURI;
029: import org.archive.crawler.deciderules.PredicatedDecideRule;
030: import org.archive.crawler.settings.SimpleType;
031: import org.archive.crawler.settings.Type;
032:
033: import st.ata.util.AList;
034:
035: /**
036: * Rule applies configured decision to any CrawlURIs whose prior-history
037: * content-digest matches the latest fetch.
038: *
039: * @author gojomo
040: */
041: public class IdenticalDigestDecideRule extends PredicatedDecideRule
042: implements CoreAttributeConstants {
043: private static final long serialVersionUID = 4275993790856626949L;
044:
045: /**
046: * Usual constructor.
047: * @param name
048: */
049: public IdenticalDigestDecideRule(String name) {
050: super (name);
051: setDescription("IdenticalDigestDecideRule. Applies configured "
052: + "decision to any CrawlURIs whose prior-history "
053: + "content-digest matches the latest fetch.");
054: // make default REJECT (overriding superclass)
055: Type type = addElementToDefinition(new SimpleType(
056: ATTR_DECISION, "Decision to be applied", REJECT,
057: ALLOWED_TYPES));
058: }
059:
060: /**
061: * Evaluate whether given CrawlURI's content-digest exactly
062: * matches that of preceding fetch.
063: *
064: * @param object should be CrawlURI
065: * @return true if current-fetch content-digest matches previous
066: */
067: protected boolean evaluate(Object object) {
068: CrawlURI curi = (CrawlURI) object;
069: return hasIdenticalDigest(curi);
070: }
071:
072: /**
073: * Utility method for testing if a CrawlURI's last two history
074: * entiries (one being the most recent fetch) have identical
075: * content-digest information.
076: *
077: * @param curi CrawlURI to test
078: * @return true if last two history entries have identical digests,
079: * otherwise false
080: */
081: public static boolean hasIdenticalDigest(CrawlURI curi) {
082: if (curi.getAList().containsKey(A_FETCH_HISTORY)) {
083: AList[] history = curi.getAList().getAListArray(
084: A_FETCH_HISTORY);
085: return history[0] != null
086: && history[0]
087: .containsKey(CoreAttributeConstants.A_CONTENT_DIGEST)
088: && history[1] != null
089: && history[1]
090: .containsKey(CoreAttributeConstants.A_CONTENT_DIGEST)
091: && history[0]
092: .getString(
093: CoreAttributeConstants.A_CONTENT_DIGEST)
094: .equals(
095: history[1]
096: .getString(CoreAttributeConstants.A_CONTENT_DIGEST));
097: } else {
098: return false;
099: }
100: }
101: }
|