/* Copyright (C) 2007 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Mar 5, 2007
 *
 */
package org.archive.crawler.extractor;

import java.util.logging.Logger;

import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;

/**
 * Pseudo-extractor that suppresses link extraction from likely trap pages
 * by noticing when a URI's content digest is identical to that of its
 * 'via' (the URI from which it was discovered). Identical content along a
 * chain of discovery is a common symptom of a trap serving the same page
 * under ever-new URIs. (A self-contained sketch of the underlying idea
 * appears at the end of this file.)
 *
 * @author gojomo
 */
public class TrapSuppressExtractor extends Extractor
        implements CoreAttributeConstants {
    private static final long serialVersionUID = -1028783453022579530L;

    private static final Logger LOGGER =
        Logger.getLogger(TrapSuppressExtractor.class.getName());

    /** AList attribute key for carrying forward the content digest from the 'via'. */
    public static final String A_VIA_DIGEST = "via-digest";

    /** Count of CrawlURIs handled by this processor. */
    protected long numberOfCURIsHandled = 0;
    /** Count of CrawlURIs whose link extraction was suppressed. */
    protected long numberOfCURIsSuppressed = 0;

    /**
     * Usual constructor.
     * @param name Name of this processor.
     */
    public TrapSuppressExtractor(String name) {
        super(name, "TrapSuppressExtractor. Prevent extraction of likely "
                + "trap content.");
    }

    @Override
    protected void initialTasks() {
        super.initialTasks();
    }

    @Override
    protected void extract(CrawlURI curi) {
        numberOfCURIsHandled++;

        String currentDigest = curi.getContentDigestSchemeString();
        String viaDigest = null;
        if (curi.containsKey(A_VIA_DIGEST)) {
            viaDigest = curi.getString(A_VIA_DIGEST);
        }

        if (currentDigest != null) {
            if (currentDigest.equals(viaDigest)) {
                // mark as already-extracted, suppressing further extraction
                curi.linkExtractorFinished();
                curi.addAnnotation("trapSuppressExtractor");
                numberOfCURIsSuppressed++;
            }
            // the 'via' digest has been consulted, so clobber it with the
            // current digest; making it heritable lets URIs discovered from
            // this one compare themselves against this content
            curi.putString(A_VIA_DIGEST, currentDigest);
            curi.makeHeritable(A_VIA_DIGEST);
        }
    }

    /**
     * Provide a human-readable textual summary of this Processor's state.
     *
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.TrapSuppressExtractor\n");
        ret.append("  Function:             Suppress extraction on likely traps\n");
        ret.append("  CrawlURIs handled:    " + numberOfCURIsHandled + "\n");
        ret.append("  CrawlURIs suppressed: " + numberOfCURIsSuppressed + "\n\n");
        return ret.toString();
    }
}
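
/*
 * Illustrative sketch, not part of the original Heritrix source: a
 * self-contained model of the carry-forward logic above, with a plain Map
 * standing in for a CrawlURI's heritable AList data. The key name mirrors
 * A_VIA_DIGEST; the class and method names are hypothetical and exist only
 * for demonstration.
 */
class TrapSuppressSketch {
    /**
     * Returns true if link extraction should be suppressed (the current
     * digest matches the digest inherited from the 'via'), and records the
     * current digest into the map that discovered URIs would inherit.
     */
    static boolean processAndInherit(String contentDigest,
            java.util.Map<String, String> heritable) {
        boolean suppress = false;
        if (contentDigest != null) {
            suppress = contentDigest.equals(heritable.get("via-digest"));
            // clobber with the current digest so children compare against us
            heritable.put("via-digest", contentDigest);
        }
        return suppress;
    }

    public static void main(String[] args) {
        java.util.Map<String, String> attrs =
            new java.util.HashMap<String, String>();
        System.out.println(processAndInherit("sha1:AAA", attrs)); // false: no 'via' digest yet
        System.out.println(processAndInherit("sha1:AAA", attrs)); // true: same content as 'via'
        System.out.println(processAndInherit("sha1:BBB", attrs)); // false: content changed
    }
}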