/* HTTPContentDigest
 *
 * $Id: HTTPContentDigest.java 4654 2006-09-25 20:19:54Z paul_jack $
 *
 * Created on 5.1.2005
 *
 * Copyright (C) 2005 Kristinn Sigurðsson
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.extractor;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayCharSequence;
import org.archive.util.Base32;
import org.archive.util.TextUtils;

/**
 * A processor for calculating custom HTTP content digests in place of the
 * default (if any) computed by the HTTP fetcher processors.
 * <p>
 * This processor allows the user to specify a regular expression called
 * <i>strip-reg-expr</i>. Any segment of a document (text only; binary files
 * will be skipped) that matches this regular expression will be rewritten
 * with the blank character (character 32 in the ANSI character set) <b>for
 * the purpose of the digest</b>. This has no effect on the document for
 * subsequent processing or archiving.
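 * <p>
 * For example (an illustrative expression, not a shipped default), setting
 * <i>strip-reg-expr</i> to <code>&lt;!--.*?--&gt;</code> would blank out HTML
 * comments, so two fetches of a page that differ only in a generated comment
 * such as <code>&lt;!-- rendered 5.1.2005 --&gt;</code> yield the same digest.
 * A minimal sketch of the stripping step, using the same Matcher-based
 * replacement this processor performs (<code>doc</code> stands for the
 * document's character sequence):
 * <pre>
 * Matcher m = TextUtils.getMatcher("&lt;!--.*?--&gt;", doc);
 * String stripped = m.replaceAll(" "); // each matched segment becomes one blank
 * TextUtils.recycleMatcher(m);
 * </pre>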
 * <p>
 * NOTE: The content digest only accounts for the document body, not headers.
 * <p>
 * The operator will also be able to specify a maximum length for documents
 * being evaluated by this processor. Documents exceeding that length will be
 * ignored.
 * <p>
 * To further discriminate by file type or URL, an operator should use the
 * override and refinement options.
 * <p>
 * It is generally recommended that this recalculation only be performed when
 * absolutely needed (because of stripping data that changes automatically
 * each time the URL is fetched) as this is an expensive operation.
 *
 * @author Kristinn Sigurdsson
 */
public class HTTPContentDigest extends Processor {

    private static final long serialVersionUID = 8055532198737384358L;

    private static Logger logger =
        Logger.getLogger(HTTPContentDigest.class.getName());

    /** A regular expression detailing elements to strip before making digest */
    public final static String ATTR_STRIP_REG_EXPR = "strip-reg-expr";
    protected final static String DEFAULT_STRIP_REG_EXPR = "";
    /** Maximum file size - longer files will be ignored. -1 = unlimited. */
    public final static String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
    protected final static Long DEFAULT_MAX_SIZE_BYTES = new Long(1048576); // 1 megabyte

    private static final String SHA1 = "SHA1";

    /**
     * Constructor.
     * @param name Processor name
     */
    public HTTPContentDigest(String name) {
        super(name,
            "Calculate custom - stripped - content digests. "
            + "A processor for calculating custom HTTP content digests "
            + "in place of the default (if any) computed by the HTTP "
            + "fetcher processors. "
            + "This processor enables you to specify a regular expression "
            + "called strip-reg-expr. Any segment of a document (text "
            + "only, binary files will be skipped) that matches this "
            + "regular expression will be rewritten with the blank "
            + "character (character 32 in the ANSI character set) FOR THE "
            + "PURPOSE OF THE DIGEST; this has no effect on the document "
            + "for subsequent processing or archiving. You can also "
            + "specify a maximum length for documents being evaluated by "
            + "this processor. Documents exceeding that length will be "
            + "ignored. "
            + "To further discriminate by file type or URL, you should use "
            + "the override and refinement options (the processor can be "
            + "disabled by default and only enabled as needed in overrides "
            + "and refinements). "
            + "It is generally recommended that this recalculation only be "
            + "performed when absolutely needed (because of stripping data "
            + "that changes automatically each time the URL is fetched) as "
            + "this is an expensive operation.");

        addElementToDefinition(new SimpleType(
            ATTR_STRIP_REG_EXPR,
            "A regular expression that matches those portions of "
            + "downloaded documents that need to be ignored when "
            + "calculating the content digest. "
            + "Segments matching this expression will be rewritten with "
            + "the blank character for the content digest.",
            DEFAULT_STRIP_REG_EXPR));
        addElementToDefinition(new SimpleType(
            ATTR_MAX_SIZE_BYTES,
            "Maximum size of documents to recalculate the digest for. "
            + "Documents that exceed this value (bytes) will be ignored. "
            + "Defaults to 1048576 bytes, or 1 MB. "
            + "-1 denotes unlimited size. A setting of 0 will effectively "
            + "disable the processor.",
            DEFAULT_MAX_SIZE_BYTES));
    }
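
    // Illustrative settings only (the regular expression below is an example,
    // not a shipped default; 1048576 is the documented default maximum): an
    // operator wanting stable digests for pages that embed a render timestamp
    // in an HTML comment might, in an override or refinement, set
    //     strip-reg-expr = <!--.*?-->
    //     max-size-bytes = 1048576
    // so comments are blanked before digesting, while documents over 1 MB
    // keep the digest computed by the fetcher.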

    /**
     * Recalculates the content digest for text documents fetched over HTTP,
     * first blanking any segments that match the strip-reg-expr setting.
     */
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        if (!curi.isHttpTransaction()) {
            // Only handles HTTP documents.
            return;
        }
        if (!TextUtils.matches("^text.*$", curi.getContentType())) {
            // Only handles text-based documents.
            return;
        }
        long maxsize = DEFAULT_MAX_SIZE_BYTES.longValue();
        try {
            maxsize = ((Long) getAttribute(curi, ATTR_MAX_SIZE_BYTES)).longValue();
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing max-size-bytes attribute when processing "
                + curi.toString());
        }
        if (maxsize < curi.getContentSize() && maxsize > -1) {
            // Document too big.
            return;
        }

        // Ok, if we got this far we need to calculate the content digest.
        // Get the regular expression.
        String regexpr = "";
        try {
            regexpr = (String) getAttribute(curi, ATTR_STRIP_REG_EXPR);
        } catch (AttributeNotFoundException e2) {
            logger.severe("Missing strip-reg-expr attribute when processing "
                + curi.toString());
            return; // Can't do anything without it.
        }

        // Get a replay of the document character sequence.
        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (Exception e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence "
                + curi.toString() + " " + e.getMessage());
            logger.warning("Failed get of replay char sequence "
                + curi.toString() + " " + e.getMessage() + " "
                + Thread.currentThread().getName());
            return; // Can't proceed if this happens.
        }

        // Create a MessageDigest.
        MessageDigest digest = null;

        // We have a ReplayCharSequence open. Wrap all in finally so we
        // are sure to close it before we leave.
        try {
            try {
                digest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e1) {
                // Should never happen: SHA1 is a standard algorithm.
                logger.severe("SHA1 algorithm unavailable: " + e1.getMessage());
                return;
            }

            digest.reset();

            String s = null;
            if (regexpr.length() == 0) {
                s = cs.toString();
            } else {
                // Blank out the segments matching the strip expression;
                // each matched segment becomes a single blank character.
                Matcher m = TextUtils.getMatcher(regexpr, cs);
                s = m.replaceAll(" ");
                TextUtils.recycleMatcher(m);
            }
            // Note: getBytes() uses the platform's default charset.
            digest.update(s.getBytes());

            // Get the new digest value.
            byte[] newDigestValue = digest.digest();

            // Log if needed.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Recalculated content digest for "
                    + curi.toString() + " old: "
                    + Base32.encode((byte[]) curi.getContentDigest())
                    + ", new: " + Base32.encode(newDigestValue));
            }
            // Save the new digest value.
            curi.setContentDigest(SHA1, newDigestValue);
        } finally {
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    logger.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }
}