/* HTTPContentDigest
 *
 * $Id: HTTPContentDigest.java 4654 2006-09-25 20:19:54Z paul_jack $
 *
 * Created on 5.1.2005
 *
 * Copyright (C) 2005 Kristinn Sigurðsson
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.extractor;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayCharSequence;
import org.archive.util.Base32;
import org.archive.util.TextUtils;

/**
 * A processor for calculating custom HTTP content digests in place of the
 * default (if any) computed by the HTTP fetcher processors.
 * <p>
 * This processor allows the user to specify a regular expression called
 * <i>strip-reg-expr</i>. Any segment of a document (text only; binary files
 * will be skipped) that matches this regular expression will be rewritten
 * with the blank character (character 32 in the ANSI character set) <b>for
 * the purpose of the digest</b>. This has no effect on the document for
 * subsequent processing or archiving.
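 * <p>
 * For example (an illustrative expression, not a shipped default), setting
 * <i>strip-reg-expr</i> to <code>&lt;!--.*?--&gt;</code> would blank out HTML
 * comments, so two fetches of a page that differ only in a generated comment
 * such as <code>&lt;!-- rendered 5.1.2005 --&gt;</code> yield the same digest.
 * A minimal sketch of the stripping step, using the same Matcher-based
 * replacement this processor performs (<code>doc</code> stands for the
 * document's character sequence):
 * <pre>
 * Matcher m = TextUtils.getMatcher("&lt;!--.*?--&gt;", doc);
 * String stripped = m.replaceAll(" "); // each matched segment becomes one blank
 * TextUtils.recycleMatcher(m);
 * </pre>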
 * <p>
 * NOTE: The content digest only accounts for the document body, not headers.
 * <p>
 * The operator will also be able to specify a maximum length for documents
 * being evaluated by this processor. Documents exceeding that length will be
 * ignored.
 * <p>
 * To further discriminate by file type or URL, an operator should use the
 * override and refinement options.
 * <p>
 * It is generally recommended that this recalculation only be performed when
 * absolutely needed (because of stripping data that changes automatically
 * each time the URL is fetched) as this is an expensive operation.
 *
 * @author Kristinn Sigurdsson
 */
public class HTTPContentDigest extends Processor {

    private static final long serialVersionUID = 8055532198737384358L;

    private static Logger logger =
        Logger.getLogger(HTTPContentDigest.class.getName());

    /** A regular expression detailing elements to strip before making digest */
    public final static String ATTR_STRIP_REG_EXPR = "strip-reg-expr";
    protected final static String DEFAULT_STRIP_REG_EXPR = "";
    /** Maximum file size - longer files will be ignored. -1 = unlimited. */
    public final static String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
    protected final static Long DEFAULT_MAX_SIZE_BYTES = new Long(1048576); // 1 megabyte

    private static final String SHA1 = "SHA1";

    /**
     * Constructor.
     * @param name Processor name
     */
    public HTTPContentDigest(String name) {
        super(name,
            "Calculate custom - stripped - content digests. "
            + "A processor for calculating custom HTTP content digests "
            + "in place of the default (if any) computed by the HTTP "
            + "fetcher processors. "
            + "This processor enables you to specify a regular expression "
            + "called strip-reg-expr. Any segment of a document (text "
            + "only, binary files will be skipped) that matches this "
            + "regular expression will be rewritten with the blank "
            + "character (character 32 in the ANSI character set) FOR THE "
            + "PURPOSE OF THE DIGEST; this has no effect on the document "
            + "for subsequent processing or archiving. You can also "
            + "specify a maximum length for documents being evaluated by "
            + "this processor. Documents exceeding that length will be "
            + "ignored. "
            + "To further discriminate by file type or URL, you should use "
            + "the override and refinement options (the processor can be "
            + "disabled by default and only enabled as needed in overrides "
            + "and refinements). "
            + "It is generally recommended that this recalculation only be "
            + "performed when absolutely needed (because of stripping data "
            + "that changes automatically each time the URL is fetched) as "
            + "this is an expensive operation.");

        addElementToDefinition(new SimpleType(
            ATTR_STRIP_REG_EXPR,
            "A regular expression that matches those portions of "
            + "downloaded documents that need to be ignored when "
            + "calculating the content digest. "
            + "Segments matching this expression will be rewritten with "
            + "the blank character for the content digest.",
            DEFAULT_STRIP_REG_EXPR));
        addElementToDefinition(new SimpleType(
            ATTR_MAX_SIZE_BYTES,
            "Maximum size of documents to recalculate the digest for. "
            + "Documents that exceed this value (bytes) will be ignored. "
            + "Defaults to 1048576 bytes, or 1 MB. "
            + "-1 denotes unlimited size. A setting of 0 will effectively "
            + "disable the processor.",
            DEFAULT_MAX_SIZE_BYTES));
    }
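
    // Illustrative settings only (the regular expression below is an example,
    // not a shipped default; 1048576 is the documented default maximum): an
    // operator wanting stable digests for pages that embed a render timestamp
    // in an HTML comment might, in an override or refinement, set
    //     strip-reg-expr = <!--.*?-->
    //     max-size-bytes = 1048576
    // so comments are blanked before digesting, while documents over 1 MB
    // keep the digest computed by the fetcher.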

    /**
     * Recalculates the content digest for text documents fetched over HTTP,
     * first blanking any segments that match the strip-reg-expr setting.
     */
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        if (!curi.isHttpTransaction()) {
            // Only handles HTTP documents.
            return;
        }
        if (!TextUtils.matches("^text.*$", curi.getContentType())) {
            // Only handles text-based documents.
            return;
        }
        long maxsize = DEFAULT_MAX_SIZE_BYTES.longValue();
        try {
            maxsize = ((Long) getAttribute(curi, ATTR_MAX_SIZE_BYTES)).longValue();
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing max-size-bytes attribute when processing "
                + curi.toString());
        }
        if (maxsize < curi.getContentSize() && maxsize > -1) {
            // Document too big.
            return;
        }

        // Ok, if we got this far we need to calculate the content digest.
        // Get the regular expression.
        String regexpr = "";
        try {
            regexpr = (String) getAttribute(curi, ATTR_STRIP_REG_EXPR);
        } catch (AttributeNotFoundException e2) {
            logger.severe("Missing strip-reg-expr attribute when processing "
                + curi.toString());
            return; // Can't do anything without it.
        }

        // Get a replay of the document character sequence.
        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (Exception e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence "
                + curi.toString() + " " + e.getMessage());
            logger.warning("Failed get of replay char sequence "
                + curi.toString() + " " + e.getMessage() + " "
                + Thread.currentThread().getName());
            return; // Can't proceed if this happens.
        }

        // Create a MessageDigest.
        MessageDigest digest = null;

        // We have a ReplayCharSequence open. Wrap all in finally so we
        // are sure to close it before we leave.
        try {
            try {
                digest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e1) {
                // Should never happen: SHA1 is a standard algorithm.
                logger.severe("SHA1 algorithm unavailable: " + e1.getMessage());
                return;
            }

            digest.reset();

            String s = null;
            if (regexpr.length() == 0) {
                s = cs.toString();
            } else {
                // Blank out the segments matching the strip expression;
                // each matched segment becomes a single blank character.
                Matcher m = TextUtils.getMatcher(regexpr, cs);
                s = m.replaceAll(" ");
                TextUtils.recycleMatcher(m);
            }
            // Note: getBytes() uses the platform's default charset.
            digest.update(s.getBytes());

            // Get the new digest value.
            byte[] newDigestValue = digest.digest();

            // Log if needed.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Recalculated content digest for "
                    + curi.toString() + " old: "
                    + Base32.encode((byte[]) curi.getContentDigest())
                    + ", new: " + Base32.encode(newDigestValue));
            }
            // Save the new digest value.
            curi.setContentDigest(SHA1, newDigestValue);
        } finally {
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    logger.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }
}