001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Created on Jul 11, 2003
020: *
021: */
022: package org.archive.crawler.extractor;
023:
024: import java.io.File;
025: import java.io.IOException;
026: import java.util.ArrayList;
027: import java.util.Iterator;
028: import java.util.logging.Logger;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.datamodel.CoreAttributeConstants;
032: import org.archive.crawler.datamodel.CrawlURI;
033: import org.archive.crawler.framework.ToeThread;
034:
035: /** Allows the caller to process a CrawlURI representing a PDF
036: * for the purpose of extracting URIs
037: *
038: * @author Parker Thompson
039: *
040: */
041: public class ExtractorPDF extends Extractor implements
042: CoreAttributeConstants {
043:
044: private static final long serialVersionUID = -6040669467531928494L;
045:
046: private static final Logger LOGGER = Logger
047: .getLogger(ExtractorPDF.class.getName());
048: private static int DEFAULT_MAX_SIZE_TO_PARSE = 5 * 1024 * 1024; // 5MB
049:
050: // TODO: make configurable
051: private long maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;
052:
053: protected long numberOfCURIsHandled = 0;
054: protected long numberOfLinksExtracted = 0;
055:
056: /**
057: * @param name
058: */
059: public ExtractorPDF(String name) {
060: super (name, "PDF extractor. Link extraction on PDF documents.");
061: }
062:
063: protected void extract(CrawlURI curi) {
064: if (!isHttpTransactionContentToProcess(curi)
065: || !isExpectedMimeType(curi.getContentType(),
066: "application/pdf")) {
067: return;
068: }
069:
070: numberOfCURIsHandled++;
071:
072: File tempFile;
073:
074: if (curi.getHttpRecorder().getRecordedInput().getSize() > maxSizeToParse) {
075: return;
076: }
077:
078: int sn = ((ToeThread) Thread.currentThread()).getSerialNumber();
079: tempFile = new File(getController().getScratchDisk(), "tt" + sn
080: + "tmp.pdf");
081:
082: PDFParser parser;
083: ArrayList uris;
084: try {
085: curi.getHttpRecorder().getRecordedInput()
086: .copyContentBodyTo(tempFile);
087: parser = new PDFParser(tempFile.getAbsolutePath());
088: uris = parser.extractURIs();
089: } catch (IOException e) {
090: curi.addLocalizedError(getName(), e,
091: "ExtractorPDF IOException");
092: return;
093: } catch (RuntimeException e) {
094: // Truncated/corrupt PDFs may generate ClassCast exceptions, or
095: // other problems
096: curi.addLocalizedError(getName(), e,
097: "ExtractorPDF RuntimeException");
098: return;
099: } finally {
100: tempFile.delete();
101: }
102:
103: if (uris != null && uris.size() > 0) {
104: Iterator iter = uris.iterator();
105: while (iter.hasNext()) {
106: String uri = (String) iter.next();
107: try {
108: curi.createAndAddLink(uri, Link.NAVLINK_MISC,
109: Link.NAVLINK_HOP);
110: } catch (URIException e1) {
111: // There may not be a controller (e.g. If we're being run
112: // by the extractor tool).
113: if (getController() != null) {
114: getController().logUriError(e1, curi.getUURI(),
115: uri);
116: } else {
117: LOGGER.info(curi + ", " + uri + ": "
118: + e1.getMessage());
119: }
120: }
121: }
122: numberOfLinksExtracted += uris.size();
123: }
124:
125: LOGGER.fine(curi + " has " + uris.size() + " links.");
126: // Set flag to indicate that link extraction is completed.
127: curi.linkExtractorFinished();
128: }
129:
130: /**
131: * Provide a human-readable textual summary of this Processor's state.
132: *
133: * @see org.archive.crawler.framework.Processor#report()
134: */
135: public String report() {
136: StringBuffer ret = new StringBuffer();
137: ret
138: .append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
139: ret
140: .append(" Function: Link extraction on PDF documents\n");
141: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
142: + "\n");
143: ret.append(" Links extracted: " + numberOfLinksExtracted
144: + "\n\n");
145:
146: return ret.toString();
147: }
148: }
|