/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 7, 2003
 *
 */
package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.util.ms.Doc;
/**
 * Extracts hyperlinks from Word97-format (.doc) documents. Only explicit
 * HYPERLINK fields are extracted; the document text is not scanned for
 * bare URIs.
 *
 * @author Parker Thompson
 */
public class ExtractorDOC extends Extractor implements
        CoreAttributeConstants {

    private static final long serialVersionUID = 1896822554981116303L;

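    /**
     * Matches the instruction text of a Word HYPERLINK field, which has
     * the general form <code>HYPERLINK "target"</code> (optionally with
     * switches between the keyword and the quoted target); the single
     * capture group holds the quoted target.
     */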
    private static final Pattern PATTERN = Pattern
        .compile("HYPERLINK.*?\"(.*?)\"");

    private static final Logger logger =
        Logger.getLogger(ExtractorDOC.class.getName());

    private long numberOfCURIsHandled = 0;
    private long numberOfLinksExtracted = 0;

    /**
     * @param name Name of this extractor processor.
     */
    public ExtractorDOC(String name) {
        super(name, "MS-Word document Extractor. Extracts links from MS-Word"
            + " '.doc' documents.");
    }

    /**
     * Processes a Word document and extracts any hyperlinks from it.
     * Only explicit HYPERLINK fields are extracted; the document text is
     * not examined for bare URIs.
     * @param curi CrawlURI to process.
     */
    protected void extract(CrawlURI curi) {
        // Assumes docs will be coming in through http.
        // TODO make this more general (currently we're only fetching via http
        // so it doesn't matter)
        if (!isHttpTransactionContentToProcess(curi)
                || !isExpectedMimeType(curi.getContentType(),
                    "application/msword")) {
            return;
        }

        int links = 0;
        ReplayInputStream documentStream = null;
        SeekReader docReader = null;

        numberOfCURIsHandled++;

        // Get the doc as a repositionable reader.
        try {
            documentStream = curi.getHttpRecorder().getRecordedInput()
                .getContentReplayInputStream();

            if (documentStream == null) {
                // TODO: note problem
                return;
            }

            docReader = Doc.getText(documentStream);
        } catch (Exception e) {
            curi.addLocalizedError(getName(), e, "ExtractorDOC Exception");
            return;
        } finally {
            // documentStream may still be null here (open failed or returned
            // null above); guard before closing to avoid a
            // NullPointerException.
            if (documentStream != null) {
                try {
                    documentStream.close();
                } catch (IOException ignored) {
                }
            }
        }

        CharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher(cs);
        while (m.find()) {
            links++;
            addLink(curi, m.group(1));
        }

        // Set flag to indicate that link extraction is complete.
        curi.linkExtractorFinished();
        logger.fine(curi + " has " + links + " links.");
    }
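
    /*
     * A minimal standalone sketch of the matching step above, using only
     * java.util.regex; the sample field text is hypothetical, not drawn
     * from a real document:
     *
     *   Matcher m = PATTERN.matcher("HYPERLINK \"http://example.com/a.html\"");
     *   if (m.find()) {
     *       System.out.println(m.group(1)); // prints http://example.com/a.html
     *   }
     */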

    private void addLink(CrawlURI curi, String hyperlink) {
        try {
            curi.createAndAddLink(hyperlink, Link.NAVLINK_MISC,
                Link.NAVLINK_HOP);
        } catch (URIException e1) {
            // Controller can be null: e.g. when running ExtractorTool.
            if (getController() != null) {
                getController().logUriError(e1, curi.getUURI(), hyperlink);
            } else {
                logger.info(curi + ", " + hyperlink + ": " + e1.getMessage());
            }
        }
        numberOfLinksExtracted++;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
        ret.append("  Function:          Link extraction on MS Word" +
            " documents (.doc)\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
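
    /*
     * Sample report() output (the counts are illustrative only):
     *
     *   Processor: org.archive.crawler.extractor.ExtractorDOC
     *     Function:          Link extraction on MS Word documents (.doc)
     *     CrawlURIs handled: 2
     *     Links extracted:   11
     */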
}
|