001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
* ExtractorHTTP.java
020: * Created on Jul 3, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.extractor;
025:
026: import java.util.logging.Logger;
027:
028: import org.apache.commons.httpclient.Header;
029: import org.apache.commons.httpclient.HttpMethod;
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.datamodel.CoreAttributeConstants;
032: import org.archive.crawler.datamodel.CrawlURI;
033: import org.archive.crawler.framework.Processor;
034:
035: /**
036: * Extracts URIs from HTTP response headers.
037: * @author gojomo
038: */
039: public class ExtractorHTTP extends Processor implements
040: CoreAttributeConstants {
041:
042: private static final long serialVersionUID = 8499072198570554647L;
043:
044: private static final Logger LOGGER = Logger
045: .getLogger(ExtractorHTTP.class.getName());
046: protected long numberOfCURIsHandled = 0;
047: protected long numberOfLinksExtracted = 0;
048:
049: public ExtractorHTTP(String name) {
050: super (name,
051: "HTTP extractor. Extracts URIs from HTTP response headers.");
052: }
053:
054: public void innerProcess(CrawlURI curi) {
055: if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) {
056: // If not http or if an error status code, skip.
057: return;
058: }
059: numberOfCURIsHandled++;
060: HttpMethod method = (HttpMethod) curi
061: .getObject(A_HTTP_TRANSACTION);
062: addHeaderLink(curi, method.getResponseHeader("Location"));
063: addHeaderLink(curi, method
064: .getResponseHeader("Content-Location"));
065: }
066:
067: protected void addHeaderLink(CrawlURI curi, Header loc) {
068: if (loc == null) {
069: // If null, return without adding anything.
070: return;
071: }
072: // TODO: consider possibility of multiple headers
073: try {
074: curi.createAndAddLink(loc.getValue(), loc.getName() + ":",
075: Link.REFER_HOP);
076: numberOfLinksExtracted++;
077: } catch (URIException e) {
078: // There may not be a controller (e.g. If we're being run
079: // by the extractor tool).
080: if (getController() != null) {
081: getController().logUriError(e, curi.getUURI(),
082: loc.getValue());
083: } else {
084: LOGGER.info(curi + ", " + loc.getValue() + ": "
085: + e.getMessage());
086: }
087: }
088:
089: }
090:
091: public String report() {
092: StringBuffer ret = new StringBuffer();
093: ret
094: .append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n");
095: ret.append(" Function: "
096: + "Extracts URIs from HTTP response headers\n");
097: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
098: + "\n");
099: ret.append(" Links extracted: " + numberOfLinksExtracted
100: + "\n\n");
101: return ret.toString();
102: }
103: }
|