/*
 * ExtractorXML
 *
 * $Id: ExtractorXML.java 4653 2006-09-25 18:58:50Z paul_jack $
 *
 * Created on Sep 27, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.util.TextUtils;

/**
 * A simple extractor which finds HTTP URIs inside XML/RSS files, both
 * in attribute values and in simple elements (those whose content is
 * only whitespace + HTTP URI + whitespace).
 *
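 * <p>
 * For example, given a fragment like the following (an illustrative
 * sample, not taken from any particular feed), the extractor would
 * find both <code>http://example.com/feed</code> and
 * <code>http://example.com/a.mp3</code>:
 * <pre>
 * &lt;link&gt;http://example.com/feed&lt;/link&gt;
 * &lt;enclosure url="http://example.com/a.mp3" type="audio/mpeg"/&gt;
 * </pre>
 *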
 * @author gojomo
 *
 **/

public class ExtractorXML extends Extractor
        implements CoreAttributeConstants {

    private static final long serialVersionUID = 3101230586822401584L;

    private static Logger logger =
        Logger.getLogger(ExtractorXML.class.getName());

    // The XML-escaped form of ampersand, as it appears in extracted URIs.
    private static String ESCAPED_AMP = "&amp;";

    static final String XML_URI_EXTRACTOR =
        "(?i)[\"\'>]\\s*(http:[^\\s\"\'<>]+)\\s*[\"\'<]";
    // GROUPS:
    // (G1) URI
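    // For instance, in the text <link>http://example.com/feed</link>
    // the pattern matches >http://example.com/feed< and captures
    // http://example.com/feed as group 1. (Illustrative example only;
    // any http URI delimited by quotes or tag brackets is matched.)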

    private long numberOfCURIsHandled = 0;
    private long numberOfLinksExtracted = 0;

    /**
     * @param name Name of this extractor.
     */
    public ExtractorXML(String name) {
        super(name, "XML Extractor. Extracts links from XML/RSS.");
    }

    /**
     * Processes the passed CrawlURI if its content appears to be XML
     * (by MIME type, or by a .rss/.xml URI suffix), extracting any
     * HTTP URIs found within.
     *
     * @param curi Crawl URI to process.
     */
    public void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String mimeType = curi.getContentType();
        if (mimeType == null) {
            return;
        }
        // Only process content whose type mentions "xml", or whose
        // URI ends in .rss or .xml.
        if ((mimeType.toLowerCase().indexOf("xml") < 0)
                && (!curi.toString().toLowerCase().endsWith(".rss"))
                && (!curi.toString().toLowerCase().endsWith(".xml"))) {
            return;
        }
        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            logger.severe("Failed getting ReplayCharSequence: "
                + e.getMessage());
        }
        if (cs == null) {
            logger.severe("Failed getting ReplayCharSequence: "
                + curi.toString());
            return;
        }
        try {
            this.numberOfLinksExtracted += processXml(curi, cs,
                getController());
            // Set flag to indicate that link extraction is completed.
            curi.linkExtractorFinished();
        } finally {
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    logger.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }

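    /**
     * Scans the passed character sequence for HTTP URIs, adding each
     * one found to the passed CrawlURI as a speculative link.
     *
     * @param curi CrawlURI to which discovered links are added.
     * @param cs XML content to scan.
     * @param controller controller to log URI errors against; may be
     * null (e.g. when run by the standalone extractor tool).
     * @return number of links extracted.
     */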
    public static long processXml(CrawlURI curi, CharSequence cs,
            CrawlController controller) {
        long foundLinks = 0;
        Matcher uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs);
        while (uris.find()) {
            String xmlUri = uris.group(1);
            // TODO: Unescape more XML entities; only &amp; is handled.
            xmlUri = TextUtils.replaceAll(ESCAPED_AMP, xmlUri, "&");
            foundLinks++;
            try {
                // Treat as speculative, as whether the context really
                // intends to create a followable/fetchable URI is
                // unknown.
                curi.createAndAddLink(xmlUri, Link.SPECULATIVE_MISC,
                    Link.SPECULATIVE_HOP);
            } catch (URIException e) {
                // There may not be a controller (e.g. if we're being
                // run by the extractor tool).
                if (controller != null) {
                    controller.logUriError(e, curi.getUURI(), xmlUri);
                } else {
                    logger.info(curi + ", " + xmlUri + ": "
                        + e.getMessage());
                }
            }
        }
        TextUtils.recycleMatcher(uris);
        return foundLinks;
    }

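    /**
     * @return a report of this extractor's activity, in the usual
     * processor-report format.
     */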
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
        ret.append("  Function:          Link extraction on XML/RSS\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}
|