/*
 * ExtractorURI
 *
 * $Id: ExtractorURI.java 4671 2006-09-26 23:47:15Z paul_jack $
 *
 * Created on July 20, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.extractor;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.net.LaxURLCodec;
import org.archive.net.UURI;
import org.archive.util.TextUtils;

/**
 * An extractor for finding URIs inside other URIs. Unlike most other
 * extractors, this one works on URIs discovered by previous extractors,
 * so it should appear near the end of any chain of extractors.
 *
 * Currently it only finds absolute HTTP(S) URIs in the query string or
 * its parameters.
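 *
 * For example (illustrative, hypothetical URI): a discovered link to
 * http://www.example.com/go?u=http%3A%2F%2Fwww.example.net%2F
 * yields the decoded parameter value http://www.example.net/, which is
 * then added to the CrawlURI as a speculative outlink.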
 *
 * TODO: extend to find URIs in path-info
 *
 * @author Gordon Mohr
 */
public class ExtractorURI extends Extractor implements
        CoreAttributeConstants {

    private static final long serialVersionUID = -6273897743240970822L;

    private static final Logger LOGGER =
        Logger.getLogger(ExtractorURI.class.getName());

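    // Matches a string that is entirely an absolute http(s) URI: the
    // scheme, then any run of characters containing no whitespace or
    // angle brackets (taken here as signs of surrounding junk or markup).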
    static final String ABS_HTTP_URI_PATTERN = "^https?://[^\\s<>]*$";

    // AtomicLongs so that concurrent ToeThreads increment these safely
    private final AtomicLong numberOfCURIsHandled = new AtomicLong();
    private final AtomicLong numberOfLinksExtracted = new AtomicLong();

    /**
     * Constructor.
     *
     * @param name Name of this extractor.
     */
    public ExtractorURI(String name) {
        super(name,
            "URI Extractor. Extracts links inside other "
            + "discovered URIs. Should appear last among extractors.");
    }

    /**
     * Perform usual extraction on a CrawlURI.
     *
     * @param curi Crawl URI to process.
     */
    public void extract(CrawlURI curi) {
        numberOfCURIsHandled.incrementAndGet();
        // iterate over an array copy, because new discoveries will be
        // added to the outlinks collection as we go
        Collection<Link> links = curi.getOutLinks();
        Link[] sourceLinks = links.toArray(new Link[links.size()]);
        for (Link wref : sourceLinks) {
            extractLink(curi, wref);
        }
    }

    /**
     * Consider a single Link for internal URIs.
     *
     * @param curi CrawlURI to add discoveries to
     * @param wref Link to examine for internal URIs
     */
    protected void extractLink(CrawlURI curi, Link wref) {
        UURI source = UURI.from(wref.getDestination());
        if (source == null) {
            // shouldn't happen
            return;
        }
        List<String> found = extractQueryStringLinks(source);
        for (String uri : found) {
            try {
                curi.createAndAddLink(uri, Link.SPECULATIVE_MISC,
                    Link.SPECULATIVE_HOP);
                numberOfLinksExtracted.incrementAndGet();
            } catch (URIException e) {
                LOGGER.log(Level.FINE, "bad URI", e);
            }
        }
        // TODO: consider path URIs too
    }

    /**
     * Look for URIs inside the supplied UURI.
     *
     * Static for ease of testing or outside use.
     *
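     * Illustrative example (hypothetical input):
     *   calling this method on the UURI for
     *   http://www.example.com/go?u=http%3A%2F%2Fwww.example.net%2F
     *   returns a list containing "http://www.example.net/".
     *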
     * @param source UURI to examine
     * @return List of discovered String URIs.
     */
    protected static List<String> extractQueryStringLinks(UURI source) {
        List<String> results = new ArrayList<String>();
        String decodedQuery;
        try {
            decodedQuery = source.getQuery();
        } catch (URIException e1) {
            // shouldn't happen
            return results;
        }
        if (decodedQuery == null) {
            return results;
        }
        // check if the full query string appears to be an http(s) URI
        Matcher m = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN,
            decodedQuery);
        if (m.matches()) {
            results.add(decodedQuery);
        }
        // split into params; see if any param value is an http(s) URI
        String rawQuery = new String(source.getRawQuery());
        String[] params = rawQuery.split("&");
        for (String param : params) {
            String[] keyVal = param.split("=");
            if (keyVal.length == 2) {
                String candidate;
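                // LaxURLCodec is Heritrix's lenient variant of the
                // commons-codec URL decoder; values it still cannot
                // decode are simply skipped.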
                try {
                    candidate = LaxURLCodec.DEFAULT.decode(keyVal[1]);
                } catch (DecoderException e) {
                    continue;
                }
                // TODO: use other non-UTF8 codecs when appropriate
                m.reset(candidate);
                if (m.matches()) {
                    results.add(candidate);
                }
            }
        }
        // recycle the matcher only after its final use
        TextUtils.recycleMatcher(m);
        return results;
    }

    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: " + ExtractorURI.class.getName() + "\n");
        ret.append("  Function: Extracts links inside other URIs\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled.get()
            + "\n");
        ret.append("  Links extracted: " + numberOfLinksExtracted.get()
            + "\n\n");
        return ret.toString();
    }
}