001: /*
002: * ExtractorURI
003: *
004: * $Id: ExtractorImpliedURI.java 4943 2007-02-27 02:54:54Z ia_igor $
005: *
006: * Created on July 20, 2006
007: *
008: * Copyright (C) 2006 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.crawler.extractor;
028:
029: import java.util.Collection;
030: import java.util.logging.Level;
031: import java.util.logging.Logger;
032: import java.util.regex.Matcher;
033:
034: import org.apache.commons.httpclient.URIException;
035: import org.archive.crawler.datamodel.CoreAttributeConstants;
036: import org.archive.crawler.datamodel.CrawlURI;
037: import org.archive.crawler.settings.SimpleType;
038: import org.archive.util.TextUtils;
039:
040: /**
041: * An extractor for finding 'implied' URIs inside other URIs. If the
042: * 'trigger' regex is matched, a new URI will be constructed from the
043: * 'build' replacement pattern.
044: *
045: * Unlike most other extractors, this works on URIs discovered by
046: * previous extractors. Thus it should appear near the end of any
047: * set of extractors.
048: *
049: * Initially, only finds absolute HTTP(S) URIs in query-string or its
050: * parameters.
051: *
052: * TODO: extend to find URIs in path-info
053: *
054: * @author Gordon Mohr
055: *
056: **/
057:
058: public class ExtractorImpliedURI extends Extractor implements
059: CoreAttributeConstants {
060:
061: private static final long serialVersionUID = 8579045413127769497L;
062:
063: private static Logger LOGGER = Logger
064: .getLogger(ExtractorImpliedURI.class.getName());
065:
066: /** regex which when matched triggers addition of 'implied' URI */
067: public static final String ATTR_TRIGGER_REGEXP = "trigger-regexp";
068: /** replacement pattern used to build 'implied' URI */
069: public static final String ATTR_BUILD_PATTERN = "build-pattern";
070:
071: /** whether to remove URIs that trigger addition of 'implied' URI;
072: * default false
073: */
074: public static final String ATTR_REMOVE_TRIGGER_URIS = "remove-trigger-uris";
075:
076: // FIXME: these counters are not incremented atomically; totals may not
077: // be correct
078: private long numberOfCURIsHandled = 0;
079: private long numberOfLinksExtracted = 0;
080:
081: /**
082: * Constructor
083: *
084: * @param name
085: */
086: public ExtractorImpliedURI(String name) {
087: super (
088: name,
089: "Implied URI Extractor. Finds URIs implied by other "
090: + "URIs according to regex/replacement patterns. Should "
091: + "appear after most other extractors.");
092:
093: addElementToDefinition(new SimpleType(
094: ATTR_TRIGGER_REGEXP,
095: "Triggering regular expression. When a discovered URI "
096: + "matches this pattern, the 'implied' URI will be "
097: + "built. The capturing groups of this expression are "
098: + "available for the build replacement pattern.",
099: ""));
100: addElementToDefinition(new SimpleType(ATTR_BUILD_PATTERN,
101: "Replacement pattern to build 'implied' URI, using "
102: + "captured groups of trigger expression.", ""));
103: addElementToDefinition(new SimpleType(
104: ATTR_REMOVE_TRIGGER_URIS,
105: "If true, all URIs that match trigger regular expression "
106: + "are removed from the list of extracted URIs. "
107: + "Default is false.", Boolean.FALSE));
108: }
109:
110: /**
111: * Perform usual extraction on a CrawlURI
112: *
113: * @param curi Crawl URI to process.
114: */
115: public void extract(CrawlURI curi) {
116:
117: this .numberOfCURIsHandled++;
118: // use array copy because discoveriess will add to outlinks
119: Collection<Link> links = curi.getOutLinks();
120: Link[] sourceLinks = links.toArray(new Link[links.size()]);
121: for (Link wref : sourceLinks) {
122: String implied = extractImplied(wref.getDestination(),
123: (String) getUncheckedAttribute(curi,
124: ATTR_TRIGGER_REGEXP),
125: (String) getUncheckedAttribute(curi,
126: ATTR_BUILD_PATTERN));
127: if (implied != null) {
128: try {
129: curi
130: .createAndAddLink(implied,
131: Link.SPECULATIVE_MISC,
132: Link.SPECULATIVE_HOP);
133:
134: numberOfLinksExtracted++;
135:
136: final boolean removeTriggerURI = ((Boolean) getUncheckedAttribute(
137: curi, ATTR_REMOVE_TRIGGER_URIS))
138: .booleanValue();
139:
140: // remove trigger URI from the outlinks if configured so.
141: if (removeTriggerURI) {
142: if (curi.getOutLinks().remove(wref)) {
143: LOGGER.log(Level.FINE, wref
144: .getDestination()
145: + " has been removed from "
146: + wref.getSource()
147: + " outlinks list.");
148: numberOfLinksExtracted--;
149:
150: } else {
151: LOGGER.log(Level.FINE, "Failed to remove "
152: + wref.getDestination() + " from "
153: + wref.getSource()
154: + " outlinks list.");
155: }
156: }
157:
158: } catch (URIException e) {
159: LOGGER.log(Level.FINE, "bad URI", e);
160: }
161: }
162: }
163: }
164:
165: /**
166: * Utility method for extracting 'implied' URI given a source uri,
167: * trigger pattern, and build pattern.
168: *
169: * @param uri source to check for implied URI
170: * @param trigger regex pattern which if matched implies another URI
171: * @param build replacement pattern to build the implied URI
172: * @return implied URI, or null if none
173: */
174: protected static String extractImplied(CharSequence uri,
175: String trigger, String build) {
176: if (trigger.length() == 0) {
177: // short-circuit empty-string trigger
178: return null;
179: }
180: Matcher m = TextUtils.getMatcher(trigger, uri);
181: if (m.matches()) {
182: String result = m.replaceFirst(build);
183: TextUtils.recycleMatcher(m);
184: return result;
185: }
186: return null;
187: }
188:
189: public String report() {
190: StringBuffer ret = new StringBuffer();
191: ret.append("Processor: " + ExtractorImpliedURI.class.getName()
192: + "\n");
193: ret
194: .append(" Function: Extracts links inside other URIs\n");
195: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
196: + "\n");
197: ret.append(" Links extracted: " + numberOfLinksExtracted
198: + "\n\n");
199:
200: return ret.toString();
201: }
202: }
|