/* LexicalCrawlMapper
 *
 * Created on Sep 30, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.processor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;

/**
 * A simple crawl splitter/mapper, dividing up CandidateURIs/CrawlURIs
 * between crawlers by diverting some range of URIs to local log files
 * (which can then be imported into other crawlers).
 *
 * May operate on a CrawlURI (typically early in the processing chain) or
 * its CandidateURI outlinks (late in the processing chain, after
 * LinksScoper), or both (if inserted and configured in both places).
 *
 * <p>Uses lexical comparisons of classKeys to map URIs to crawlers. The
 * 'map' is specified via either a local or HTTP-fetchable file. Each
 * line of this file should contain two space-separated tokens, the
 * first a key and the second a crawler node name (which should be
 * legal as part of a filename). All URIs will be mapped to the crawler
 * node name associated with the nearest mapping key equal or subsequent
 * to the URI's own classKey. If there are no mapping keys equal or
 * after the classKey, the mapping 'wraps around' to the first mapping key.
 *
 * <p>One crawler name is distinguished as the 'local name'; URIs mapped to
 * this name are not diverted, but continue to be processed normally.
 *
 * <p>For example, assume a SurtAuthorityQueueAssignmentPolicy and
 * a simple mapping file:
 *
 * <pre>
 * d crawlerA
 * ~ crawlerB
 * </pre>
 * <p>All URIs with "com," classKeys will find the 'd' key as the nearest
 * subsequent mapping key, and thus be mapped to 'crawlerA'. If that is
 * the 'local name', such URIs are processed normally; otherwise, they
 * are written to a diversion log aimed at 'crawlerA'.
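 *
 * <p>Likewise, a URI whose classKey is "org," falls after 'd' but at or
 * before '~', and is therefore mapped to 'crawlerB'. A classKey sorting
 * after every mapping key would 'wrap around' to the first key ('d') and
 * map to 'crawlerA'; ending the map with a high-sorting key such as '~'
 * makes such wraparound unlikely in practice.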
 *
 * <p>When using the JMX importUris operation to import URIs diverted by
 * a {@link LexicalCrawlMapper} instance, use the <code>recoveryLog</code>
 * style.
 *
 * @author gojomo
 * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
 */
public class LexicalCrawlMapper extends CrawlMapper {
    private static final long serialVersionUID = 1L;

    /** Where to load the map from: a local path or HTTP URL. */
    public static final String ATTR_MAP_SOURCE = "map-source";
    public static final String DEFAULT_MAP_SOURCE = "";

    /**
     * Mapping of classKey ranges (each represented by the key ending the
     * range) to crawlers (by abstract name/filename).
     */
    TreeMap<String, String> map = new TreeMap<String, String>();

    /**
     * Constructor.
     * @param name Name of this processor.
     */
    public LexicalCrawlMapper(String name) {
        super(name, "LexicalCrawlMapper. Maps URIs to a named "
            + "crawler by a lexical comparison of the URI's "
            + "classKey to a supplied ranges map.");
        addElementToDefinition(new SimpleType(
            ATTR_MAP_SOURCE,
            "Path (or HTTP URL) to map specification file. Each line "
            + "should include 2 whitespace-separated tokens: the first a "
            + "key indicating the end of a range, the second the crawler "
            + "node to which URIs in the key range should be mapped.",
            DEFAULT_MAP_SOURCE));
    }

    /**
     * Look up the crawler node name to which the given CandidateURI
     * should be mapped.
     *
     * @param cauri CandidateURI to consider
     * @return name of the crawler node that should handle the URI
     */
    protected String map(CandidateURI cauri) {
        // get classKey, via frontier to generate if necessary
        String classKey = getController().getFrontier().getClassKey(
            cauri);
        // consider only mappings whose keys are at or after the classKey
        SortedMap<String, String> tail = map.tailMap(classKey);
        if (tail.isEmpty()) {
            // no key at or after the classKey: wrap around to first key
            tail = map;
        }
        // target node is the value of the nearest subsequent key
        return tail.get(tail.firstKey());
    }

    protected void initialTasks() {
        super.initialTasks();
        try {
            loadMap();
        } catch (IOException e) {
            // rethrow as unchecked so crawl setup fails visibly
            throw new RuntimeException(e);
        }
    }

    /**
     * Retrieve and parse the mapping specification from a local path or
     * HTTP URL.
     *
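     * <p>A minimal example of the expected file format (assuming the
     * '#'-style comment handling of {@link RegexpLineIterator}):
     * <pre>
     * # key marking a range's end, then the crawler node name
     * d crawlerA
     * ~ crawlerB
     * </pre>
     *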
     * @throws IOException
     */
    protected void loadMap() throws IOException {
        map.clear();
        String mapSource = (String) getUncheckedAttribute(null,
            ATTR_MAP_SOURCE);
        Reader reader = null;
        if (!mapSource.startsWith("http://")) {
            // file-based source; resolve relative paths against the
            // crawl's disk directory
            File source = new File(mapSource);
            if (!source.isAbsolute()) {
                source = new File(getSettingsHandler().getOrder()
                    .getController().getDisk(), mapSource);
            }
            reader = new FileReader(source);
        } else {
            // HTTP-fetchable source
            URLConnection conn = (new URL(mapSource)).openConnection();
            reader = new InputStreamReader(conn.getInputStream());
        }
        BufferedReader br = new BufferedReader(reader);
        try {
            // iterate over trimmed, non-blank, non-comment entries
            Iterator iter = new RegexpLineIterator(
                new LineReadingIterator(br),
                RegexpLineIterator.COMMENT_LINE,
                RegexpLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
                RegexpLineIterator.ENTRY);
            while (iter.hasNext()) {
                // each entry: "<range-end-key> <crawler-node-name>"
                String[] entry = ((String) iter.next()).split("\\s+");
                map.put(entry[0], entry[1]);
            }
        } finally {
            // always release the underlying file or stream
            br.close();
        }
    }
}