001: /* HashCrawlMapper
002: *
003: * Created on Sep 30, 2005
004: *
005: * Copyright (C) 2005 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.processor;
024:
025: import java.util.regex.Matcher;
026:
027: import org.archive.crawler.datamodel.CandidateURI;
028: import org.archive.crawler.settings.SimpleType;
029: import org.archive.util.TextUtils;
030:
031: import st.ata.util.FPGenerator;
032:
033: /**
034: * Maps URIs to one of N crawler names by applying a hash to the
035: * URI's (possibly-transformed) classKey.
036: *
037: * @author gojomo
038: * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
039: */
040: public class HashCrawlMapper extends CrawlMapper {
041: private static final long serialVersionUID = 1L;
042:
043: /** count of crawlers */
044: public static final String ATTR_CRAWLER_COUNT = "crawler-count";
045: public static final Long DEFAULT_CRAWLER_COUNT = new Long(1);
046:
047: /** regex pattern for reducing classKey */
048: public static final String ATTR_REDUCE_PATTERN = "reduce-prefix-pattern";
049: public static final String DEFAULT_REDUCE_PATTERN = "";
050:
051: // /** replace pattern for reducing classKey */
052: // public static final String ATTR_REPLACE_PATTERN = "replace-pattern";
053: // public static final String DEFAULT_REPLACE_PATTERN = "";
054:
055: long bucketCount = 1;
056: String reducePattern = null;
057:
058: // String replacePattern = null;
059:
060: /**
061: * Constructor.
062: * @param name Name of this processor.
063: */
064: public HashCrawlMapper(String name) {
065: super (
066: name,
067: "HashCrawlMapper. Maps URIs to a numerically named "
068: + "crawler by hashing the URI's (possibly transfored) "
069: + "classKey to one of the specified number of buckets.");
070: addElementToDefinition(new SimpleType(ATTR_CRAWLER_COUNT,
071: "Number of crawlers among which to split up the URIs. "
072: + "Their names are assumed to be 0..N-1.",
073: DEFAULT_CRAWLER_COUNT));
074: addElementToDefinition(new SimpleType(
075: ATTR_REDUCE_PATTERN,
076: "A regex pattern to apply to the classKey, using "
077: + "the first match as the mapping key. If empty (the"
078: + "default), use the full classKey.",
079: DEFAULT_REDUCE_PATTERN));
080: }
081:
082: /**
083: * Look up the crawler node name to which the given CandidateURI
084: * should be mapped.
085: *
086: * @param cauri CandidateURI to consider
087: * @return String node name which should handle URI
088: */
089: protected String map(CandidateURI cauri) {
090: // get classKey, via frontier to generate if necessary
091: String key = getController().getFrontier().getClassKey(cauri);
092: return mapString(key, reducePattern, bucketCount);
093: }
094:
095: protected void initialTasks() {
096: super .initialTasks();
097: bucketCount = (Long) getUncheckedAttribute(null,
098: ATTR_CRAWLER_COUNT);
099: kickUpdate();
100: }
101:
102: @Override
103: public void kickUpdate() {
104: super .kickUpdate();
105: reducePattern = (String) getUncheckedAttribute(null,
106: ATTR_REDUCE_PATTERN);
107: }
108:
109: public static String mapString(String key, String reducePattern,
110: long bucketCount) {
111: if (reducePattern != null && reducePattern.length() > 0) {
112: Matcher matcher = TextUtils.getMatcher(reducePattern, key);
113: if (matcher.find()) {
114: key = matcher.group();
115: }
116: TextUtils.recycleMatcher(matcher);
117: }
118: long fp = FPGenerator.std64.fp(key);
119: long bucket = fp % bucketCount;
120: return Long.toString(bucket >= 0 ? bucket : -bucket);
121: }
122: }
|