01: /* BucketQueueAssignmentPolicy
02: *
03: * $Header$
04: *
05: * Created on May 06, 2005
06: *
07: * Copyright (C) 2005 Christian Kohlschuetter
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: *
25: */
26: package org.archive.crawler.frontier;
27:
28: import org.apache.commons.httpclient.URIException;
29: import org.archive.crawler.datamodel.CandidateURI;
30: import org.archive.crawler.datamodel.CrawlHost;
31: import org.archive.crawler.framework.CrawlController;
32:
33: /**
34: * Uses the target IPs as basis for queue-assignment,
35: * distributing them over a fixed number of sub-queues.
36: *
37: * @author Christian Kohlschuetter
38: */
39: public class BucketQueueAssignmentPolicy extends QueueAssignmentPolicy {
40: private static final int DEFAULT_NOIP_BITMASK = 1023;
41: private static final int DEFAULT_QUEUES_HOSTS_MODULO = 1021;
42:
43: public String getClassKey(final CrawlController controller,
44: final CandidateURI curi) {
45:
46: CrawlHost host;
47: try {
48: host = controller.getServerCache().getHostFor(
49: curi.getUURI().getReferencedHost());
50: } catch (URIException e) {
51: // FIXME error handling
52: e.printStackTrace();
53: host = null;
54: }
55: if (host == null) {
56: return "NO-HOST";
57: } else if (host.getIP() == null) {
58: return "NO-IP-".concat(Integer.toString(Math.abs(host
59: .getHostName().hashCode())
60: & DEFAULT_NOIP_BITMASK));
61: } else {
62: return Integer.toString(Math.abs(host.getIP().hashCode())
63: % DEFAULT_QUEUES_HOSTS_MODULO);
64: }
65: }
66:
67: public int maximumNumberOfKeys() {
68: return DEFAULT_NOIP_BITMASK + DEFAULT_QUEUES_HOSTS_MODULO + 2;
69: }
70: }
|