01: /* HostnameQueueAssignmentPolicy
02: *
03: * $Id: HostnameQueueAssignmentPolicy.java 3838 2005-09-21 23:00:47Z gojomo $
04: *
05: * Created on Oct 5, 2004
06: *
07: * Copyright (C) 2004 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.frontier;
26:
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
35:
36: /**
37: * QueueAssignmentPolicy based on the hostname:port evident in the given
38: * CrawlURI.
39: *
40: * @author gojomo
41: */
42: public class HostnameQueueAssignmentPolicy extends
43: QueueAssignmentPolicy {
44: private static final Logger logger = Logger
45: .getLogger(HostnameQueueAssignmentPolicy.class.getName());
46: /**
47: * When neat host-based class-key fails us
48: */
49: private static String DEFAULT_CLASS_KEY = "default...";
50:
51: private static final String DNS = "dns";
52:
53: public String getClassKey(CrawlController controller,
54: CandidateURI cauri) {
55: String scheme = cauri.getUURI().getScheme();
56: String candidate = null;
57: try {
58: if (scheme.equals(DNS)) {
59: if (cauri.getVia() != null) {
60: // Special handling for DNS: treat as being
61: // of the same class as the triggering URI.
62: // When a URI includes a port, this ensures
63: // the DNS lookup goes atop the host:port
64: // queue that triggered it, rather than
65: // some other host queue
66: UURI viaUuri = UURIFactory.getInstance(cauri
67: .flattenVia());
68: candidate = viaUuri.getAuthorityMinusUserinfo();
69: // adopt scheme of triggering URI
70: scheme = viaUuri.getScheme();
71: } else {
72: candidate = cauri.getUURI().getReferencedHost();
73: }
74: } else {
75: candidate = cauri.getUURI().getAuthorityMinusUserinfo();
76: }
77:
78: if (candidate == null || candidate.length() == 0) {
79: candidate = DEFAULT_CLASS_KEY;
80: }
81: } catch (URIException e) {
82: logger.log(Level.INFO,
83: "unable to extract class key; using default", e);
84: candidate = DEFAULT_CLASS_KEY;
85: }
86: if (scheme != null && scheme.equals(UURIFactory.HTTPS)) {
87: // If https and no port specified, add default https port to
88: // distinguish https from http server without a port.
89: if (!candidate.matches(".+:[0-9]+")) {
90: candidate += UURIFactory.HTTPS_PORT;
91: }
92: }
93: // Ensure classKeys are safe as filenames on NTFS
94: return candidate.replace(':', '#');
95: }
96:
97: }
|