001: /* SurtAuthorityQueueAssignmentPolicy
002: *
003: * $Id: SurtAuthorityQueueAssignmentPolicy.java 3889 2005-10-11 23:09:45Z gojomo $
004: *
005: * Created on Oct 5, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.frontier;
026:
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.datamodel.CandidateURI;
032: import org.archive.crawler.framework.CrawlController;
033: import org.archive.net.UURI;
034: import org.archive.net.UURIFactory;
035:
036: /**
037: * SurtAuthorityQueueAssignmentPolicy based on the surt form of hostname.
038: */
039: public class SurtAuthorityQueueAssignmentPolicy extends
040: QueueAssignmentPolicy {
041: private static final Logger logger = Logger
042: .getLogger(SurtAuthorityQueueAssignmentPolicy.class
043: .getName());
044: /**
045: * When neat host-based class-key fails us
046: */
047: private static String DEFAULT_CLASS_KEY = "default...";
048:
049: private static final String DNS = "dns";
050:
051: public String getClassKey(CrawlController controller,
052: CandidateURI cauri) {
053: String scheme = cauri.getUURI().getScheme();
054: String candidate = null;
055: try {
056: if (scheme.equals(DNS)) {
057: UURI effectiveuuri;
058: if (cauri.getVia() != null) {
059: // Special handling for DNS: treat as being
060: // of the same class as the triggering URI.
061: // When a URI includes a port, this ensures
062: // the DNS lookup goes atop the host:port
063: // queue that triggered it, rather than
064: // some other host queue
065: effectiveuuri = UURIFactory.getInstance(cauri
066: .flattenVia());
067: } else {
068: // To get the dns surt form, create a fake http version
069: // (Gordon suggestion).
070: effectiveuuri = UURIFactory.getInstance("http://"
071: + cauri.getUURI().getPath());
072: }
073: candidate = getSurtAuthority(effectiveuuri
074: .getSurtForm());
075: } else {
076: candidate = getSurtAuthority(cauri.getUURI()
077: .getSurtForm());
078: }
079:
080: if (candidate == null || candidate.length() == 0) {
081: candidate = DEFAULT_CLASS_KEY;
082: }
083: } catch (URIException e) {
084: logger.log(Level.INFO,
085: "unable to extract class key; using default", e);
086: candidate = DEFAULT_CLASS_KEY;
087: }
088: // Ensure classKeys are safe as filenames on NTFS
089: return candidate.replace(':', '#');
090: }
091:
092: protected String getSurtAuthority(String surt) {
093: int indexOfOpen = surt.indexOf("://(");
094: int indexOfClose = surt.indexOf(")");
095: if (indexOfOpen == -1 || indexOfClose == -1
096: || ((indexOfOpen + 4) >= indexOfClose)) {
097: return DEFAULT_CLASS_KEY;
098: }
099: return surt.substring(indexOfOpen + 4, indexOfClose);
100: }
101: }
|