01: /* OnDomainsDecideRule
02: *
03: * $Id: OnDomainsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
04: *
05: * Created on Apr 5, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.deciderules;
26:
27: import org.archive.util.SurtPrefixSet;
28:
29: /**
30: * Rule applies configured decision to any URIs that
31: * are on one of the domains in the configured set of
32: * domains, filled from the seed set.
33: *
34: * @author gojomo
35: */
36: public class OnDomainsDecideRule extends SurtPrefixedDecideRule {
37:
38: private static final long serialVersionUID = -3872369060554558805L;
39:
40: //private static final Logger logger =
41: // Logger.getLogger(OnDomainsDecideRule.class.getName());
42: /**
43: * Usual constructor.
44: * @param name
45: */
46: public OnDomainsDecideRule(String name) {
47: super (name);
48: setDescription("OnDomainsDecideRule. Makes the configured decision "
49: + "for any URI which is inside one of the domains in the "
50: + "configured set of domains (derived from the seed"
51: + "list, with 'www' removed when present).");
52: // disable direct setting of SURTs-related options
53: //getElementFromDefinition(ATTR_SEEDS_AS_SURT_PREFIXES).setTransient(true);
54: //getElementFromDefinition(ATTR_SURTS_SOURCE_FILE).setTransient(true);
55: // leaving surts-dump as option helpful for debugging/learning, for now
56: //getElementFromDefinition(ATTR_SURTS_DUMP_FILE).setTransient(true);
57: }
58:
59: /**
60: * Patch the SURT prefix set so that it only includes host-enforcing prefixes
61: *
62: * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
63: */
64: protected void readPrefixes() {
65: buildSurtPrefixSet();
66: surtPrefixes.convertAllPrefixesToDomains();
67: dumpSurtPrefixSet();
68: }
69:
70: protected String prefixFrom(String uri) {
71: return SurtPrefixSet.convertPrefixToDomain(super
72: .prefixFrom(uri));
73: }
74: }
|