/* TooManyHopsDecideRule
02: *
03: * $Id: TooManyHopsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
04: *
05: * Created on Apr 1, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.deciderules;
26:
27: import org.archive.crawler.datamodel.CandidateURI;
28: import org.archive.crawler.settings.SimpleType;
29: import org.archive.crawler.settings.Type;
30:
31: /**
32: * Rule REJECTs any CrawlURIs whose total number of hops (length of the
33: * hopsPath string, traversed links of any type) is over a threshold.
34: * Otherwise returns PASS.
35: *
36: * @author gojomo
37: */
38: public class TooManyHopsDecideRule extends PredicatedDecideRule {
39:
40: private static final long serialVersionUID = -5429536193865916670L;
41:
42: private static final String ATTR_MAX_HOPS = "max-hops";
43:
44: /**
45: * Default access so available to test code.
46: */
47: static final Integer DEFAULT_MAX_HOPS = new Integer(20);
48:
49: /**
50: * Usual constructor.
51: * @param name Name of this DecideRule.
52: */
53: public TooManyHopsDecideRule(String name) {
54: super (name);
55: setDescription("TooManyHopsDecideRule. REJECTs URIs discovered "
56: + "after too many hops (followed links of any type) from seed.");
57: addElementToDefinition(new SimpleType(ATTR_MAX_HOPS, "Max path"
58: + " depth for which this filter will match",
59: DEFAULT_MAX_HOPS));
60: // make default REJECT (overriding superclass) & always-default
61: Type type = addElementToDefinition(new SimpleType(
62: ATTR_DECISION, "Decision to be applied", REJECT,
63: ALLOWED_TYPES));
64: type.setTransient(true);
65: }
66:
67: /**
68: * Evaluate whether given object is over the threshold number of
69: * hops.
70: *
71: * @param object
72: * @return true if the mx-hops is exceeded
73: */
74: protected boolean evaluate(Object object) {
75: try {
76: CandidateURI curi = (CandidateURI) object;
77: return curi.getPathFromSeed() != null
78: && curi.getPathFromSeed().length() > getThresholdHops(object);
79: } catch (ClassCastException e) {
80: // if not CrawlURI, always disregard
81: return false;
82: }
83: }
84:
85: /**
86: * @param obj Conext object.
87: * @return hops cutoff threshold
88: */
89: private int getThresholdHops(Object obj) {
90: return ((Integer) getUncheckedAttribute(obj, ATTR_MAX_HOPS))
91: .intValue();
92: }
93: }
|