01: /* Copyright (C) 2003 Internet Archive.
02: *
03: * This file is part of the Heritrix web crawler (crawler.archive.org).
04: *
05: * Heritrix is free software; you can redistribute it and/or modify
06: * it under the terms of the GNU Lesser Public License as published by
07: * the Free Software Foundation; either version 2.1 of the License, or
08: * any later version.
09: *
10: * Heritrix is distributed in the hope that it will be useful,
11: * but WITHOUT ANY WARRANTY; without even the implied warranty of
12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13: * GNU Lesser Public License for more details.
14: *
15: * You should have received a copy of the GNU Lesser Public License
16: * along with Heritrix; if not, write to the Free Software
17: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18: *
19: * BroadScope.java
20: * Created on Oct 1, 2003
21: *
22: * $Header$
23: */
24: package org.archive.crawler.scope;
25:
26: /**
27: * A CrawlScope instance defines which URIs are "in"
28: * a particular crawl.
29: *
30: * It is essentially a Filter which determines, looking at
31: * the totality of information available about a
32: * CandidateURI/CrawlURI instance, if that URI should be
33: * scheduled for crawling.
34: *
35: * <p>Dynamic information inherent in the discovery of the
36: * URI -- such as the path by which it was discovered --
37: * may be considered.
38: *
39: * <p>Dynamic information which requires the consultation
40: * of external and potentially volatile information --
41: * such as current robots.txt requests and the history
42: * of attempts to crawl the same URI -- should NOT be
43: * considered. Those potentially high-latency decisions
44: * should be made at another step.
45: *
46: * @author gojomo
47: *
48: */
49: public class BroadScope extends ClassicScope {
50:
51: private static final long serialVersionUID = -2354234238454865888L;
52:
53: /**
54: * Constructor.
55: *
56: * @param name Name of this crawlscope.
57: */
58: public BroadScope(String name) {
59: super (name);
60: setDescription("BroadScope: A scope for broad crawls. Crawls made"
61: + " with this scope will not be limited to the hosts or domains of"
62: + " its seeds. NOTE: BroadScoped crawls will eventually run out of"
63: + " memory (See Release Notes).");
64: }
65:
66: /**
67: * @param o the URI to check.
68: * @return True if transitive filter accepts passed object.
69: */
70: protected boolean transitiveAccepts(Object o) {
71: return true;
72: }
73:
74: /** Check if URI is accepted by the focus of this scope.
75: *
76: * This method should be overridden in subclasses.
77: *
78: * @param o the URI to check.
79: * @return True if focus filter accepts passed object.
80: */
81: protected boolean focusAccepts(Object o) {
82: return true;
83: }
84: }
|