001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * HostScope.java
020: * Created on Oct 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.scope;
025:
026: import java.util.Iterator;
027:
028: import org.archive.crawler.deciderules.DecidingScope;
029: import org.archive.crawler.filter.FilePatternFilter;
030: import org.archive.crawler.filter.TransclusionFilter;
031: import org.archive.crawler.framework.Filter;
032: import org.archive.net.UURI;
033:
034: /**
035: * A core CrawlScope suitable for the most common
036: * crawl needs.
037: *
038: * Roughly, its logic is that a URI is included if:
039: *
040: * (( isSeed(uri) || focusFilter.accepts(uri) )
041: * || transitiveFilter.accepts(uri) )
042: * && ! excludeFilter.accepts(uri)
043: *
044: * The focusFilter may be specified by either:
045: * - adding a 'mode' attribute to the
046: * <code>scope</code> element. mode="broad" is equivalent
047: * to no focus; modes "path", "host", and "domain"
048: * imply a SeedExtensionFilter will be used, with
049: * the <code>scope</code> element providing its configuration
050: * - adding a <code>focus</code> subelement
051: * If unspecified, the focusFilter will default to
052: * an accepts-all filter.
053: *
054: * The transitiveFilter may be specified by supplying
055: * a <code>transitive</code> subelement. If unspecified, a
056: * TransclusionFilter will be used, with the <code>scope</code>
057: * element providing its configuration.
058: *
059: * The excludeFilter may be specified by supplying
060: * a <code>exclude</code> subelement. If unspecified, a
061: * accepts-none filter will be used -- meaning that
062: * no URIs will pass the filter and thus be excluded.
063: *
064: * @author gojomo
065: * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
066: */
067: public class HostScope extends SeedCachingScope {
068:
069: private static final long serialVersionUID = -6257664892667267266L;
070:
071: public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
072: public static final String ATTR_ADDITIONAL_FOCUS_FILTER = "additionalScopeFocus";
073:
074: Filter additionalFocusFilter;
075: Filter transitiveFilter;
076:
077: public HostScope(String name) {
078: super (name);
079: setDescription("HostScope: A scope for host crawls *Deprecated* Use "
080: + "DecidingScope instead. Crawls made with this scope"
081: + " will be limited to the hosts its seeds. Thus if one of"
082: + " the seeds is 'archive.org' the subdomain"
083: + " 'crawler.archive.org' will not be crawled."
084: + " 'www.host' is considered to be the same as host.");
085: additionalFocusFilter = (Filter) addElementToDefinition(new FilePatternFilter(
086: ATTR_ADDITIONAL_FOCUS_FILTER));
087: this .transitiveFilter = (Filter) addElementToDefinition(new TransclusionFilter(
088: ATTR_TRANSITIVE_FILTER));
089: }
090:
091: /**
092: * @param o
093: * @return True if transitive filter accepts passed object.
094: */
095: protected boolean transitiveAccepts(Object o) {
096: if (this .transitiveFilter == null) {
097: return true;
098: }
099: return this .transitiveFilter.accepts(o);
100: }
101:
102: /**
103: * @param o
104: * @return True if focus filter accepts passed object.
105: */
106: protected boolean focusAccepts(Object o) {
107: UURI u = UURI.from(o);
108: if (u == null) {
109: return false;
110: }
111: // Get the seeds to refresh
112: Iterator iter = seedsIterator();
113: while (iter.hasNext()) {
114: if (isSameHost((UURI) iter.next(), u)) {
115: checkClose(iter);
116: return true;
117: }
118: }
119: // if none found, fail
120: checkClose(iter);
121: return false;
122: }
123:
124: // Javadoc inherited.
125: @Override
126: protected boolean additionalFocusAccepts(Object o) {
127: return additionalFocusFilter.accepts(o);
128: }
129:
130: }
|