001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * BasicScope.java
020: * Created on Oct 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.scope;
025:
026: import java.util.Iterator;
027: import java.util.logging.Logger;
028:
029: import org.apache.commons.httpclient.URIException;
030: import org.archive.crawler.deciderules.DecidingScope;
031: import org.archive.crawler.filter.FilePatternFilter;
032: import org.archive.crawler.filter.TransclusionFilter;
033: import org.archive.crawler.framework.Filter;
034: import org.archive.net.UURI;
035:
036: /**
037: * A core CrawlScope suitable for the most common
038: * crawl needs.
039: *
040: * Roughly, its logic is that a URI is included if:
041: *
042: * (( isSeed(uri) || focusFilter.accepts(uri) )
043: * || transitiveFilter.accepts(uri) )
044: * && ! excludeFilter.accepts(uri)
045: *
046: * The focusFilter may be specified by either:
047: * - adding a 'mode' attribute to the
048: * <code>scope</code> element. mode="broad" is equivalent
049: * to no focus; modes "path", "host", and "domain"
050: * imply a SeedExtensionFilter will be used, with
051: * the <code>scope</code> element providing its configuration
052: * - adding a <code>focus</code> subelement
053: * If unspecified, the focusFilter will default to
054: * an accepts-all filter.
055: *
056: * The transitiveFilter may be specified by supplying
057: * a <code>transitive</code> subelement. If unspecified, a
058: * TransclusionFilter will be used, with the <code>scope</code>
059: * element providing its configuration.
060: *
061: * The excludeFilter may be specified by supplying
062: * a <code>exclude</code> subelement. If unspecified, a
063: * accepts-none filter will be used -- meaning that
064: * no URIs will pass the filter and thus be excluded.
065: *
066: * @author gojomo
067: * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
068: */
069: public class PathScope extends SeedCachingScope {
070:
071: private static final long serialVersionUID = -2217024073240277527L;
072:
073: private static Logger logger = Logger
074: .getLogger("org.archive.crawler.basic.PathScope");
075:
076: public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
077: public static final String ATTR_ADDITIONAL_FOCUS_FILTER = "additionalScopeFocus";
078:
079: Filter additionalFocusFilter;
080: Filter transitiveFilter;
081:
082: public PathScope(String name) {
083: super (name);
084: setDescription("PathScope: A scope for path crawls *Deprecated* Use "
085: + "DecidingScope instead. Crawls made with this scope"
086: + " will be limited to a specific portion of the hosts its seeds"
087: + " provide. More specifically the paths those seeds provide."
088: + " For example if one of the seeds is 'archive.org/example/'"
089: + " all URIs under the path 'examples' will be crawled (like"
090: + " 'archive.org/examples/hello.html') but not URIs in other"
091: + " paths or root (i.e. 'archive.org/index.html).");
092: this .additionalFocusFilter = (Filter) addElementToDefinition(new FilePatternFilter(
093: ATTR_ADDITIONAL_FOCUS_FILTER));
094: this .transitiveFilter = (Filter) addElementToDefinition(new TransclusionFilter(
095: ATTR_TRANSITIVE_FILTER));
096: }
097:
098: /**
099: * @param o
100: * @return True if transitive filter accepts passed object.
101: */
102: protected boolean transitiveAccepts(Object o) {
103: if (this .transitiveFilter == null) {
104: return true;
105: }
106: return this .transitiveFilter.accepts(o);
107: }
108:
109: /**
110: * @param o
111: * @return True if focus filter accepts passed object.
112: */
113: protected boolean focusAccepts(Object o) {
114: UURI u = UURI.from(o);
115: if (u == null) {
116: return false;
117: }
118: // Get the seeds to refresh
119: Iterator iter = seedsIterator();
120: while (iter.hasNext()) {
121: UURI s = (UURI) iter.next();
122: if (isSameHost(s, u)) {
123: try {
124: // Protect against non-parseable URIs. See
125: // "[ 910120 ] java.net.URI#getHost fails when
126: // leading digit"
127: if (s.getPath() == null || u.getPath() == null) {
128: continue;
129: }
130: } catch (URIException e) {
131: logger.severe("Failed get path on " + u + " or "
132: + s + ": " + e.getMessage());
133: }
134: try {
135: if (s.getPath().regionMatches(0, u.getPath(), 0,
136: s.getPath().lastIndexOf('/'))) {
137: // matches up to last '/'
138: checkClose(iter);
139: return true;
140: } else {
141: // no match; try next seed
142: continue;
143: }
144: } catch (URIException e) {
145: logger.severe("Failed get path on " + u + " or "
146: + s + ": " + e.getMessage());
147: }
148: }
149: }
150: // if none found, fail
151: checkClose(iter);
152: return false;
153: }
154:
155: // Javadoc inherited
156: @Override
157: protected boolean additionalFocusAccepts(Object o) {
158: return this.additionalFocusFilter.accepts(o);
159: }
160:
161: }
|