001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * DomainScope.java
020: * Created on Oct 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.scope;
025:
026: import java.util.Iterator;
027: import java.util.logging.Logger;
028:
029: import org.apache.commons.httpclient.URIException;
030: import org.archive.crawler.deciderules.DecidingScope;
031: import org.archive.crawler.filter.FilePatternFilter;
032: import org.archive.crawler.filter.TransclusionFilter;
033: import org.archive.crawler.framework.Filter;
034: import org.archive.net.UURI;
035:
036: /**
037: * A core CrawlScope suitable for the most common
038: * crawl needs.
039: *
040: * Roughly, its logic is that a URI is included if:
041: *
042: * (( isSeed(uri) || focusFilter.accepts(uri) )
043: * || transitiveFilter.accepts(uri) )
044: * && ! excludeFilter.accepts(uri)
045: *
046: * The focusFilter may be specified by either:
047: * - adding a 'mode' attribute to the
048: * <code>scope</code> element. mode="broad" is equivalent
049: * to no focus; modes "path", "host", and "domain"
050: * imply a SeedExtensionFilter will be used, with
051: * the <code>scope</code> element providing its configuration
052: * - adding a <code>focus</code> subelement
053: * If unspecified, the focusFilter will default to
054: * an accepts-all filter.
055: *
056: * The transitiveFilter may be specified by supplying
057: * a <code>transitive</code> subelement. If unspecified, a
058: * TransclusionFilter will be used, with the <code>scope</code>
059: * element providing its configuration.
060: *
061: * The excludeFilter may be specified by supplying
062: * a <code>exclude</code> subelement. If unspecified, a
063: * accepts-none filter will be used -- meaning that
064: * no URIs will pass the filter and thus be excluded.
065: *
066: * @author gojomo
067: * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
068: */
069: public class DomainScope extends SeedCachingScope {
070:
071: private static final long serialVersionUID = 648062105277258820L;
072:
073: private static final Logger logger = Logger
074: .getLogger(DomainScope.class.getName());
075:
076: public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
077: public static final String ATTR_ADDITIONAL_FOCUS_FILTER = "additionalScopeFocus";
078: public static final String DOT = ".";
079:
080: Filter additionalFocusFilter;
081: Filter transitiveFilter;
082:
083: public DomainScope(String name) {
084: super (name);
085: setDescription("DomainScope: A scope for domain crawls *Deprecated* Use "
086: + "DecidingScope instead. Crawls made with this"
087: + " scope will be limited to the domain of its seeds. It will"
088: + " however reach subdomains of the seeds' original domains."
089: + " www[#].host is considered to be the same as host.");
090: this .additionalFocusFilter = (Filter) addElementToDefinition(new FilePatternFilter(
091: ATTR_ADDITIONAL_FOCUS_FILTER));
092: this .transitiveFilter = (Filter) addElementToDefinition(new TransclusionFilter(
093: ATTR_TRANSITIVE_FILTER));
094: }
095:
096: /**
097: * @param o
098: * @return True if transitive filter accepts passed object.
099: */
100: protected boolean transitiveAccepts(Object o) {
101: return this .transitiveFilter.accepts(o);
102: }
103:
104: /**
105: * Check if an URI is part of this scope.
106: *
107: * @param o An instance of UURI or of CandidateURI.
108: * @return True if focus filter accepts passed object.
109: */
110: protected boolean focusAccepts(Object o) {
111: UURI u = UURI.from(o);
112: if (u == null) {
113: return false;
114: }
115: // Get the seeds to refresh and then get an iterator inside a
116: // synchronization block. The seeds list may get updated during our
117: // iteration. This will throw a concurrentmodificationexception unless
118: // we synchronize.
119: String seedDomain = null;
120: String candidateDomain = null;
121:
122: // Get candidate domain where www[0-9]*\. is stripped.
123: try {
124: candidateDomain = u.getHostBasename();
125: } catch (URIException e1) {
126: logger
127: .severe("UURI getHostBasename failed for candidate URI: "
128: + u);
129: }
130: if (candidateDomain == null) {
131: // either an opaque, unfetchable, or unparseable URI
132: return false;
133: }
134:
135: Iterator iter = seedsIterator();
136: while (iter.hasNext()) {
137: UURI s = (UURI) iter.next();
138: // Get seed domain where www[0-9]*\. is stripped.
139: try {
140: seedDomain = s.getHostBasename();
141: } catch (URIException e) {
142: logger.severe("UURI getHostBasename failed for seed: "
143: + s);
144: }
145: if (seedDomain == null) {
146: // GetHost can come back null. See bug item
147: // [ 910120 ] java.net.URI#getHost fails when leading digit
148: continue;
149: }
150:
151: // Check if stripped hosts are same.
152: if (seedDomain.equals(candidateDomain)) {
153: checkClose(iter);
154: return true;
155: }
156:
157: // Hosts are not same. Adjust seed basename to check if
158: // candidate domain ends with .seedDomain
159: seedDomain = DOT + seedDomain;
160: if (seedDomain.regionMatches(0, candidateDomain,
161: candidateDomain.length() - seedDomain.length(),
162: seedDomain.length())) {
163: // Domain suffix congruence
164: checkClose(iter);
165: return true;
166: } // Else keep trying other seeds
167: }
168: // if none found, fail
169: checkClose(iter);
170: return false;
171: }
172:
173: protected boolean additionalFocusAccepts(Object o) {
174: return additionalFocusFilter.accepts(o);
175: }
176: }
|