001: /* Copyright (C) 2005 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * ClassicScope.java
020: * Created on Apr 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.scope;
025:
026: import javax.management.AttributeNotFoundException;
027:
028: import org.archive.crawler.datamodel.CandidateURI;
029: import org.archive.crawler.extractor.Link; //import org.archive.crawler.filter.OrFilter;
030: import org.archive.crawler.framework.CrawlScope;
031: import org.archive.crawler.settings.SimpleType;
032:
033: /**
034: * ClassicScope: superclass with shared Scope behavior for
035: * most common scopes.
036: *
037: * Roughly, its logic is captured in innerAccept(). A URI is
038: * included if:
039: * <pre>
040: * forceAccepts(uri)
041: * || (((isSeed(uri)
042: * || focusAccepts(uri))
043: * || additionalFocusAccepts(uri)
044: * || transitiveAccepts(uri))
045: * && !excludeAccepts(uri));</pre>
046: *
047: * Subclasses should override focusAccepts, additionalFocusAccepts,
048: * and transitiveAccepts.
049: *
050: * The excludeFilter may be specified by supplying
051: * a <code>exclude</code> subelement. If unspecified, a
052: * accepts-none filter will be used -- meaning that
053: * no URIs will pass the filter and thus be excluded.
054: *
055: * @author gojomo
056: */
057: public class ClassicScope extends CrawlScope {
058:
059: private static final long serialVersionUID = 4494905304855590002L;
060:
061: //private static final Logger logger = Logger.getLogger(ClassicScope.class
062: // .getName());
063:
064: public static final String ATTR_EXCLUDE_FILTER = "exclude-filter";
065: public static final String ATTR_FORCE_ACCEPT_FILTER = "force-accept-filter";
066:
067: public static final String ATTR_MAX_LINK_HOPS = "max-link-hops";
068:
069: public static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops";
070:
071: // FIXME: Replace deprecated OrFilter with non-deprecated something
072:
073: @SuppressWarnings("deprecation")
074: private org.archive.crawler.filter.OrFilter excludeFilter;
075: @SuppressWarnings("deprecation")
076: private org.archive.crawler.filter.OrFilter forceAcceptFilter;
077:
078: /**
079: * @param name
080: * ignored by superclass
081: */
082: @SuppressWarnings("deprecation")
083: public ClassicScope(String name) {
084: super (name);
085: addElementToDefinition(new SimpleType(
086: ATTR_MAX_LINK_HOPS,
087: "Max link hops to include. URIs more than this number "
088: + "of links from a seed will not be ruled in-scope. (Such "
089: + "determination does not preclude later inclusion if a "
090: + "shorter path is later discovered.)",
091: new Integer(25)));
092: addElementToDefinition(new SimpleType(
093: ATTR_MAX_TRANS_HOPS,
094: "Max transitive hops (embeds, referrals, preconditions) to "
095: + "include. URIs reached by more than this number of transitive "
096: + "hops will not be ruled in-scope, even if otherwise on an "
097: + "in-focus site. (Such determination does not preclude later "
098: + " inclusion if a shorter path is later discovered.)",
099: new Integer(5)));
100: this .excludeFilter = (org.archive.crawler.filter.OrFilter) addElementToDefinition(new org.archive.crawler.filter.OrFilter(
101: ATTR_EXCLUDE_FILTER));
102: this .forceAcceptFilter = (org.archive.crawler.filter.OrFilter) addElementToDefinition(new org.archive.crawler.filter.OrFilter(
103: ATTR_FORCE_ACCEPT_FILTER));
104: this .forceAcceptFilter.setExpertSetting(true);
105:
106: // Try to preserve the values of these attributes when we exchange
107: // scopes.
108: setPreservedFields(new String[] { ATTR_SEEDS,
109: ATTR_MAX_LINK_HOPS, ATTR_MAX_TRANS_HOPS,
110: ATTR_EXCLUDE_FILTER, ATTR_FORCE_ACCEPT_FILTER });
111: }
112:
113: /**
114: * Default constructor.
115: */
116: public ClassicScope() {
117: this (CrawlScope.ATTR_NAME);
118: }
119:
120: /**
121: * Returns whether the given object (typically a CandidateURI) falls within
122: * this scope.
123: *
124: * @param o
125: * Object to test.
126: * @return Whether the given object (typically a CandidateURI) falls within
127: * this scope.
128: */
129: protected final boolean innerAccepts(Object o) {
130: return forceAccepts(o)
131: || (((isSeed(o) || focusAccepts(o))
132: || additionalFocusAccepts(o) || transitiveAccepts(o)) && !excludeAccepts(o));
133: }
134:
135: /**
136: * Check if URI is accepted by the additional focus of this scope.
137: *
138: * This method should be overridden in subclasses.
139: *
140: * @param o
141: * the URI to check.
142: * @return True if additional focus filter accepts passed object.
143: */
144: protected boolean additionalFocusAccepts(Object o) {
145: return false;
146: }
147:
148: /**
149: * @param o
150: * the URI to check.
151: * @return True if transitive filter accepts passed object.
152: */
153: protected boolean transitiveAccepts(Object o) {
154: return false;
155: }
156:
157: /**
158: * @param o the URI to check.
159: * @return True if force-accepts filter accepts passed object.
160: */
161: protected boolean forceAccepts(Object o) {
162: return false;
163: }
164:
165: /**
166: * Check if URI is accepted by the focus of this scope.
167: *
168: * This method should be overridden in subclasses.
169: *
170: * @param o
171: * the URI to check.
172: * @return True if focus filter accepts passed object.
173: */
174: protected boolean focusAccepts(Object o) {
175: // The CrawlScope doesn't accept any URIs
176: return false;
177: }
178:
179: /**
180: * Check if URI is excluded by any filters.
181: *
182: * @param o
183: * the URI to check.
184: * @return True if exclude filter accepts passed object.
185: */
186: @SuppressWarnings("deprecation")
187: protected boolean excludeAccepts(Object o) {
188: return (this .excludeFilter.isEmpty(o)) ? exceedsMaxHops(o)
189: : this .excludeFilter.accepts(o) || exceedsMaxHops(o);
190: }
191:
192: /**
193: * Check if there are too many hops
194: *
195: * @param o
196: * URI to check.
197: * @return true if too many hops.
198: */
199: protected boolean exceedsMaxHops(Object o) {
200: if (!(o instanceof CandidateURI)) {
201: return false;
202: }
203:
204: int maxLinkHops = 0;
205: // int maxTransHops = 0;
206:
207: try {
208: maxLinkHops = ((Integer) getAttribute(o, ATTR_MAX_LINK_HOPS))
209: .intValue();
210: // maxTransHops = ((Integer) getAttribute(o, ATTR_MAX_TRANS_HOPS))
211: // .intValue();
212: } catch (AttributeNotFoundException e) {
213: // TODO Auto-generated catch block
214: e.printStackTrace();
215: }
216:
217: CandidateURI cand = (CandidateURI) o;
218:
219: String path = cand.getPathFromSeed();
220: int linkCount = 0;
221: int transCount = 0;
222: for (int i = path.length() - 1; i >= 0; i--) {
223: if (path.charAt(i) == Link.NAVLINK_HOP) {
224: linkCount++;
225: } else if (linkCount == 0) {
226: transCount++;
227: }
228: }
229: // return (linkCount > maxLinkHops) || (transCount > maxTransHops);
230: // base only on links, don't treat trans count as hard max
231: return (linkCount > maxLinkHops);
232: }
233:
234: /**
235: * Take note of a situation (such as settings edit) where involved
236: * reconfiguration (such as reading from external files) may be necessary.
237: */
238: @SuppressWarnings("deprecation")
239: public void kickUpdate() {
240: super.kickUpdate();
241: excludeFilter.kickUpdate();
242: }
243: }
|