001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * TransclusionFilter.java
020: * Created on Oct 3, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.filter;
025:
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecidingFilter;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.SimpleType;
036:
037: /**
038: * Filter which accepts CandidateURI/CrawlURI instances which contain more
039: * than zero but fewer than max-trans-hops entries at the end of their
040: * discovery path.
041: *
042: * @author Gordon Mohr
043: * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
044: * equivalent {@link DecideRule}.
045: */
046: public class TransclusionFilter extends Filter {
047:
048: private static final long serialVersionUID = 4251767672778714051L;
049:
050: private static final String ATTR_MAX_SPECULATIVE_HOPS = "max-speculative-hops";
051: private static final String ATTR_MAX_REFERRAL_HOPS = "max-referral-hops";
052: private static final String ATTR_MAX_EMBED_HOPS = "max-embed-hops";
053: private static final int DEFAULT_MAX_TRANS_HOPS = 4;
054:
055: /**
056: * Default speculative hops.
057: *
058: * No more than 1
059: */
060: private static final int DEFAULT_MAX_SPECULATIVE_HOPS = 1;
061:
062: /**
063: * Default maximum referral hops.
064: *
065: * No limit beside the overall trans limit
066: */
067: private static final int DEFAULT_MAX_REFERRAL_HOPS = -1;
068:
069: /**
070: * Default embedded link hops.
071: *
072: * No limit beside the overall trans limit
073: */
074: private static final int DEFAULT_MAX_EMBED_HOPS = -1;
075:
076: int maxTransHops = DEFAULT_MAX_TRANS_HOPS;
077: int maxSpeculativeHops = DEFAULT_MAX_SPECULATIVE_HOPS;
078: int maxReferralHops = DEFAULT_MAX_REFERRAL_HOPS;
079: int maxEmbedHops = DEFAULT_MAX_EMBED_HOPS;
080:
081: // // 1-3 trailing P(recondition)/R(eferral)/E(mbed)/X(speculative-embed) hops
082: // private static final String TRANSCLUSION_PATH = ".*[PREX][PREX]?[PREX]?$";
083:
084: /**
085: * @param name
086: */
087: public TransclusionFilter(String name) {
088: super (name, "Transclusion filter *Deprecated* Use"
089: + "DecidingFilter and equivalent DecideRule instead.");
090:
091: addElementToDefinition(new SimpleType(
092: ATTR_MAX_SPECULATIVE_HOPS,
093: "Maximum number of consecutive speculative (i.e. URIs"
094: + " extracted that we are not sure if they are embeds or"
095: + " not) hops to allow.\nA value of -1 means no upper limit.",
096: new Integer(DEFAULT_MAX_SPECULATIVE_HOPS)));
097: addElementToDefinition(new SimpleType(ATTR_MAX_REFERRAL_HOPS,
098: "Maximum number of consecutive referral hops to allow.\n"
099: + "A value of -1 means no upper limit.",
100: new Integer(DEFAULT_MAX_REFERRAL_HOPS)));
101: addElementToDefinition(new SimpleType(ATTR_MAX_EMBED_HOPS,
102: "Maximum number of consecutive embed hops to allow.\n"
103: + "A value of -1 means no upper limit.",
104: new Integer(DEFAULT_MAX_EMBED_HOPS)));
105: }
106:
107: /* (non-Javadoc)
108: * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
109: */
110: protected boolean innerAccepts(Object o) {
111: if (!(o instanceof CandidateURI)) {
112: return false;
113: }
114: String path = ((CandidateURI) o).getPathFromSeed();
115: int transCount = 0;
116: int specCount = 0;
117: int refCount = 0;
118: int embedCount = 0;
119: loop: for (int i = path.length() - 1; i >= 0; i--) {
120: // everything except 'L' is considered transitive
121: switch (path.charAt(i)) {
122: case Link.NAVLINK_HOP: {
123: break loop;
124: }
125: case Link.PREREQ_HOP: {
126: if (transCount == 0) {
127: // always consider a trailing P as a 1-hop trans inclusion; disregard previous hops
128: transCount++;
129: break loop;
130: }
131: // otherwise, just count as another regular trans hop
132: break;
133: }
134: case Link.SPECULATIVE_HOP: {
135: specCount++;
136: break;
137: }
138: case Link.REFER_HOP: {
139: refCount++;
140: break;
141: }
142: case Link.EMBED_HOP: {
143: embedCount++;
144: break;
145: }
146: // FIXME: what is 'D'?
147: // 'D's get a free pass
148: }
149: transCount++;
150: }
151:
152: readMaxValues(o);
153:
154: // This is a case of possible transclusion
155: return (transCount > 0)
156: // ...and the overall number of hops isn't too high
157: && (transCount <= this .maxTransHops)
158: // ...and the number of spec-hops isn't too high
159: && (this .maxSpeculativeHops < 0 || specCount <= this .maxSpeculativeHops)
160: // ...and the number of referral-hops isn't too high
161: && (this .maxReferralHops < 0 || refCount <= this .maxReferralHops)
162: // ...and the number of embed-hops isn't too high
163: && (this .maxEmbedHops < 0 || embedCount <= this .maxEmbedHops);
164: }
165:
166: public void readMaxValues(Object o) {
167: try {
168: CrawlScope scope = (CrawlScope) globalSettings().getModule(
169: CrawlScope.ATTR_NAME);
170: this .maxTransHops = ((Integer) scope.getAttribute(o,
171: ClassicScope.ATTR_MAX_TRANS_HOPS)).intValue();
172: this .maxSpeculativeHops = ((Integer) getAttribute(o,
173: ATTR_MAX_SPECULATIVE_HOPS)).intValue();
174: this .maxReferralHops = ((Integer) getAttribute(o,
175: ATTR_MAX_REFERRAL_HOPS)).intValue();
176: this .maxEmbedHops = ((Integer) getAttribute(o,
177: ATTR_MAX_EMBED_HOPS)).intValue();
178: } catch (AttributeNotFoundException e) {
179: // TODO Auto-generated catch block
180: e.printStackTrace();
181: }
182: }
183:
184: }
|