001: /* LinksScoper
002: *
003: * $Id: LinksScoper.java 4911 2007-02-18 19:55:55Z gojomo $
004: *
005: * Created on Oct 2, 2003
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: *
025: */
026: package org.archive.crawler.postprocessor;
027:
028: import java.util.Collection;
029: import java.util.HashSet;
030: import java.util.Iterator;
031: import java.util.logging.Level;
032: import java.util.logging.Logger;
033:
034: import javax.management.AttributeNotFoundException;
035:
036: import org.apache.commons.httpclient.URIException;
037: import org.archive.crawler.datamodel.CandidateURI;
038: import org.archive.crawler.datamodel.CrawlURI;
039: import org.archive.crawler.datamodel.FetchStatusCodes;
040: import org.archive.crawler.deciderules.DecideRule;
041: import org.archive.crawler.deciderules.DecideRuleSequence;
042: import org.archive.crawler.extractor.Link;
043: import org.archive.crawler.framework.Filter;
044: import org.archive.crawler.framework.Scoper;
045: import org.archive.crawler.settings.MapType;
046: import org.archive.crawler.settings.SimpleType;
047: import org.archive.crawler.settings.Type;
048:
049: /**
050: * Determine which extracted links are within scope.
051: * TODO: To test scope, requires that Link be converted to
052: * a CandidateURI. Make it so don't have to make a CandidateURI to test
053: * if Link is in scope.
054: * <p>Since this scoper has to create CandidateURIs, no sense
055: * discarding them since later in the processing chain CandidateURIs rather
056: * than Links are whats needed scheduling extracted links w/ the
057: * Frontier (Frontier#schedule expects CandidateURI, not Link). This class
058: * replaces Links w/ the CandidateURI that wraps the Link in the CrawlURI.
059: *
060: * @author gojomo
061: * @author stack
062: */
063: public class LinksScoper extends Scoper implements FetchStatusCodes {
064:
065: private static final long serialVersionUID = -4074442117992496793L;
066:
067: private static Logger LOGGER = Logger.getLogger(LinksScoper.class
068: .getName());
069:
070: private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS = "seed-redirects-new-seed";
071:
072: private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS = new Boolean(
073: true);
074:
075: public static final String ATTR_REJECTLOG_DECIDE_RULES = "scope-rejected-url-rules";
076:
077: public static final String ATTR_PREFERENCE_DEPTH_HOPS = "preference-depth-hops";
078:
079: private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS = new Integer(
080: -1);
081:
082: /**
083: * Instance of rejected uris log filters.
084: */
085: private MapType rejectLogFilters = null;
086:
087: /**
088: * @param name Name of this filter.
089: */
090: public LinksScoper(String name) {
091: super (name, "LinksScoper. Rules on which extracted links "
092: + "are within configured scope.");
093:
094: Type t;
095: t = addElementToDefinition(new SimpleType(
096: ATTR_SEED_REDIRECTS_NEW_SEEDS,
097: "If enabled, any URL found because a seed redirected to it "
098: + "(original seed returned 301 or 302), will also be treated "
099: + "as a seed.",
100: DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
101: t.setExpertSetting(true);
102:
103: t = addElementToDefinition(new SimpleType(
104: ATTR_PREFERENCE_DEPTH_HOPS,
105: "Number of hops (of any sort) from a seed up to which a URI has higher "
106: + "priority scheduling than any remaining seed. For example, if set to 1 items one "
107: + "hop (link, embed, redirect, etc.) away from a seed will be scheduled "
108: + "with HIGH priority. If set to -1, no "
109: + "preferencing will occur, and a breadth-first search with seeds "
110: + "processed before discovered links will proceed. If set to zero, a "
111: + "purely depth-first search will proceed, with all discovered links processed "
112: + "before remaining seeds. Seed redirects are treated as one hop from a seed.",
113: DEFAULT_PREFERENCE_DEPTH_HOPS));
114: t.setExpertSetting(true);
115:
116: addElementToDefinition(new DecideRuleSequence(
117: ATTR_REJECTLOG_DECIDE_RULES,
118: "DecideRules which, if their final decision on a link is "
119: + "not REJECT, cause the otherwise scope-rejected links to "
120: + "be logged"));
121:
122: }
123:
124: protected void innerProcess(final CrawlURI curi) {
125: if (LOGGER.isLoggable(Level.FINEST)) {
126: LOGGER.finest(getName() + " processing " + curi);
127: }
128:
129: // If prerequisites, nothing to be done in here.
130: if (curi.hasPrerequisiteUri()) {
131: handlePrerequisite(curi);
132: return;
133: }
134:
135: // Don't extract links of error pages.
136: if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
137: curi.clearOutlinks();
138: return;
139: }
140:
141: if (curi.outlinksSize() <= 0) {
142: // No outlinks to process.
143: return;
144: }
145:
146: final boolean redirectsNewSeeds = ((Boolean) getUncheckedAttribute(
147: curi, ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
148: int preferenceDepthHops = ((Integer) getUncheckedAttribute(
149: curi, ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
150: Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
151: for (final Iterator i = curi.getOutObjects().iterator(); i
152: .hasNext();) {
153: Object o = i.next();
154: if (o instanceof Link) {
155: final Link wref = (Link) o;
156: try {
157: final int directive = getSchedulingFor(curi, wref,
158: preferenceDepthHops);
159: final CandidateURI caURI = curi.createCandidateURI(
160: curi.getBaseURI(), wref, directive,
161: considerAsSeed(curi, wref,
162: redirectsNewSeeds));
163: if (isInScope(caURI)) {
164: inScopeLinks.add(caURI);
165: }
166: } catch (URIException e) {
167: getController().logUriError(e, curi.getUURI(),
168: wref.getDestination().toString());
169: }
170: } else if (o instanceof CandidateURI) {
171: CandidateURI caURI = (CandidateURI) o;
172: if (isInScope(caURI)) {
173: inScopeLinks.add(caURI);
174: }
175: } else {
176: LOGGER.severe("Unexpected type: " + o);
177: }
178: }
179: // Replace current links collection w/ inscopeLinks. May be
180: // an empty collection.
181: curi.replaceOutlinks(inScopeLinks);
182: }
183:
184: /**
185: * The CrawlURI has a prerequisite; apply scoping and update
186: * Link to CandidateURI in manner analogous to outlink handling.
187: * @param curi CrawlURI with prereq to consider
188: */
189: protected void handlePrerequisite(CrawlURI curi) {
190: try {
191: // Create prerequisite CandidateURI
192: CandidateURI caUri = curi.createCandidateURI(curi
193: .getBaseURI(), (Link) curi.getPrerequisiteUri());
194: int prereqPriority = curi.getSchedulingDirective() - 1;
195: if (prereqPriority < 0) {
196: prereqPriority = 0;
197: LOGGER.severe("Unable to promote prerequisite " + caUri
198: + " above " + curi);
199: }
200: caUri.setSchedulingDirective(prereqPriority);
201: caUri.setForceFetch(true);
202: if (isInScope(caUri)) {
203: // replace link with CandidateURI
204: curi.setPrerequisiteUri(caUri);
205: } else {
206: // prerequisite is out-of-scope; mark CrawlURI as error,
207: // preventinting normal S_DEFERRED handling
208: curi
209: .setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
210: }
211: } catch (URIException ex) {
212: Object[] array = { curi, curi.getPrerequisiteUri() };
213: getController().uriErrors.log(Level.INFO, ex.getMessage(),
214: array);
215: } catch (NumberFormatException e) {
216: // UURI.createUURI will occasionally throw this error.
217: Object[] array = { curi, curi.getPrerequisiteUri() };
218: getController().uriErrors.log(Level.INFO, e.getMessage(),
219: array);
220: }
221: }
222:
223: protected void outOfScope(CandidateURI caUri) {
224: super .outOfScope(caUri);
225: if (!LOGGER.isLoggable(Level.INFO)) {
226: return;
227: }
228: // TODO: Fix filters so work on CandidateURI.
229: CrawlURI curi = (caUri instanceof CrawlURI) ? (CrawlURI) caUri
230: : new CrawlURI(caUri.getUURI());
231: if (rulesAccept(getRejectLogRules(curi), curi)) {
232: LOGGER.info(curi.getUURI().toString());
233: }
234: }
235:
236: protected DecideRule getRejectLogRules(Object o) {
237: try {
238: return (DecideRule) getAttribute(o,
239: ATTR_REJECTLOG_DECIDE_RULES);
240: } catch (AttributeNotFoundException e) {
241: throw new RuntimeException(e);
242: }
243: }
244:
245: private boolean considerAsSeed(final CrawlURI curi,
246: final Link wref, final boolean redirectsNewSeeds) {
247: // Check if this is a seed with a 301 or 302.
248: if (curi.isSeed()
249: && (curi.getFetchStatus() == 301 || curi
250: .getFetchStatus() == 302)
251: && wref.getHopType() == Link.REFER_HOP) {
252: // Check if redirects from seeds should be treated as seeds.
253: if (redirectsNewSeeds) {
254: return true;
255: }
256: }
257: return false;
258: }
259:
260: /**
261: * Determine scheduling for the <code>curi</code>.
262: * As with the LinksScoper in general, this only handles extracted links,
263: * seeds do not pass through here, but are given MEDIUM priority.
264: * Imports into the frontier similarly do not pass through here,
265: * but are given NORMAL priority.
266: */
267: protected int getSchedulingFor(final CrawlURI curi,
268: final Link wref, final int preferenceDepthHops) {
269: final char c = wref.getHopType();
270: if (LOGGER.isLoggable(Level.FINEST)) {
271: LOGGER.finest(curi + " with path=" + curi.getPathFromSeed()
272: + " isSeed=" + curi.isSeed() + " with fetchStatus="
273: + curi.getFetchStatus() + " -> "
274: + wref.getDestination() + " type " + c
275: + " with context=" + wref.getContext());
276: }
277:
278: switch (c) {
279: case Link.REFER_HOP:
280: // Treat redirects somewhat urgently
281: // This also ensures seed redirects remain seed priority
282: return (preferenceDepthHops >= 0 ? CandidateURI.HIGH
283: : CandidateURI.MEDIUM);
284: default:
285: if (preferenceDepthHops == 0)
286: return CandidateURI.HIGH;
287: // this implies seed redirects are treated as path
288: // length 1, which I belive is standard.
289: // curi.getPathFromSeed() can never be null here, because
290: // we're processing a link extracted from curi
291: if (preferenceDepthHops > 0
292: && curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
293: return CandidateURI.HIGH;
294: // Everything else normal (at least for now)
295: return CandidateURI.NORMAL;
296: }
297: }
298: }
|