01: /* FrontierScheduler
02: *
03: * $Id: FrontierScheduler.java 4671 2006-09-26 23:47:15Z paul_jack $
04: *
05: * Created on June 6, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: *
25: */
26: package org.archive.crawler.postprocessor;
27:
28: import java.util.logging.Level;
29: import java.util.logging.Logger;
30:
31: import org.archive.crawler.datamodel.CandidateURI;
32: import org.archive.crawler.datamodel.CrawlURI;
33: import org.archive.crawler.datamodel.FetchStatusCodes;
34: import org.archive.crawler.framework.Processor;
35:
36: /**
37: * 'Schedule' with the Frontier CandidateURIs being carried by the passed
38: * CrawlURI.
39: * Adds either prerequisites or whatever is in CrawlURI outlinks to the
40: * Frontier. Run a Scoper ahead of this processor so only links that
41: * are in-scope get scheduled.
42: * @author stack
43: */
44: public class FrontierScheduler extends Processor implements
45: FetchStatusCodes {
46:
47: private static final long serialVersionUID = -5178775477602250542L;
48:
49: private static Logger LOGGER = Logger
50: .getLogger(FrontierScheduler.class.getName());
51:
52: /**
53: * @param name Name of this filter.
54: */
55: public FrontierScheduler(String name) {
56: super (
57: name,
58: "FrontierScheduler. 'Schedule' with the Frontier "
59: + "any CandidateURIs carried by the passed CrawlURI. "
60: + "Run a Scoper before this "
61: + "processor so links that are not in-scope get bumped from the "
62: + "list of links (And so those in scope get promoted from Link "
63: + "to CandidateURI).");
64: }
65:
66: protected void innerProcess(final CrawlURI curi) {
67: if (LOGGER.isLoggable(Level.FINEST)) {
68: LOGGER.finest(getName() + " processing " + curi);
69: }
70:
71: // Handle any prerequisites when S_DEFERRED for prereqs
72: if (curi.hasPrerequisiteUri()
73: && curi.getFetchStatus() == S_DEFERRED) {
74: handlePrerequisites(curi);
75: return;
76: }
77:
78: synchronized (this ) {
79: for (CandidateURI cauri : curi.getOutCandidates()) {
80: schedule(cauri);
81: }
82: }
83: }
84:
85: protected void handlePrerequisites(CrawlURI curi) {
86: schedule((CandidateURI) curi.getPrerequisiteUri());
87: }
88:
89: /**
90: * Schedule the given {@link CandidateURI CandidateURI} with the Frontier.
91: * @param caUri The CandidateURI to be scheduled.
92: */
93: protected void schedule(CandidateURI caUri) {
94: getController().getFrontier().schedule(caUri);
95: }
96: }
|