01: /* Copyright (C) 2003 Internet Archive.
02: *
03: * This file is part of the Heritrix web crawler (crawler.archive.org).
04: *
05: * Heritrix is free software; you can redistribute it and/or modify
06: * it under the terms of the GNU Lesser Public License as published by
07: * the Free Software Foundation; either version 2.1 of the License, or
08: * any later version.
09: *
10: * Heritrix is distributed in the hope that it will be useful,
11: * but WITHOUT ANY WARRANTY; without even the implied warranty of
12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13: * GNU Lesser Public License for more details.
14: *
15: * You should have received a copy of the GNU Lesser Public License
16: * along with Heritrix; if not, write to the Free Software
17: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18: */
19: package org.archive.crawler.event;
20:
21: import org.archive.crawler.datamodel.CrawlURI;
22:
23: /**
24: * An interface for objects that want to be notified
25: * of a CrawlURI disposition (happens each time a
26: * curi has been through the processors).
27: * Classes implementing this interface can register with
28: * the CrawlController to receive these events.
29: * <p>
30: * This interface is to facilitate the gathering of
31: * statistics on a running crawl.
32: * <p>
33: * <b>WARNING:</b> One of these methods <i>will</i> be
34: * called for <b>each</b> CrawlURI that is processed.
35: * It is therefor imperative that the methods execute
36: * quickly!
37: * <p>
38: * Also note that the object implementing this interface
39: * must under <b>no circumstances</b> maintain a reference
40: * to the CrawlURI beyond the scope of the relevant method
41: * body!
42: *
43: * @author Kristinn Sigurdsson
44: *
45: * @see org.archive.crawler.framework.CrawlController
46: */
47: public interface CrawlURIDispositionListener {
48: /**
49: * Notification of a successfully crawled URI
50: *
51: * @param curi The relevant CrawlURI
52: */
53: public void crawledURISuccessful(CrawlURI curi);
54:
55: /**
56: * Notification of a failed crawl of a URI that
57: * will be retried (failure due to possible transient
58: * problems).
59: *
60: * @param curi The relevant CrawlURI
61: */
62: public void crawledURINeedRetry(CrawlURI curi);
63:
64: /**
65: * Notification of a crawled URI that is to be disregarded.
66: * Usually this means that the robots.txt file for the
67: * relevant site forbids this from being crawled and we are
68: * therefor not going to keep it. Other reasons may apply.
69: * In all cases this means that it <i>was</i> successfully
70: * downloaded but will not be stored.
71: *
72: * @param curi The relevant CrawlURI
73: */
74: public void crawledURIDisregard(CrawlURI curi);
75:
76: /**
77: * Notification of a failed crawling of a URI. The failure
78: * is of a type that precludes retries (either by it's very
79: * nature or because it has been retried to many times)
80: *
81: * @param curi The relevant CrawlURI
82: */
83: public void crawledURIFailure(CrawlURI curi);
84:
85: }
|