01: /* LinkExtractor
02: *
03: * $Id: LinkExtractor.java 3704 2005-07-18 17:30:21Z stack-sf $
04: *
05: * Created on Mar 16, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.extractor;
26:
27: import java.io.InputStream;
28: import java.nio.charset.Charset;
29: import java.util.Iterator;
30:
31: import org.archive.crawler.extractor.Link;
32: import org.archive.net.UURI;
33:
34: /**
35: * LinkExtractor is a general interface for classes which, when given an
36: * InputStream and Charset, can scan for Links and return them via
37: * an Iterator interface.
38: *
39: * Implementors may in fact complete all extraction on the first
40: * hasNext(), then trickle Links out from an internal collection,
41: * depending on whether the link-extraction technique used is amenable
42: * to incremental scanning.
43: *
44: * ROUGH DRAFT IN PROGRESS / incomplete... untested...
45: *
46: * @author gojomo
47: */
48: public interface LinkExtractor extends Iterator {
49: /**
50: * Setup the LinkExtractor to operate on the given stream and charset,
51: * considering the given contextURI as the initial 'base' URI for
52: * resolving relative URIs.
53: *
54: * May be called to 'reset' a LinkExtractor to start with new input.
55: *
56: * @param source source URI
57: * @param base base URI (usually the source URI) for URI derelativizing
58: * @param content input stream of content to scan for links
59: * @param charset Charset to consult to decode stream to characters
60: * @param listener ExtractErrorListener to notify, rather than raising
61: * exception through extraction loop
62: */
63: public void setup(UURI source, UURI base, InputStream content,
64: Charset charset, ExtractErrorListener listener);
65:
66: /**
67: * Convenience version of above for common case where source and base are
68: * same.
69: *
70: * @param sourceandbase URI to use as source and base for derelativizing
71: * @param content input stream of content to scan for links
72: * @param charset Charset to consult to decode stream to characters
73: * @param listener ExtractErrorListener to notify, rather than raising
74: * exception through extraction loop
75: */
76: public void setup(UURI sourceandbase, InputStream content,
77: Charset charset, ExtractErrorListener listener);
78:
79: /**
80: * Alternative to Iterator.next() which returns type Link.
81: * @return a discovered Link
82: */
83: public Link nextLink();
84:
85: /**
86: * Discard all state and release any used resources.
87: */
88: public void reset();
89: }
|