01: package net.matuschek.http;
02:
03: /*********************************************
04: Copyright (c) 2001 by Daniel Matuschek
05: *********************************************/
06:
07: import java.io.IOException;
08: import java.net.URL;
09:
10: /**
11: * An HttpDocManager does something with an HttpDoc.
12: * It is used by the WebRobot to store the retrieved documents.
13: * @see net.matuschek.http.AbstractHttpDocManager
14: *
15: * @author Daniel Matuschek
16: * @version $Id: HttpDocManager.java,v 1.3 2003/02/27 18:40:19 oliver_schmidt Exp $
17: */
18:
19: public interface HttpDocManager {
20:
21: /**
22: * "Processes" a document (without storing it).
23: * Either direct processing or collecting urls and later processing.
24: * Most documents should be stored (for reruns) but not all of them should be
25: * processed (Maybe you only want to process PDF documents).
26: *
27: * @param doc a HttpDoc object to process. This may also be null
28: * @exception DocManagerException will be thrown if an error occurs
29: * while processing the document.
30: */
31: void processDocument(HttpDoc doc) throws DocManagerException;
32:
33: /**
34: * Stores a document. Usually this will store the document somewhere (file
35: * system, database, ...). It is also possible that this will not store the
36: * whole documents, but extract information from it and process this
37: * information.
38: * Most documents should be stored (for reruns) but not all of them should be
39: * processed (Maybe you only want to process PDF documents).
40: *
41: * @param doc a HttpDoc object to store. This may also be null
42: * @exception DocManagerException will be thrown if an error occurs
43: * while storing the document.
44: */
45: void storeDocument(HttpDoc doc) throws DocManagerException;
46:
47: /**
48: * Removes a document from cache
49: *
50: * @param doc a HttpDoc object to store. This may also be null
51: * @exception DocManagerException will be thrown if an error occurs
52: * while storing the document.
53: */
54: public void removeDocument(URL url);
55:
56: /**
57: * Returns URL of a stored document with the same content or null.
58: *
59: * @param doc
60: * @return URL of duplicate document as String or null
61: * @throws IOException
62: */
63: public String findDuplicate(HttpDoc doc) throws IOException;
64:
65: /**
66: * If a HttpDocManager stores the complete HttpDocs, it is possible
67: * to use it as a cache. Using this method it is possible to access the cached
68: * objects. If a HttpDocManager can't be used as a cache, it should always
69: * return null.
70: *
71: * @return a cached HttpDoc for this URL or null
72: */
73: HttpDoc retrieveFromCache(URL u);
74:
75: /**
76: * Should be called if the instance is not used any more.
77: * Some resources might need to be released.
78: */
79: public void finish();
80:
81: }
|