001: /* UriUniqFilter
002: *
003: * Created on Apr 17, 2003
004: *
005: * Copyright (C) 2003 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.datamodel;
024:
025: import java.io.File;
026:
027: /**
028: * A UriUniqFilter passes URI objects to a destination
029: * (receiver) if the passed URI object has not been previously seen.
030: *
031: * If already seen, the passed URI object is dropped.
032: *
033: * <p>For efficiency in comparison against a large history of
034: * seen URIs, URI objects may not be passed immediately, unless
035: * the addNow() is used or a flush() is forced.
036: *
037: * @author gojomo
038: * @version $Date: 2005-12-16 03:10:54 +0000 (Fri, 16 Dec 2005) $, $Revision: 4036 $
039: */
040: public interface UriUniqFilter {
041: /**
042: * @return Count of already seen URIs.
043: */
044: public long count();
045:
046: /**
047: * Count of items added, but not yet filtered in or out.
048: *
049: * Some implementations may buffer up large numbers of pending
050: * items to be evaluated in a later large batch/scan/merge with
051: * disk files.
052: *
053: * @return Count of items added not yet evaluated
054: */
055: public long pending();
056:
057: /**
058: * Receiver of uniq URIs.
059: *
060: * Items that have not been seen before are pass through to this object.
061: * @param receiver Object that will be passed items. Must implement
062: * HasUriReceiver interface.
063: */
064: public void setDestination(HasUriReceiver receiver);
065:
066: /**
067: * Add given uri, if not already present.
068: * @param key Usually a canonicalized version of <code>value</code>.
069: * This is the key used doing lookups, forgets and insertions on the
070: * already included list.
071: * @param value item to add.
072: */
073: public void add(String key, CandidateURI value);
074:
075: /**
076: * Immediately add uri.
077: * @param key Usually a canonicalized version of <code>uri</code>.
078: * This is the key used doing lookups, forgets and insertions on the
079: * already included list.
080: * @param value item to add.
081: */
082: public void addNow(String key, CandidateURI value);
083:
084: /**
085: * Add given uri, all the way through to underlying destination, even
086: * if already present.
087: *
088: * (Sometimes a URI must be fetched, or refetched, for example when
089: * DNS or robots info expires or the operator forces a refetch. A
090: * normal add() or addNow() would drop the URI without forwarding
091: * on once it is determmined to already be in the filter.)
092: *
093: * @param key Usually a canonicalized version of <code>uri</code>.
094: * This is the key used doing lookups, forgets and insertions on the
095: * already included list.
096: * @param value item to add.
097: */
098: public void addForce(String key, CandidateURI value);
099:
100: /**
101: * Note item as seen, without passing through to receiver.
102: * @param key Usually a canonicalized version of an <code>URI</code>.
103: * This is the key used doing lookups, forgets and insertions on the
104: * already included list.
105: */
106: public void note(String key);
107:
108: /**
109: * Forget item was seen
110: * @param key Usually a canonicalized version of an <code>URI</code>.
111: * This is the key used doing lookups, forgets and insertions on the
112: * already included list.
113: * @param value item to add.
114: */
115: public void forget(String key, CandidateURI value);
116:
117: /**
118: * Request that any pending items be added/dropped. Implementors
119: * may ignore the request if a flush would be too expensive/too
120: * soon.
121: *
122: * @return Number added.
123: */
124: public long requestFlush();
125:
126: /**
127: * Close down any allocated resources.
128: * Makes sense calling this when checkpointing.
129: */
130: public void close();
131:
132: /**
133: * Set a File to receive a log for replay profiling.
134: */
135: public void setProfileLog(File logfile);
136:
137: /**
138: * URIs that have not been seen before 'visit' this 'Visitor'.
139: *
140: * Usually implementations of Frontier implement this interface.
141: * @author gojomo
142: */
143: public interface HasUriReceiver {
144: /**
145: * @param item Candidate uri tem that is 'visiting'.
146: */
147: public void receive(CandidateURI item);
148: }
149: }
|