001: /* BloomUriUniqFilter
002: *
003: * $Id: BloomUriUniqFilter.java 4647 2006-09-22 18:39:39Z paul_jack $
004: *
005: * Created on June 21, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.util;
026:
027: import java.io.Serializable;
028: import java.util.logging.Logger;
029:
030: import org.archive.crawler.datamodel.CandidateURI;
031: import org.archive.util.BloomFilter;
032: import org.archive.util.BloomFilter32bitSplit;
033:
034: /**
035: * A MG4J BloomFilter-based implementation of an AlreadySeen list.
036: *
037: * This implementation performs adequately without blowing out
038: * the heap through to very large numbers of URIs. See
039: * <a href="http://crawler.archive.org/cgi-bin/wiki.pl?AlreadySeen">AlreadySeen</a>.
040: *
041: * It is inherent to Bloom filters that as they get 'saturated', their
042: * false-positive rate rises. The default parameters used by this class
043: * attempt to maintain a 1-in-4 million (1 in 2^22) false-positive chance
044: * through 125 million unique inserts, which creates a filter structure
045: * about 495MB in size.
046: *
047: * You may use the following system properties to tune the size and
048: * false-positive rate of the bloom filter structure used by this class:
049: *
050: * org.archive.crawler.util.BloomUriUniqFilter.expected-size (default 125000000)
051: * org.archive.crawler.util.BloomUriUniqFilter.hash-count (default 22)
052: *
053: * The resulting filter will take up approximately...
054: *
055: * 1.44 * expected-size * hash-count / 8
056: *
057: * ...bytes.
058: *
059: * The default size is very close to the maximum practical size of the
060: * Bloom filter implementation, BloomFilter32bitSplit, created in the
061: * initialize() method, due to integer arithmetic limits.
062: *
063: * If you need a larger filter, you should edit the initialize
064: * method to intantiate a BloomFilter64bit instead.
065: *
066: * @author gojomo
067: * @version $Date: 2006-09-22 18:39:39 +0000 (Fri, 22 Sep 2006) $, $Revision: 4647 $
068: */
069: public class BloomUriUniqFilter extends SetBasedUriUniqFilter implements
070: Serializable {
071: private static final long serialVersionUID = 1061526253773091309L;
072:
073: private static Logger LOGGER = Logger
074: .getLogger(BloomUriUniqFilter.class.getName());
075:
076: BloomFilter bloom; // package access for testing convenience
077: protected int expected_n; // remember bloom contruction param
078:
079: protected static final String EXPECTED_SIZE_KEY = ".expected-size";
080: protected static final String HASH_COUNT_KEY = ".hash-count";
081:
082: // these defaults create a bloom filter that is
083: // 1.44*125mil*22/8 ~= 495MB in size, and at full
084: // capacity will give a false contained indication
085: // 1/(2^22) ~= 1 in every 4 million probes
086: private static final int DEFAULT_EXPECTED_SIZE = 125000000; // 125 million
087: private static final int DEFAULT_HASH_COUNT = 22; // 1 in 4 million false pos
088:
089: /**
090: * Default constructor
091: */
092: public BloomUriUniqFilter() {
093: super ();
094: String ns = System.getProperty(this .getClass().getName()
095: + EXPECTED_SIZE_KEY);
096: int n = (ns == null) ? DEFAULT_EXPECTED_SIZE : Integer
097: .parseInt(ns);
098: String ds = System.getProperty(this .getClass().getName()
099: + HASH_COUNT_KEY);
100: int d = (ds == null) ? DEFAULT_HASH_COUNT : Integer
101: .parseInt(ds);
102: initialize(n, d);
103: }
104:
105: /**
106: * Constructor.
107: *
108: * @param n the expected number of elements.
109: * @param d the number of hash functions; if the filter adds not more
110: * than <code>n</code> elements, false positives will happen with
111: * probability 2<sup>-<var>d</var></sup>.
112: */
113: public BloomUriUniqFilter(final int n, final int d) {
114: super ();
115: initialize(n, d);
116: }
117:
118: /**
119: * Initializer shared by constructors.
120: *
121: * @param n the expected number of elements.
122: * @param d the number of hash functions; if the filter adds not more
123: * than <code>n</code> elements, false positives will happen with
124: * probability 2<sup>-<var>d</var></sup>.
125: */
126: protected void initialize(final int n, final int d) {
127: this .expected_n = n;
128: bloom = new BloomFilter32bitSplit(n, d);
129: }
130:
131: public void forget(String canonical, CandidateURI item) {
132: // TODO? could use in-memory exception list of currently-forgotten items
133: LOGGER.severe("forget(\"" + canonical
134: + "\",CandidateURI) not supported");
135: }
136:
137: protected boolean setAdd(CharSequence uri) {
138: boolean added = bloom.add(uri);
139: // warn if bloom has reached its expected size (and its false-pos
140: // rate will now exceed the theoretical/designed level)
141: if (added && (count() == expected_n)) {
142: LOGGER.warning("Bloom has reached expected limit "
143: + expected_n);
144: }
145: return added;
146: }
147:
148: protected long setCount() {
149: return bloom.size();
150: }
151:
152: protected boolean setRemove(CharSequence uri) {
153: throw new UnsupportedOperationException();
154: }
155: }
|