001: /* BloomUriUniqFilterTest
002: *
003: * $Id: BloomUriUniqFilterTest.java 4647 2006-09-22 18:39:39Z paul_jack $
004: *
005: * Created on Sep 15, 2004.
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.util;
026:
027: import java.io.FileNotFoundException;
028: import java.io.IOException;
029: import java.util.ArrayList;
030: import java.util.Iterator;
031: import java.util.logging.Logger;
032:
033: import junit.framework.TestCase;
034:
035: import org.apache.commons.httpclient.URIException;
036: import org.archive.crawler.datamodel.CandidateURI;
037: import org.archive.crawler.datamodel.UriUniqFilter;
038: import org.archive.net.UURI;
039: import org.archive.net.UURIFactory;
040:
041: /**
042: * Test BloomUriUniqFilter.
043: * @author gojomo
044: */
045: public class BloomUriUniqFilterTest extends TestCase implements
046: UriUniqFilter.HasUriReceiver {
047: private Logger logger = Logger
048: .getLogger(BloomUriUniqFilterTest.class.getName());
049:
050: private BloomUriUniqFilter filter = null;
051:
052: /**
053: * Set to true if we visited received.
054: */
055: private boolean received = false;
056:
057: protected void setUp() throws Exception {
058: super .setUp();
059: this .filter = new BloomUriUniqFilter(2000, 24);
060: this .filter.setDestination(this );
061: }
062:
063: public void testAdding() throws URIException {
064: this .filter.add(this .getUri(), new CandidateURI(UURIFactory
065: .getInstance(this .getUri())));
066: this .filter.addNow(this .getUri(), new CandidateURI(UURIFactory
067: .getInstance(this .getUri())));
068: this .filter.addForce(this .getUri(), new CandidateURI(
069: UURIFactory.getInstance(this .getUri())));
070: // Should only have add 'this' once.
071: assertTrue("Count is off", this .filter.count() == 1);
072: }
073:
074: /**
075: * Test inserting.
076: * @throws URIException
077: * @throws IOException
078: * @throws FileNotFoundException
079: */
080: public void testWriting() throws URIException {
081: long start = System.currentTimeMillis();
082: ArrayList<UURI> list = new ArrayList<UURI>(1000);
083: int count = 0;
084: final int MAX_COUNT = 1000;
085: for (; count < MAX_COUNT; count++) {
086: assertEquals("count off", count, filter.count());
087: UURI u = UURIFactory.getInstance("http://www" + count
088: + ".archive.org/" + count + "/index.html");
089: assertFalse("already contained " + u.toString(),
090: filter.bloom.contains(u.toString()));
091: logger.fine("adding " + u.toString());
092: filter.add(u.toString(), new CandidateURI(u));
093: assertTrue("not in bloom", filter.bloom.contains(u
094: .toString()));
095: if (count > 0 && ((count % 100) == 0)) {
096: list.add(u);
097: }
098: }
099: logger.fine("Added " + count + " in "
100: + (System.currentTimeMillis() - start));
101:
102: start = System.currentTimeMillis();
103: for (Iterator i = list.iterator(); i.hasNext();) {
104: UURI uuri = (UURI) i.next();
105: filter.add(uuri.toString(), new CandidateURI(uuri));
106: }
107: logger.fine("Readded subset " + list.size() + " in "
108: + (System.currentTimeMillis() - start));
109:
110: assertTrue("Count is off: " + filter.count(),
111: filter.count() == MAX_COUNT);
112: }
113:
114: public void testNote() {
115: filter.note(this .getUri());
116: assertFalse("Receiver was called", this .received);
117: }
118:
119: // FORGET CURRENTLY UNSUPPORTED IN BloomUriUniqFilter
120: // public void testForget() throws URIException {
121: // this.filter.forget(this.getUri(),
122: // new CandidateURI(UURIFactory.getInstance(this.getUri())));
123: // assertTrue("Didn't forget", this.filter.count() == 0);
124: // }
125:
126: public void receive(CandidateURI item) {
127: this .received = true;
128: }
129:
130: public String getUri() {
131: return "http://www.archive.org";
132: }
133: }
|