001: /* BenchmarkUriUniqFilters
002: *
003: * $Id: BenchmarkUriUniqFilters.java 4647 2006-09-22 18:39:39Z paul_jack $
004: *
005: * Created on Jun 22, 2005.
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.util;
026:
027: import java.io.BufferedReader;
028: import java.io.BufferedWriter;
029: import java.io.File;
030: import java.io.FileReader;
031: import java.io.FileWriter;
032: import java.io.IOException;
033:
034: import org.archive.crawler.datamodel.CandidateURI;
035: import org.archive.crawler.datamodel.UriUniqFilter;
036: import org.archive.util.fingerprint.MemLongFPSet;
037:
038: /**
039: * BenchmarkUriUniqFilters
040: *
041: * @author gojomo
042: */
043: public class BenchmarkUriUniqFilters implements
044: UriUniqFilter.HasUriReceiver {
045: // private Logger LOGGER =
046: // Logger.getLogger(BenchmarkUriUniqFilters.class.getName());
047:
048: private BufferedWriter out; // optional to dump uniq items
049: String current; // current line/URI being checked
050:
051: /**
052: * Test the UriUniqFilter implementation (MemUriUniqFilter,
053: * BloomUriUniqFilter, or BdbUriUniqFilter) named in first
054: * argument against the file of one-per-line URIs named
055: * in the second argument.
056: *
057: * @param args from cmd-line
058: * @throws IOException
059: */
060: public static void main(String[] args) throws IOException {
061: (new BenchmarkUriUniqFilters()).instanceMain(args);
062: }
063:
064: public void instanceMain(String[] args) throws IOException {
065: String testClass = args[0];
066: String inputFilename = args[1];
067: long start = System.currentTimeMillis();
068: UriUniqFilter uniq = createUriUniqFilter(testClass);
069: long created = System.currentTimeMillis();
070: BufferedReader br = new BufferedReader(new FileReader(
071: inputFilename));
072: if (args.length > 2) {
073: String outputFilename = args[2];
074: out = new BufferedWriter(new FileWriter(outputFilename));
075: }
076: int added = 0;
077: while ((current = br.readLine()) != null) {
078: added++;
079: uniq.add(current, null);
080: }
081: uniq.close();
082: long finished = System.currentTimeMillis();
083: if (out != null) {
084: out.close();
085: }
086: System.out.println(added + " adds");
087: System.out.println(uniq.count() + " retained");
088: System.out.println((created - start) + "ms to setup UUF");
089: System.out.println((finished - created)
090: + "ms to perform all adds");
091: }
092:
093: private UriUniqFilter createUriUniqFilter(String testClass)
094: throws IOException {
095: UriUniqFilter uniq = null;
096: if (BdbUriUniqFilter.class.getName().endsWith(testClass)) {
097: ;
098: // BDB setup
099: File tmpDir = File.createTempFile("uuf", "benchmark");
100: tmpDir.delete();
101: tmpDir.mkdir();
102: uniq = new BdbUriUniqFilter(tmpDir, 50);
103: } else if (BloomUriUniqFilter.class.getName().endsWith(
104: testClass)) {
105: // bloom setup
106: uniq = new BloomUriUniqFilter();
107: } else if (MemUriUniqFilter.class.getName().endsWith(testClass)) {
108: // mem hashset
109: uniq = new MemUriUniqFilter();
110: } else if (FPUriUniqFilter.class.getName().endsWith(testClass)) {
111: // mem fp set (open-addressing) setup
112: uniq = new FPUriUniqFilter(new MemLongFPSet(21, 0.75f));
113: }
114: uniq.setDestination(this );
115: return uniq;
116: }
117:
118: /* (non-Javadoc)
119: * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CandidateURI)
120: */
121: public void receive(CandidateURI item) {
122: if (out != null) {
123: try {
124: // we assume all tested filters are immediate passthrough so
125: // we can use 'current'; a buffering filter would change this
126: // assumption
127: out.write(current);
128: out.write("\n");
129: } catch (IOException e) {
130: // TODO Auto-generated catch block
131: e.printStackTrace();
132: }
133: }
134: }
135: }
|