001: /* BdbUriUniqFilter
002: *
003: * $Id: BdbUriUniqFilter.java 4927 2007-02-21 10:18:39Z gojomo $
004: *
005: * Created on September 17, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.util;
026:
027: import java.io.File;
028: import java.io.IOException;
029: import java.io.ObjectOutputStream;
030: import java.io.Serializable;
031: import java.util.logging.Level;
032: import java.util.logging.Logger;
033:
034: import st.ata.util.FPGenerator;
035:
036: import com.sleepycat.bind.tuple.LongBinding;
037: import com.sleepycat.je.Database;
038: import com.sleepycat.je.DatabaseConfig;
039: import com.sleepycat.je.DatabaseEntry;
040: import com.sleepycat.je.DatabaseException;
041: import com.sleepycat.je.DatabaseNotFoundException;
042: import com.sleepycat.je.Environment;
043: import com.sleepycat.je.EnvironmentConfig;
044: import com.sleepycat.je.OperationStatus;
045:
046: /**
047: * A BDB implementation of an AlreadySeen list.
048: *
049: * This implementation performs adequately without blowing out
050: * the heap. See
051: * <a href="http://crawler.archive.org/cgi-bin/wiki.pl?AlreadySeen">AlreadySeen</a>.
052: *
053: * <p>Makes keys that have URIs from same server close to each other. Mercator
054: * and 2.3.5 'Elminating Already-Visited URLs' in 'Mining the Web' by Soumen
055: * Chakrabarti talk of a two-level key with the first 24 bits a hash of the
056: * host plus port and with the last 40 as a hash of the path. Testing
057: * showed adoption of such a scheme halving lookup times (This implementation
058: * actually concatenates scheme + host in first 24 bits and path + query in
059: * trailing 40 bits).
060: *
061: * @author stack
062: * @version $Date: 2007-02-21 10:18:39 +0000 (Wed, 21 Feb 2007) $, $Revision: 4927 $
063: */
064: public class BdbUriUniqFilter extends SetBasedUriUniqFilter implements
065: Serializable {
066: private static final long serialVersionUID = -8099357538178524011L;
067:
068: private static Logger logger = Logger
069: .getLogger(BdbUriUniqFilter.class.getName());
070:
071: protected boolean createdEnvironment = false;
072: protected long lastCacheMiss = 0;
073: protected long lastCacheMissDiff = 0;
074: protected transient Database alreadySeen = null;
075: static protected DatabaseEntry ZERO_LENGTH_ENTRY = new DatabaseEntry(
076: new byte[0]);
077: private static final String DB_NAME = "alreadySeenUrl";
078: protected long count = 0;
079: private long aggregatedLookupTime = 0;
080:
081: private static final String COLON_SLASH_SLASH = "://";
082:
083: /**
084: * Shutdown default constructor.
085: */
086: protected BdbUriUniqFilter() {
087: super ();
088: }
089:
090: /**
091: * Constructor.
092: * @param environment A bdb environment ready-configured.
093: * @throws IOException
094: */
095: public BdbUriUniqFilter(Environment environment) throws IOException {
096: super ();
097: try {
098: initialize(environment);
099: } catch (DatabaseException e) {
100: throw new IOException(e.getMessage());
101: }
102: }
103:
104: /**
105: * Constructor.
106: * @param bdbEnv The directory that holds the bdb environment. Will
107: * make a database under here if doesn't already exit. Otherwise
108: * reopens any existing dbs.
109: * @throws IOException
110: */
111: public BdbUriUniqFilter(File bdbEnv) throws IOException {
112: this (bdbEnv, -1);
113: }
114:
115: /**
116: * Constructor.
117: * @param bdbEnv The directory that holds the bdb environment. Will
118: * make a database under here if doesn't already exit. Otherwise
119: * reopens any existing dbs.
120: * @param cacheSizePercentage Percentage of JVM bdb allocates as
121: * its cache. Pass -1 to get default cache size.
122: * @throws IOException
123: */
124: public BdbUriUniqFilter(File bdbEnv, final int cacheSizePercentage)
125: throws IOException {
126: super ();
127: if (!bdbEnv.exists()) {
128: bdbEnv.mkdirs();
129: }
130: EnvironmentConfig envConfig = new EnvironmentConfig();
131: envConfig.setAllowCreate(true);
132: if (cacheSizePercentage > 0 && cacheSizePercentage < 100) {
133: envConfig.setCachePercent(cacheSizePercentage);
134: }
135: try {
136: createdEnvironment = true;
137: initialize(new Environment(bdbEnv, envConfig));
138: } catch (DatabaseException e) {
139: throw new IOException(e.getMessage());
140: }
141: }
142:
143: /**
144: * Method shared by constructors.
145: * @param env Environment to use.
146: * @throws DatabaseException
147: */
148: protected void initialize(Environment env) throws DatabaseException {
149: DatabaseConfig dbConfig = getDatabaseConfig();
150: dbConfig.setAllowCreate(true);
151: try {
152: env.truncateDatabase(null, DB_NAME, false);
153: } catch (DatabaseNotFoundException e) {
154: // Ignored
155: }
156: open(env, dbConfig);
157: }
158:
159: /**
160: * @return DatabaseConfig to use
161: */
162: protected DatabaseConfig getDatabaseConfig() {
163: DatabaseConfig dbConfig = new DatabaseConfig();
164: dbConfig.setDeferredWrite(true);
165: return dbConfig;
166: }
167:
168: /**
169: * Call after deserializing an instance of this class. Will open the
170: * already seen in passed environment.
171: * @param env DB Environment to use.
172: * @throws DatabaseException
173: */
174: public void reopen(final Environment env) throws DatabaseException {
175: DatabaseConfig dbConfig = getDatabaseConfig();
176: open(env, dbConfig);
177: }
178:
179: protected void open(final Environment env,
180: final DatabaseConfig dbConfig) throws DatabaseException {
181: this .alreadySeen = env.openDatabase(null, DB_NAME, dbConfig);
182: }
183:
184: public synchronized void close() {
185: Environment env = null;
186: if (this .alreadySeen != null) {
187: try {
188: env = this .alreadySeen.getEnvironment();
189: if (logger.isLoggable(Level.INFO)) {
190: logger.info("Count of alreadyseen on close "
191: + Long.toString(count));
192: }
193: this .alreadySeen.sync();
194: this .alreadySeen.close();
195: } catch (DatabaseException e) {
196: logger.severe(e.getMessage());
197: }
198: this .alreadySeen = null;
199: }
200: if (env != null && createdEnvironment) {
201: try {
202: // This sync flushes whats in RAM. Its expensive operation.
203: // Without, data can be lost. Not for transactional operation.
204: env.sync();
205: env.close();
206: } catch (DatabaseException e) {
207: logger.severe(e.getMessage());
208: }
209: }
210: }
211:
212: public synchronized long getCacheMisses() throws DatabaseException {
213: long cacheMiss = this .alreadySeen.getEnvironment().getStats(
214: null).getNCacheMiss();
215: this .lastCacheMissDiff = cacheMiss - this .lastCacheMiss;
216: this .lastCacheMiss = cacheMiss;
217: return this .lastCacheMiss;
218: }
219:
220: public long getLastCacheMissDiff() {
221: return this .lastCacheMissDiff;
222: }
223:
224: /**
225: * Create fingerprint.
226: * Pubic access so test code can access createKey.
227: * @param uri URI to fingerprint.
228: * @return Fingerprint of passed <code>url</code>.
229: */
230: public static long createKey(CharSequence uri) {
231: String url = uri.toString();
232: int index = url.indexOf(COLON_SLASH_SLASH);
233: if (index > 0) {
234: index = url
235: .indexOf('/', index + COLON_SLASH_SLASH.length());
236: }
237: CharSequence hostPlusScheme = (index == -1) ? url : url
238: .subSequence(0, index);
239: long tmp = FPGenerator.std24.fp(hostPlusScheme);
240: return tmp | (FPGenerator.std40.fp(url) >>> 24);
241: }
242:
243: protected boolean setAdd(CharSequence uri) {
244: DatabaseEntry key = new DatabaseEntry();
245: LongBinding.longToEntry(createKey(uri), key);
246: long started = 0;
247:
248: OperationStatus status = null;
249: try {
250: if (logger.isLoggable(Level.INFO)) {
251: started = System.currentTimeMillis();
252: }
253: status = alreadySeen.putNoOverwrite(null, key,
254: ZERO_LENGTH_ENTRY);
255: if (logger.isLoggable(Level.INFO)) {
256: aggregatedLookupTime += (System.currentTimeMillis() - started);
257: }
258: } catch (DatabaseException e) {
259: logger.severe(e.getMessage());
260: }
261: if (status == OperationStatus.SUCCESS) {
262: count++;
263: if (logger.isLoggable(Level.INFO)) {
264: final int logAt = 10000;
265: if (count > 0 && ((count % logAt) == 0)) {
266: logger.info("Average lookup "
267: + (aggregatedLookupTime / logAt) + "ms.");
268: aggregatedLookupTime = 0;
269: }
270: }
271: }
272: if (status == OperationStatus.KEYEXIST) {
273: return false; // not added
274: } else {
275: return true;
276: }
277: }
278:
279: protected long setCount() {
280: return count;
281: }
282:
283: protected boolean setRemove(CharSequence uri) {
284: DatabaseEntry key = new DatabaseEntry();
285: LongBinding.longToEntry(createKey(uri), key);
286: OperationStatus status = null;
287: try {
288: status = alreadySeen.delete(null, key);
289: } catch (DatabaseException e) {
290: logger.severe(e.getMessage());
291: }
292: if (status == OperationStatus.SUCCESS) {
293: count--;
294: return true; // removed
295: } else {
296: return false; // not present
297: }
298: }
299:
300: public long flush() {
301: // We always write but this might be place to do the sync
302: // when checkpointing? TODO.
303: return 0;
304: }
305:
306: private void writeObject(ObjectOutputStream oos) throws IOException {
307: // sync deferred-write database
308: try {
309: alreadySeen.sync();
310: } catch (DatabaseException e) {
311: // TODO Auto-generated catch block
312: throw new RuntimeException(e);
313: }
314: oos.defaultWriteObject();
315: }
316: }
|