001: /* AbstractLongFPSet
002: *
003: * $Id: AbstractLongFPSet.java 3437 2005-05-06 02:49:04Z stack-sf $
004: *
005: * Created on Oct 20, 2003
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.util;
026:
027: import java.io.Serializable;
028: import java.util.logging.Logger;
029:
030: import org.archive.util.fingerprint.LongFPSet;
031:
032: /**
033: * Shell of functionality for a Set of primitive long fingerprints, held
034: * in an array of possibly-empty slots.
035: *
036: * The implementation of that holding array is delegated to subclasses.
037: *
038: * <p>Capacity is always a power of 2.
039: *
040: * <p>Fingerprints are already assumed to be well-distributed, so the
041: * hashed position for a value is just its high-order bits.
042: *
043: * @author gojomo
044: * @version $Date: 2005-05-06 02:49:04 +0000 (Fri, 06 May 2005) $, $Revision: 3437 $
045: */
046: public abstract class AbstractLongFPSet implements LongFPSet,
047: Serializable {
048: private static Logger logger = Logger
049: .getLogger("org.archive.util.AbstractLongFPSet");
050:
051: /**
052: * A constant used to indicate that a slot in the set storage is empty.
053: * A zero or positive value means slot is filled
054: */
055: protected static byte EMPTY = -1;
056:
057: /** the capacity of this set, specified as the exponent of a power of 2 */
058: protected int capacityPowerOfTwo;
059:
060: /** The load factor, as a fraction. This gives the amount of free space
061: * to keep in the Set. */
062: protected float loadFactor;
063:
064: /** The current number of elements in the set */
065: protected long count;
066:
067: /**
068: * To support serialization
069: * TODO: verify needed?
070: */
071: public AbstractLongFPSet() {
072: super ();
073: }
074:
075: /**
076: * Create a new AbstractLongFPSet with a given capacity and load Factor
077: *
078: * @param capacityPowerOfTwo The capacity as the exponent of a power of 2.
079: * e.g if the capacity is <code>4</code> this means <code>2^^4</code>
080: * entries
081: * @param loadFactor The load factor as a fraction. This gives the amount
082: * of free space to keep in the Set.
083: */
084: public AbstractLongFPSet(final int capacityPowerOfTwo,
085: float loadFactor) {
086: this .capacityPowerOfTwo = capacityPowerOfTwo;
087: this .loadFactor = loadFactor;
088: this .count = 0;
089: }
090:
091: /**
092: * Does this set contain the given value?
093: *
094: * @see org.archive.util.fingerprint.LongFPSet#contains(long)
095: */
096: public boolean contains(long val) {
097: long i = indexFor(val);
098: if (slotHasData(i)) {
099: noteAccess(i);
100: return true;
101: }
102: return false;
103: }
104:
105: /**
106: * Check the state of a slot in the storage.
107: *
108: * @param i the index of the slot to check
109: * @return -1 if slot is filled; nonegative if full.
110: */
111: protected abstract int getSlotState(long i);
112:
113: /**
114: * Note access (hook for subclass cache-replacement strategies)
115: *
116: * @param index The index of the slot to check.
117: */
118: private void noteAccess(long index) {
119: // by default do nothing
120: // cache subclasses may use to update access counts, etc.
121: }
122:
123: /**
124: * Return the number of entries in this set.
125: *
126: * @see org.archive.util.fingerprint.LongFPSet#count()
127: */
128: public long count() {
129: return count;
130: }
131:
132: /**
133: * Add the given value to this set
134: *
135: * @see org.archive.util.fingerprint.LongFPSet#add(long)
136: */
137: public boolean add(long val) {
138: logger.finest("Adding " + val);
139: long i = indexFor(val);
140: if (slotHasData(i)) {
141: // positive index indicates already in set
142: return false;
143: }
144: // we have a possible slot now, which is encoded as a negative number
145:
146: // check for space, and grow if needed
147: if ((count + 1) > (loadFactor * (1 << capacityPowerOfTwo))) {
148: makeSpace();
149: // find new i
150: i = indexFor(val);
151: assert i < 0 : "slot should be empty";
152: }
153:
154: i = asDataSlot(i); // convert to positive index
155: setAt(i, val);
156: count++;
157: noteAccess(i);
158: return true;
159: }
160:
161: /**
162: * Make additional space to keep the load under the target
163: * loadFactor level.
164: *
165: * Subclasses may grow or discard entries to satisfy.
166: */
167: protected abstract void makeSpace();
168:
169: /**
170: * Set the stored value at the given slot.
171: *
172: * @param i the slot index
173: * @param l the value to set
174: */
175: protected abstract void setAt(long i, long l);
176:
177: /**
178: * Get the stored value at the given slot.
179: *
180: * @param i the slot index
181: * @return The stored value at the given slot.
182: */
183: protected abstract long getAt(long i);
184:
185: /**
186: * Given a value, check the store for its existence. If it exists, it
187: * will return the index where the value resides. Otherwise it return
188: * an encoded index, which is a possible storage location for the value.
189: *
190: * <p>Note, if we have a loading factor less than 1.0, there should always
191: * be an empty location where we can store the value
192: *
193: * @param val the fingerprint value to check for
194: * @return The (positive) index where the value already resides,
195: * or an empty index where it could be inserted (encoded as a
196: * negative number).
197: */
198: private long indexFor(long val) {
199: long candidateIndex = startIndexFor(val);
200: while (true) {
201: if (getSlotState(candidateIndex) < 0) {
202: // slot empty; return negative number encoding index
203: return asEmptySlot(candidateIndex);
204: }
205: if (getAt(candidateIndex) == val) {
206: // already present; return positive index
207: return candidateIndex;
208: }
209: candidateIndex++;
210: if (candidateIndex == 1 << capacityPowerOfTwo) {
211: candidateIndex = 0; // wraparound
212: }
213: }
214: }
215:
216: /**
217: * Return the recommended storage index for the given value.
218: * Assumes values are already well-distributed; merely uses
219: * high-order bits.
220: *
221: * @param val
222: * @return The recommended storage index for the given value.
223: */
224: private long startIndexFor(long val) {
225: return (val >>> (64 - capacityPowerOfTwo));
226: }
227:
228: public boolean remove(long l) {
229: long i = indexFor(l);
230: if (!slotHasData(i)) {
231: // not present, not changed
232: return false;
233: }
234: removeAt(i);
235: return true;
236: }
237:
238: /**
239: * Remove the value at the given index, relocating its
240: * successors as necessary.
241: *
242: * @param index
243: */
244: protected void removeAt(long index) {
245: count--;
246: clearAt(index);
247: long probeIndex = index + 1;
248: while (true) {
249: if (probeIndex == 1 << capacityPowerOfTwo) {
250: probeIndex = 0; //wraparound
251: }
252: if (getSlotState(probeIndex) < 0) {
253: // vacant
254: break;
255: }
256: long val = getAt(probeIndex);
257: long newIndex = indexFor(val);
258: if (newIndex != probeIndex) {
259: // value must shift down
260: newIndex = asDataSlot(newIndex); // positivize
261: relocate(val, probeIndex, newIndex);
262: }
263: probeIndex++;
264: }
265: }
266:
267: protected abstract void clearAt(long index);
268:
269: protected abstract void relocate(long value, long fromIndex,
270: long toIndex);
271:
272: /**
273: * Low-cost, non-definitive (except when true) contains
274: * test. Default answer of false is acceptable.
275: *
276: * @see org.archive.util.fingerprint.LongFPSet#quickContains(long)
277: */
278: public boolean quickContains(long fp) {
279: return false;
280: }
281:
282: /**
283: * given a slot index, which could or could not be empty, return it as
284: * a slot index indicating an non-empty slot
285: *
286: * @param index the slot index to convert
287: * @return the index, converted to represent an slot with data
288: */
289: private long asDataSlot(final long index) {
290: if (slotHasData(index)) { // slot already has data
291: return index;
292: }
293: return -(index + 1);
294: }
295:
296: /**
297: * Given a slot index, which could or could not be empty, return it as
298: * a slot index indicating an empty slot
299: * @param index the slot index to convert
300: * @return the index, converted to represent an empty slot
301: */
302: private long asEmptySlot(final long index) {
303: if (!slotHasData(index)) { // already empty slot
304: return index;
305: }
306: return -index - 1;
307: }
308:
309: /**
310: * Does this index represent a slot with data?
311: *
312: * @param index the index to check
313: * @return <code>true</code> if the slot has data
314: */
315: private boolean slotHasData(final long index) {
316: return index >= 0;
317: }
318: }
|