001: package it.unimi.dsi.mg4j.index.wired;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2006 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023: import it.unimi.dsi.fastutil.ints.IntIterator;
024: import it.unimi.dsi.fastutil.ints.IntIterators;
025: import it.unimi.dsi.fastutil.ints.IntSet;
026: import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
027: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
028: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
029: import it.unimi.dsi.fastutil.objects.ReferenceSet;
030: import it.unimi.dsi.mg4j.index.AbstractIndexIterator;
031: import it.unimi.dsi.mg4j.index.AbstractIndexReader;
032: import it.unimi.dsi.mg4j.index.BitStreamIndex;
033: import it.unimi.dsi.mg4j.index.Index;
034: import it.unimi.dsi.mg4j.index.IndexIterator;
035: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
036: import it.unimi.dsi.mg4j.index.payload.Payload;
037: import it.unimi.dsi.io.InputBitStream;
038: import it.unimi.dsi.util.Interval;
039: import it.unimi.dsi.mg4j.search.IntervalIterator;
040: import it.unimi.dsi.mg4j.search.IntervalIterators;
041: import it.unimi.dsi.bits.Fast;
042: import it.unimi.dsi.Util;
043: import java.io.IOException;
044: import java.util.NoSuchElementException;
045: import org.apache.log4j.Logger;
046:
047: public class SkipGammaDeltaGammaDeltaBitStreamIndexReader extends
048: AbstractIndexReader {
049: @SuppressWarnings("unused")
050: private static final Logger LOGGER = Util
051: .getLogger(SkipGammaDeltaGammaDeltaBitStreamIndexReader.class);
052: /** The reference index. */
053: protected final BitStreamIndex index;
054: private final static boolean ASSERTS = false;
055: private final static boolean DEBUG = false;
056: /** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */
057: protected final BitStreamIndexReaderIndexIterator indexIterator;
058:
059: /** Creates a new skip index reader, with the specified underlying {@link Index} and input bit stream.
060: *
061: * @param index the index.
062: * @param ibs the underlying bit stream.
063: */
064: public SkipGammaDeltaGammaDeltaBitStreamIndexReader(
065: final BitStreamIndex index, final InputBitStream ibs) {
066: this .index = index;
067: this .indexIterator = new BitStreamIndexReaderIndexIterator(
068: this , ibs);
069: }
070:
071: protected static final class BitStreamIndexReaderIndexIterator
072: extends AbstractIndexIterator implements IndexIterator {
073: /** The enclosing instance. */
074: private final SkipGammaDeltaGammaDeltaBitStreamIndexReader parent;
075: /** The reference index. */
076: protected final BitStreamIndex index;
077: /** The underlying input bit stream. */
078: protected final InputBitStream ibs;
079: /** The enclosed interval iterator. */
080: private final IndexIntervalIterator intervalIterator;
081: /** A singleton set containing the enclosed interval iterator. */
082: private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator;
083: /** The key index. */
084: private final Index keyIndex;
085: /** The cached copy of {@link #index index.pointerCoding}. */
086: protected final Coding pointerCoding;
087: /** The cached copy of {@link #index index.countCoding}. */
088: protected final Coding countCoding;
089: /** The cached copy of {@link #index index.positionCoding}. */
090: protected final Coding positionCoding;
091: /** The current term. */
092: protected int currentTerm = -1;
093: /** The current frequency. */
094: protected int frequency;
095: /** Whether the current terms has pointers at all (this happens when the {@link #frequency} is smaller than the number of documents). */
096: protected boolean hasPointers;
097: /** The current count (if this index contains counts). */
098: protected int count;
099: /** The last document pointer we read from current list, -1 if we just read the frequency,
100: * {@link Integer#MAX_VALUE} if we are beyond the end of list. */
101: protected int currentDocument;
102: /** The number of the document record we are going to read inside the current inverted list. */
103: protected int numberOfDocumentRecord;
104: /** This variable tracks the current state of the reader. */
105: protected int state;
106: /** The parameter <code>h</code> (the maximum height of a skip tower). */
107: public final int height;
108: /** The quantum. */
109: public final int quantum;
110: /** The bit mask giving the remainder of the division by {@link #quantum}. */
111: public final int quantumModuloMask;
112: /** The shift giving result of the division by {@link #quantum}. */
113: public final int quantumDivisionShift;
114: /** The maximum height of a skip tower in the current block. May be less than {@link #height} if the block is defective,
115: * and will be -1 on defective quanta (no tower at all). */
116: private int maxh;
117: /** The maximum valid index of the current skip tower, if any. */
118: private int s;
119: /** The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If {@link #maxh} is negative, the value is undefined. */
120: private int lowest;
121: /** We have <var>w</var> = <var>Hq</var>. */
122: private final int w;
123: /** The bit mask giving the remainder of the division by {@link #w}. */
124: private final int wModuloMask;
125: /** The shift giving result of the division by {@link #w}. */
126: private final int wDivisionShift;
127: /** The Golomb modulus for a top pointer skip, for each level. */
128: private int[] towerTopB;
129: /** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */
130: private int[] towerTopLog2B;
131: /** The Golomb modulus for a lower pointer skip, for each level. */
132: private int[] towerLowerB;
133: /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
134: private int[] towerLowerLog2B;
135: /** The prediction for a pointer skip, for each level. */
136: private int[] pointerPrediction;
137: /** An array to decode bit skips. */
138: private long[] bitSkip;
139: /** An array to decode the pointer skips. */
140: private int[] pointerSkip;
141: /** The number of bits read just after reading the last skip tower. */
142: private long readBitsAtLastSkipTower;
143: /** The document pointer corresponding to the last skip tower. */
144: private int pointerAtLastSkipTower;
145: /** The current quantum bit length, as provided by the index. */
146: private int quantumBitLength;
147: /** The current entry bit length, as provided by the index. */
148: private int entryBitLength;
149: /** This value of {@link #state} means that we are positioned just before a tower. */
150: private static final int BEFORE_TOWER = 0;
151: /** This value of {@link #state} can be assumed only in indices that contain a payload; it
152: * means that we are positioned just before the payload for the current document record. */
153: private static final int BEFORE_PAYLOAD = 1;
154: /** This value of {@link #state} can be assumed only in indices that contain counts; it
155: * means that we are positioned just before the count for the current document record. */
156: private static final int BEFORE_COUNT = 2;
157: /** This value of {@link #state} can be assumed only in indices that contain document positions;
158: * it means that we are positioned just before the position list of the current document record. */
159: private static final int BEFORE_POSITIONS = 3;
160: /** This value of {@link #state} means that we are at the start of a new document record,
161: * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} == {@link #frequency}),
162: * in which case we are at the end of the inverted list, and {@link #endOfList()} is true. */
163: private static final int BEFORE_POINTER = 4;
164: /** The cached position array. */
165: protected int[] positionCache = new int[16];
166:
167: public BitStreamIndexReaderIndexIterator(
168: final SkipGammaDeltaGammaDeltaBitStreamIndexReader parent,
169: final InputBitStream ibs) {
170: this .parent = parent;
171: this .ibs = ibs;
172: index = parent.index;
173: keyIndex = index.keyIndex;
174: pointerCoding = index.pointerCoding;
175: if (index.hasPayloads)
176: throw new IllegalStateException();
177: if (!index.hasCounts)
178: throw new IllegalStateException();
179: countCoding = index.countCoding;
180: if (!index.hasPositions)
181: throw new IllegalStateException();
182: positionCoding = index.positionCoding;
183: intervalIterator = index.hasPositions ? new IndexIntervalIterator()
184: : null;
185: singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps
186: .singleton(keyIndex,
187: (IntervalIterator) intervalIterator)
188: : null;
189: quantum = index.quantum;
190: height = index.height;
191: if ((quantum == -1) != (height == -1))
192: throw new IllegalArgumentException();
193: w = (1 << height) * quantum;
194: quantumModuloMask = quantum - 1;
195: wModuloMask = w - 1;
196: quantumDivisionShift = Fast.mostSignificantBit(quantum);
197: wDivisionShift = Fast.mostSignificantBit(w);
198: bitSkip = new long[height + 1];
199: pointerSkip = new int[height + 1];
200: towerTopB = new int[height + 1];
201: towerTopLog2B = new int[height + 1];
202: towerLowerB = new int[height + 1];
203: towerLowerLog2B = new int[height + 1];
204: pointerPrediction = new int[height + 1];
205: }
206:
207: /** Positions the index on the inverted list of a given term.
208: *
209: * <p>This method can be called at any time. Note that it is <em>always</em> possible
210: * to call this method with argument 0, even if offsets have not been loaded.
211: *
212: * @param term a term.
213: */
214: protected void position(final int term) throws IOException {
215: if (term == 0) {
216: ibs.position(0);
217: ibs.readBits(0);
218: } else {
219: if (index.offsets == null)
220: throw new IllegalStateException(
221: "You cannot position an index without offsets");
222: final long offset = index.offsets.getLong(term);
223: ibs.position(offset);
224: // TODO: Can't we set this to 0?
225: ibs.readBits(offset);
226: }
227: currentTerm = term;
228: readFrequency();
229: }
230:
231: public int termNumber() {
232: return currentTerm;
233: }
234:
235: protected IndexIterator advance() throws IOException {
236: if (currentTerm == index.numberOfTerms - 1)
237: return null;
238: if (currentTerm != -1) {
239: skipTo(Integer.MAX_VALUE);
240: nextDocument(); // This guarantees we have no garbage before the frequency
241: }
242: currentTerm++;
243: readFrequency();
244: return this ;
245: }
246:
247: private void readFrequency() throws IOException {
248: // Read the frequency
249: frequency = ibs.readGamma() + 1;
250: hasPointers = frequency < index.numberOfDocuments;
251: quantumBitLength = entryBitLength = -1;
252: lowest = Integer.MAX_VALUE;
253: if (ASSERTS)
254: for (int i = height; i > Math
255: .min(
256: height,
257: Fast
258: .mostSignificantBit(frequency >> quantumDivisionShift)); i--)
259: towerTopB[i] = towerLowerB[i] = pointerPrediction[i] = -1;
260: final long pointerQuantumSigma = BitStreamIndex
261: .quantumSigma(frequency, index.numberOfDocuments,
262: quantum);
263: for (int i = Math
264: .min(
265: height,
266: Fast
267: .mostSignificantBit(frequency >> quantumDivisionShift)); i >= 0; i--) {
268: towerTopB[i] = BitStreamIndex.gaussianGolombModulus(
269: pointerQuantumSigma, i + 1);
270: towerTopLog2B[i] = Fast
271: .mostSignificantBit(towerTopB[i]);
272: towerLowerB[i] = BitStreamIndex.gaussianGolombModulus(
273: pointerQuantumSigma, i);
274: towerLowerLog2B[i] = Fast
275: .mostSignificantBit(towerLowerB[i]);
276: pointerPrediction[i] = (int) ((quantum * (1L << i)
277: * index.numberOfDocuments + frequency / 2) / frequency);
278: }
279: count = -1;
280: currentDocument = -1;
281: numberOfDocumentRecord = -1;
282: state = BEFORE_POINTER;
283: }
284:
285: public Index index() {
286: return keyIndex;
287: }
288:
289: public int frequency() {
290: return frequency;
291: }
292:
293: private void ensureCurrentDocument() {
294: if (currentDocument < 0)
295: throw new IllegalStateException(
296: "nextDocument() has never been called for (term="
297: + currentTerm + ")");
298: if (currentDocument == Integer.MAX_VALUE)
299: throw new IllegalStateException(
300: "This reader is positioned beyond the end of list of (term="
301: + currentTerm + ")");
302: }
303:
304: /** Returns whether there are no more document records in the current inverted list.
305: *
306: * <p>This method returns true if the last document pointer of the current inverted
307: * list has been read. It makes no distinction as to where (inside the last document
308: * record) this reader is currently positioned. In particular, this method will
309: * return true independently of whether count and positions have been read or not (we
310: * note by passing that this is the only sensible behaviour, as you can build indices
311: * with or without counts/positions).
312: *
313: * <p>This method will return true also when this reader is positioned <em>beyond</em>
314: * the last document pointer. In this case, {@link #currentDocumentPointer()} will
315: * return {@link Integer#MAX_VALUE}.
316: *
317: * @return true whether there are no more document records in the current inverted list.
318: */
319: private boolean endOfList() {
320: if (ASSERTS)
321: assert numberOfDocumentRecord <= frequency;
322: return numberOfDocumentRecord >= frequency - 1;
323: }
324:
325: public int document() {
326: if (ASSERTS)
327: ensureCurrentDocument();
328: return currentDocument;
329: }
330:
331: public Payload payload() throws IOException {
332: if (DEBUG)
333: System.err.println(this + ".payload()");
334: if (ASSERTS)
335: ensureCurrentDocument();
336: throw new UnsupportedOperationException("This index ("
337: + index + ") does not contain payloads");
338: }
339:
340: public int count() throws IOException {
341: if (DEBUG)
342: System.err.println(this + ".count()");
343: if (count != -1)
344: return count;
345: if (ASSERTS)
346: ensureCurrentDocument();
347: if (state == BEFORE_TOWER)
348: readTower();
349: {
350: state = BEFORE_POSITIONS;
351: count = ibs.readGamma() + 1;
352: }
353: return count;
354: }
355:
356: /** We read positions, assuming state <= BEFORE_POSITIONS */
357: @SuppressWarnings("unused")
358: protected void updatePositionCache() throws IOException {
359: if (ASSERTS)
360: assert state <= BEFORE_POSITIONS;
361: if (state < BEFORE_POSITIONS) {
362: if (state == BEFORE_TOWER)
363: readTower();
364: if (state == BEFORE_COUNT) {
365: state = BEFORE_POSITIONS;
366: count = ibs.readGamma() + 1;
367: }
368: }
369: if (count > positionCache.length)
370: positionCache = new int[Math.max(
371: positionCache.length * 2, count)];
372: final int[] occ = positionCache;
373: state = BEFORE_POINTER;
374: ibs.readDeltas(occ, count);
375: for (int i = 1; i < count; i++)
376: occ[i] += occ[i - 1] + 1;
377: }
378:
379: public IntIterator positions() throws IOException {
380: if (ASSERTS)
381: ensureCurrentDocument();
382: if (state <= BEFORE_POSITIONS)
383: updatePositionCache();
384: return IntIterators.wrap(positionCache, 0, count);
385: }
386:
387: public int[] positionArray() throws IOException {
388: if (ASSERTS)
389: ensureCurrentDocument();
390: if (state <= BEFORE_POSITIONS)
391: updatePositionCache();
392: return positionCache;
393: }
394:
395: // TODO: check who's using this (positionArray() is actually faster now)
396: public int positions(final int[] position) throws IOException {
397: if (ASSERTS)
398: ensureCurrentDocument();
399: if (state <= BEFORE_POSITIONS)
400: updatePositionCache(); // And also that positions have been read
401: if (position.length < count)
402: return -count;
403: for (int i = count; i-- != 0;)
404: position[i] = this .positionCache[i];
405: return count;
406: }
407:
408: public int nextDocument() throws IOException {
409: if (DEBUG)
410: System.err.println("{" + this + "} nextDocument()");
411: if (state != BEFORE_POINTER) {
412: if (state == BEFORE_TOWER)
413: readTower();
414: if (state == BEFORE_COUNT) {
415: state = BEFORE_POSITIONS;
416: count = ibs.readGamma() + 1;
417: }
418: if (state == BEFORE_POSITIONS) {
419: // Here we just skip; note that the state change is necessary if endOfList() is true
420: state = BEFORE_POINTER;
421: ibs.skipDeltas(count);
422: }
423: }
424: if (endOfList())
425: return -1;
426: if (hasPointers) {// We do not write pointers for everywhere occurring terms.
427: currentDocument += ibs.readDelta() + 1;
428: } else
429: currentDocument++;
430: numberOfDocumentRecord++;
431: state = BEFORE_COUNT;
432: count = -1;
433: if ((numberOfDocumentRecord & quantumModuloMask) == 0)
434: state = BEFORE_TOWER;
435: return currentDocument;
436: }
437:
438: /** Reads the entire skip tower for the current position.
439: */
440: private void readTower() throws IOException {
441: readTower(-1);
442: }
443:
444: /** Reads the skip tower for the current position, possibly skipping part of the tower.
445: *
446: * <P>Note that this method will update {@link #state} only if it reads the entire tower,
447: * otherwise the state remains {@link #BEFORE_TOWER}.
448: *
449: * @param pointer the tower will be read up to the first entry smaller than or equal to this pointer; use
450: * -1 to guarantee that the entire tower will be read.
451: */
452: private void readTower(final int pointer) throws IOException {
453: int i, j, k, cacheOffset, cache, towerLength = 0;
454: long bitsAtTowerStart = 0;
455: boolean truncated = false;
456: if (ASSERTS)
457: assert numberOfDocumentRecord % quantum == 0;
458: if (ASSERTS && state != BEFORE_TOWER)
459: throw new IllegalStateException(
460: "readTower() called in state " + state);
461: cacheOffset = (numberOfDocumentRecord & wModuloMask);
462: k = cacheOffset >> quantumDivisionShift;
463: if (ASSERTS && k == 0) { // Invalidate current tower data
464: it.unimi.dsi.fastutil.ints.IntArrays.fill(pointerSkip,
465: Integer.MAX_VALUE);
466: it.unimi.dsi.fastutil.longs.LongArrays.fill(bitSkip,
467: Integer.MAX_VALUE);
468: }
469: // Compute the height of the current skip tower.
470: s = (k == 0) ? height : Fast.leastSignificantBit(k);
471: cache = frequency - w
472: * (numberOfDocumentRecord >> wDivisionShift);
473: if (cache < w) {
474: maxh = Fast
475: .mostSignificantBit((cache >> quantumDivisionShift)
476: - k);
477: if (maxh < s) {
478: s = maxh;
479: truncated = true;
480: } else
481: truncated = false;
482: } else {
483: cache = w;
484: maxh = height;
485: truncated = k == 0;
486: }
487: //assert w == cache || k == 0 || lastMaxh == Fast.mostSignificantBit( k ^ ( cache/quantum ) ) : lastMaxh +","+ (Fast.mostSignificantBit( k ^ ( cache/quantum ) ));
488: i = s;
489: if (s >= 0) {
490: if (k == 0) {
491: if (quantumBitLength < 0) {
492: quantumBitLength = ibs.readDelta();
493: entryBitLength = ibs.readDelta();
494: } else {
495: quantumBitLength += Fast.nat2int(ibs
496: .readDelta());
497: entryBitLength += Fast.nat2int(ibs.readDelta());
498: }
499: if (DEBUG)
500: System.err
501: .println("{" + this
502: + "} quantum bit length="
503: + quantumBitLength
504: + " entry bit length="
505: + entryBitLength);
506: }
507: if (DEBUG)
508: System.err.println("{" + this
509: + "} Reading tower; pointer=" + pointer
510: + " maxh=" + maxh + " s=" + s);
511: if (s > 0) {
512: towerLength = entryBitLength * (s + 1)
513: + Fast.nat2int(ibs.readDelta());
514: if (DEBUG)
515: System.err.println("{" + this
516: + "} Tower length=" + towerLength);
517: }
518: // We store the number of bits read at the start of the tower (just after the length).
519: bitsAtTowerStart = ibs.readBits();
520: if (truncated) {
521: if (DEBUG)
522: System.err.println("{" + this
523: + "} Truncated--reading tops");
524: // We read the tower top.
525: pointerSkip[s] = Fast.nat2int(ibs.readGolomb(
526: towerTopB[s], towerTopLog2B[s]))
527: + pointerPrediction[s];
528: bitSkip[s] = quantumBitLength * (1 << s)
529: + entryBitLength * ((1 << s + 1) - s - 2)
530: + Fast.nat2int(ibs.readLongDelta());
531: } else {
532: // We copy the tower top from the lowest inherited entry suitably updated.
533: pointerSkip[s] = pointerSkip[s + 1]
534: - (currentDocument - pointerAtLastSkipTower);
535: bitSkip[s] = bitSkip[s + 1]
536: - (bitsAtTowerStart - readBitsAtLastSkipTower)
537: - towerLength;
538: }
539: // We read the remaining part of the tower, at least until we point after pointer.
540: if (currentDocument + pointerSkip[i] > pointer) {
541: for (i = s - 1; i >= 0; i--) {
542: pointerSkip[i] = Fast.nat2int(ibs.readGolomb(
543: towerLowerB[i], towerLowerLog2B[i]))
544: + pointerSkip[i + 1] / 2;
545: bitSkip[i] = (bitSkip[i + 1] - entryBitLength
546: * (i + 1))
547: / 2 - Fast.nat2int(ibs.readLongDelta());
548: if (DEBUG
549: && currentDocument + pointerSkip[i] <= pointer)
550: System.err
551: .println("{"
552: + this
553: + "} stopping reading at i="
554: + i
555: + " as currentDocument ("
556: + currentDocument
557: + ") plus pointer skip ("
558: + pointerSkip[i]
559: + ") is smaller than or equal target ("
560: + pointer + ")");
561: if (currentDocument + pointerSkip[i] <= pointer)
562: break;
563: }
564: }
565: }
566: /* If we did not read the entire tower, we need to fix the skips we read (as they
567: * are offsets from the *end* of the tower) and moreover we must make unusable the
568: * rest of the tower (for asserts). */
569: if (i > 0) {
570: final long fix = ibs.readBits() - bitsAtTowerStart;
571: for (j = s; j >= i; j--)
572: bitSkip[j] += towerLength - fix;
573: if (ASSERTS)
574: for (; j >= 0; j--)
575: pointerSkip[j] = Integer.MAX_VALUE;
576: } else
577: state = BEFORE_COUNT;
578: // We update the inherited tower.
579: final long deltaBits = ibs.readBits()
580: - readBitsAtLastSkipTower;
581: final int deltaPointers = currentDocument
582: - pointerAtLastSkipTower;
583: for (j = Fast.mostSignificantBit(k
584: ^ (cache >> quantumDivisionShift)); j >= s + 1; j--) {
585: bitSkip[j] -= deltaBits;
586: pointerSkip[j] -= deltaPointers;
587: }
588: readBitsAtLastSkipTower = ibs.readBits();
589: pointerAtLastSkipTower = currentDocument;
590: lowest = i < 0 ? 0 : i;
591: if (DEBUG) {
592: System.err.println("{" + this + "} "
593: + "Computed skip tower (lowest: " + lowest
594: + ") for document record number "
595: + numberOfDocumentRecord + " (pointer "
596: + currentDocument + ") from " + Math.max(i, 0)
597: + ": ");
598: System.err.print("% ");
599: for (j = 0; j <= s; j++)
600: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
601: + " ");
602: System.err.print(" [");
603: for (; j <= height; j++)
604: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
605: + " ");
606: System.err.print("]");
607: System.err.println();
608: }
609: }
610:
611: /*
612: public int skip( final int n ) throws IOException {
613: int i, k, cacheOffset, start = numberOfDocumentRecord, skip = 0;
614:
615: if ( DEBUG ) System.err.println( "{" + this + "} " + "Going to enter linear skip code with lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n + ", endOfList()=" + endOfList() );
616: if ( n < 0 ) throw new IllegalArgumentException();
617: if ( n == 0 ) return 0;
618:
619: // If we are just at the start of a list, let us read the first pointer.
620: if ( numberOfDocumentRecord == -1 ) readDocumentPointer();
621: if ( state == BEFORE_TOWER ) readTower( -1 );
622:
623: if ( DEBUG ) System.err.println( "{" + this + "} " + "Entering skip code with lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n + ", endOfList()=" + endOfList() );
624:
625: for(;;) {
626: if ( DEBUG ) System.err.println( "{" + this + "} " + "In for loop, lastDoc=" + currentDocument + ", maxh=" + maxh + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n );
627:
628: cacheOffset = ( numberOfDocumentRecord & wModuloMask );
629: k = cacheOffset >> quantumDivisionShift;
630:
631: if ( maxh < 0 ) break; // Defective quantum--no tower.
632:
633: for( i = Fast.mostSignificantBit( k ^ ( Math.min( w, frequency - numberOfDocumentRecord + cacheOffset ) >> quantumDivisionShift ) ); i >= 0; i-- )
634: if ( ( skip = ( ( k & - ( 1 << i ) ) + ( 1 << i ) ) * quantum - cacheOffset ) <= n ) break;
635:
636: if ( i >= 0 ) {
637: ibs.skip( bitSkip[ i ] - ( ibs.readBits() - readBitsAtLastSkipTower ) );
638: state = BEFORE_TOWER;
639: currentDocument = pointerSkip[ i ] + pointerAtLastSkipTower;
640: numberOfDocumentRecord += skip;
641: // If we skipped beyond the end of the list, we invalidate the current document.
642: if ( numberOfDocumentRecord == frequency ) currentDocument = -1;
643: readTower( -1 );
644: count = -1; // We must invalidate count as readDocumentPointer() would do.
645: if ( endOfList() ) return numberOfDocumentRecord - start;
646: }
647: else break;
648: }
649:
650: if ( DEBUG ) System.err.println( "{" + this + "} " + "Completing sequentially, lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n );
651:
652: while( numberOfDocumentRecord - start < n ) {
653: if ( endOfList() ) break;
654: readDocumentPointer();
655: }
656: return numberOfDocumentRecord - start;
657: }
658: */
659: public int skipTo(final int p) throws IOException {
660: if (DEBUG)
661: System.err.println(this + ".skipTo(" + p
662: + ") [currentDocument=" + currentDocument
663: + ", numberOfDocumentRecord="
664: + numberOfDocumentRecord + ", endOfList()="
665: + endOfList());
666: // If we are just at the start of a list, let us read the first pointer.
667: if (numberOfDocumentRecord == -1)
668: nextDocument(); // TODO: shouldn't we just read the tower?
669: if (currentDocument >= p) {
670: if (DEBUG)
671: System.err.println(this
672: + ": No skip necessary, returning "
673: + currentDocument);
674: return currentDocument;
675: }
676: if (state == BEFORE_TOWER)
677: readTower(p);
678: final int[] pointerSkip = this .pointerSkip;
679: for (;;) {
680: if (ASSERTS)
681: assert maxh < 0 || lowest > 0
682: || pointerSkip[0] != Integer.MAX_VALUE;
683: // If on a defective quantum (no tower) or p is inside the current quantum (no need to scan the tower) we bail out.
684: if (maxh < 0 || lowest == 0
685: && pointerAtLastSkipTower + pointerSkip[0] > p)
686: break;
687: if (DEBUG)
688: System.err.println(this
689: + ": In for loop, currentDocument="
690: + currentDocument + ", maxh=" + maxh
691: + ", numberOfDocumentRecord="
692: + numberOfDocumentRecord + ", p=" + p);
693: final int cacheOffset = (numberOfDocumentRecord & wModuloMask);
694: final int k = cacheOffset >> quantumDivisionShift;
695: final int top = Fast
696: .mostSignificantBit(k
697: ^ (Math.min(w, frequency
698: - numberOfDocumentRecord
699: + cacheOffset) >> quantumDivisionShift));
700: int i;
701: for (i = lowest; i <= top; i++) {
702: if (ASSERTS && (k & 1 << i) != 0)
703: assert pointerSkip[i] == pointerSkip[i + 1];
704: if (ASSERTS)
705: assert pointerSkip[i] != Integer.MAX_VALUE : "Invalid pointer skip "
706: + i
707: + " (lowest="
708: + lowest
709: + ", top="
710: + top + ")";
711: if (pointerAtLastSkipTower + pointerSkip[i] > p)
712: break;
713: }
714: if (--i < 0)
715: break;
716: if (ASSERTS)
717: assert i >= lowest : i + " < " + lowest;
718: if (DEBUG)
719: System.err.println(this
720: + ": Safely after for with i=" + i
721: + ", P[i]=" + pointerSkip[i] + ", A[i]="
722: + bitSkip[i]);
723: if (DEBUG)
724: System.err
725: .println(this
726: + ": ["
727: + ibs.readBits()
728: + "] Skipping "
729: + (bitSkip[i] - (ibs.readBits() - readBitsAtLastSkipTower))
730: + " bits ("
731: + (((k & -(1 << i)) + (1 << i))
732: * quantum - cacheOffset)
733: + " records) to get to document pointer "
734: + (currentDocument + pointerSkip[i]));
735: ibs.skip(bitSkip[i]
736: - (ibs.readBits() - readBitsAtLastSkipTower));
737: state = BEFORE_TOWER;
738: currentDocument = pointerSkip[i]
739: + pointerAtLastSkipTower;
740: numberOfDocumentRecord += ((k & -(1 << i)) + (1 << i))
741: * quantum - cacheOffset;
742: // If we skipped beyond the end of the list, we invalidate the current document.
743: if (numberOfDocumentRecord == frequency) {
744: currentDocument = Integer.MAX_VALUE;
745: state = BEFORE_POINTER; // We are actually before a frequency, but we must avoid that calls to nextDocument() read anything
746: } else
747: readTower(p); // Note that if we are exactly on the destination pointer, we will read the entire tower.
748: count = -1; // We must invalidate count as readDocumentPointer() would do.
749: if (endOfList()) {
750: if (DEBUG)
751: System.err.println(this
752: + ".toSkip(): end-of-list, returning "
753: + currentDocument);
754: // Note that if p == Integer.MAX_VALUE, we are certainly beyond end-of-list
755: return p == Integer.MAX_VALUE ? Integer.MAX_VALUE
756: : currentDocument;
757: }
758: }
759: if (DEBUG)
760: System.err.println(this
761: + ": Completing sequentially, currentDocument="
762: + currentDocument + ", numberOfDocumentRecord="
763: + numberOfDocumentRecord + ", p=" + p);
764: while (currentDocument < p) {
765: if (DEBUG)
766: System.err
767: .println(this
768: + ": Skipping sequentially (second), currentDocument="
769: + currentDocument
770: + ", numberOfDocumentRecord="
771: + numberOfDocumentRecord + ", p="
772: + p);
773: if (nextDocument() == -1) {
774: if (DEBUG)
775: System.err.println(this
776: + ": end-of-list, returning MAX_VALUE");
777: return Integer.MAX_VALUE;
778: }
779: }
780: if (DEBUG)
781: System.err.println(this + ".toSkip(): Returning "
782: + currentDocument);
783: return currentDocument;
784: }
785:
786: public void dispose() throws IOException {
787: parent.close();
788: }
789:
790: public boolean hasNext() {
791: return !endOfList();
792: }
793:
794: public int nextInt() {
795: if (!hasNext())
796: throw new NoSuchElementException();
797: try {
798: return nextDocument();
799: } catch (IOException e) {
800: throw new RuntimeException(e);
801: }
802: }
803:
804: public String toString() {
805: return index + " [" + currentTerm + "]";
806: }
807:
808: /** An interval iterator returning the positions of the current document as singleton intervals. */
809: private final class IndexIntervalIterator extends
810: AbstractObjectIterator<Interval> implements
811: IntervalIterator {
812: int pos = -1;
813:
814: public void reset() throws IOException {
815: pos = -1;
816: if (state <= BEFORE_POSITIONS)
817: updatePositionCache(); // This guarantees the position cache is ok
818: }
819:
820: public void intervalTerms(final IntSet terms) {
821: terms
822: .add(BitStreamIndexReaderIndexIterator.this .currentTerm);
823: }
824:
825: public boolean hasNext() {
826: return pos < count - 1;
827: }
828:
829: public Interval next() {
830: if (!hasNext())
831: throw new NoSuchElementException();
832: return Interval.valueOf(positionCache[++pos]);
833: }
834:
835: public Interval nextInterval() {
836: return pos < count - 1 ? Interval
837: .valueOf(positionCache[++pos]) : null;
838: }
839:
840: public int extent() {
841: return 1;
842: }
843:
844: public String toString() {
845: return index + ": " + term + "[doc=" + currentDocument
846: + ", count=" + count + ", pos=" + pos + "]";
847: }
848: };
849:
850: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
851: throws IOException {
852: intervalIterator();
853: return singletonIntervalIterator;
854: }
855:
856: public IntervalIterator intervalIterator() throws IOException {
857: return intervalIterator(keyIndex);
858: }
859:
860: public IntervalIterator intervalIterator(final Index index)
861: throws IOException {
862: if (ASSERTS)
863: ensureCurrentDocument();
864: // TODO: this was if ( index != keyIndex || hasPayloads )
865: if (index != keyIndex)
866: return IntervalIterators.TRUE;
867: if (ASSERTS)
868: assert intervalIterator != null;
869: intervalIterator.reset();
870: return intervalIterator;
871: }
872:
873: public ReferenceSet<Index> indices() {
874: return index.singletonSet;
875: }
876: }
877:
878: private IndexIterator documents(final CharSequence term,
879: final int termNumber) throws IOException {
880: indexIterator.term(term);
881: indexIterator.position(termNumber);
882: return indexIterator;
883: }
884:
885: public IndexIterator documents(final int term) throws IOException {
886: return documents(null, term);
887: }
888:
889: public IndexIterator documents(final CharSequence term)
890: throws IOException {
891: if (closed)
892: throw new IllegalStateException("This "
893: + getClass().getSimpleName() + " has been closed");
894: if (index.termMap != null) {
895: final int termIndex = (int) index.termMap.getLong(term);
896: if (termIndex == -1)
897: return index.emptyIndexIterator;
898: return documents(term, termIndex);
899: }
900: throw new UnsupportedOperationException("Index " + index
901: + " has no term map");
902: }
903:
904: @Override
905: public IndexIterator nextIterator() throws IOException {
906: return indexIterator.advance();
907: }
908:
909: public String toString() {
910: return getClass().getSimpleName() + "[" + index + "]";
911: }
912:
913: public void close() throws IOException {
914: super.close();
915: indexIterator.ibs.close();
916: }
917: }
|