0001: package it.unimi.dsi.mg4j.index.wired;
0002:
0003: /*
0004: * MG4J: Managing Gigabytes for Java
0005: *
0006: * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna
0007: *
0008: * This library is free software; you can redistribute it and/or modify it
0009: * under the terms of the GNU Lesser General Public License as published by the Free
0010: * Software Foundation; either version 2.1 of the License, or (at your option)
0011: * any later version.
0012: *
0013: * This library is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
0015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
0016: * for more details.
0017: *
0018: * You should have received a copy of the GNU Lesser General Public License
0019: * along with this program; if not, write to the Free Software
0020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0021: *
0022: */
0023:
0024: import it.unimi.dsi.fastutil.ints.IntIterator;
0025: import it.unimi.dsi.fastutil.ints.IntIterators;
0026: import it.unimi.dsi.fastutil.ints.IntSet;
0027: import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
0028: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
0029: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
0030: import it.unimi.dsi.fastutil.objects.ReferenceSet;
0031: import it.unimi.dsi.mg4j.index.AbstractIndexIterator;
0032: import it.unimi.dsi.mg4j.index.AbstractIndexReader;
0033: import it.unimi.dsi.mg4j.index.BitStreamHPIndex;
0034: import it.unimi.dsi.mg4j.index.Index;
0035: import it.unimi.dsi.mg4j.index.IndexIterator;
0036: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
0037: import it.unimi.dsi.mg4j.index.payload.Payload;
0038: import it.unimi.dsi.io.InputBitStream;
0039: import it.unimi.dsi.util.Interval;
0040: import it.unimi.dsi.mg4j.search.IntervalIterator;
0041: import it.unimi.dsi.mg4j.search.IntervalIterators;
0042: import it.unimi.dsi.bits.Fast;
0043: import it.unimi.dsi.Util;
0044:
0045: import java.io.IOException;
0046: import java.util.NoSuchElementException;
0047:
0048: import org.apache.log4j.Logger;
0049:
0050: import it.unimi.dsi.mg4j.index.BitStreamIndex;
0051:
0052: public class GammaDeltaGammaDeltaBitStreamHPIndexReader extends
0053: AbstractIndexReader {
0054: @SuppressWarnings("unused")
0055: private static final Logger LOGGER = Util
0056: .getLogger(GammaDeltaGammaDeltaBitStreamHPIndexReader.class);
0057:
0058: private final static boolean ASSERTS = false;
0059: private final static boolean DEBUG = false;
0060: private final static boolean COOKIES = false;
0061:
0062: /** The reference index. */
0063: protected final BitStreamHPIndex index;
0064: /** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */
0065: protected final BitStreamHPIndexReaderIndexIterator indexIterator;
0066:
0067: /**
0068: * Creates a new skip index reader, with the specified underlying {@link Index} and input bit
0069: * stream.
0070: *
0071: * @param index the index.
0072: * @param ibs the underlying bit stream.
0073: */
0074: public GammaDeltaGammaDeltaBitStreamHPIndexReader(
0075: final BitStreamHPIndex index, final InputBitStream ibs,
0076: final InputBitStream positions) {
0077: this .index = index;
0078: this .indexIterator = new BitStreamHPIndexReaderIndexIterator(
0079: this , ibs, positions);
0080: }
0081:
0082: protected static final class BitStreamHPIndexReaderIndexIterator
0083: extends AbstractIndexIterator implements IndexIterator {
0084: /** The enclosing instance. */
0085: private final GammaDeltaGammaDeltaBitStreamHPIndexReader parent;
0086:
0087: /** The reference index. */
0088: protected final BitStreamHPIndex index;
0089:
0090: /** The underlying input bit stream. */
0091: protected final InputBitStream ibs;
0092:
0093: /** The underlying positions input bit stream. */
0094: private final InputBitStream positions;
0095:
0096: /** The enclosed interval iterator. */
0097: private final IndexIntervalIterator intervalIterator;
0098:
0099: /** A singleton set containing the enclosed interval iterator. */
0100: private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator;
0101:
0102: /** The key index. */
0103: private final Index keyIndex;
0104:
0105: /** The cached copy of {@link #index index.pointerCoding}. */
0106: protected final Coding pointerCoding;
0107:
0108: /** The cached copy of {@link #index index.countCoding}. */
0109: protected final Coding countCoding;
0110:
0111: /** The cached copy of {@link #index index.positionCoding}. */
0112: protected final Coding positionCoding;
0113: /** The current term. */
0114: @SuppressWarnings("hiding")
0115: protected int term = -1;
0116: /** The current frequency. */
0117: protected int frequency;
0118: /**
0119: * Whether the current terms has pointers at all (this happens when the {@link #frequency}
0120: * is smaller than the number of documents).
0121: */
0122: protected boolean hasPointers;
0123: /** The current count (if this index contains counts). */
0124: protected int count;
0125: /**
0126: * The last document pointer we read from current list, -1 if we just read the frequency,
0127: * {@link Integer#MAX_VALUE} if we are beyond the end of list.
0128: */
0129: protected int currentDocument;
0130: /** The number of the document record we are going to read inside the current inverted list. */
0131: protected int numberOfDocumentRecord;
0132: /** This variable tracks the current state of the reader. */
0133: protected int state;
0134: /** The parameter <code>h</code> (the maximum height of a skip tower). */
0135: public final int height;
0136: /** The quantum. */
0137: public final int quantum;
0138: /** The bit mask giving the remainder of the division by {@link #quantum}. */
0139: public final int quantumModuloMask;
0140: /** The shift giving result of the division by {@link #quantum}. */
0141: public final int quantumDivisionShift;
0142: /**
0143: * The maximum height of a skip tower in the current block. May be less than {@link #height}
0144: * if the block is defective, and will be -1 on defective quanta (no tower at all).
0145: */
0146: private int maxh;
0147: /** The maximum valid index of the current skip tower, if any. */
0148: private int s;
0149: /**
0150: * The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If
0151: * {@link #maxh} is negative, the value is undefined.
0152: */
0153: private int lowest;
0154: /** We have <var>w</var> = <var>Hq</var>. */
0155: private int w;
0156: /** The bit mask giving the remainder of the division by {@link #w}. */
0157: private final int wModuloMask;
0158: /** The shift giving result of the division by {@link #w}. */
0159: private final int wDivisionShift;
0160: /** The Golomb modulus for a top pointer skip, for each level. */
0161: private int[] towerTopB;
0162: /** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */
0163: private int[] towerTopLog2B;
0164: /** The Golomb modulus for a lower pointer skip, for each level. */
0165: private int[] towerLowerB;
0166: /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
0167: private int[] towerLowerLog2B;
0168: /** The prediction for a pointer skip, for each level. */
0169: private int[] pointerPrediction;
0170: /** An array to decode bit skips. */
0171: private long[] bitSkip;
0172: /** An array to decode positions bit skips. */
0173: private long[] positionsBitSkip;
0174: /** An array to decode the pointer skips. */
0175: private int[] pointerSkip;
0176: /** The number of bits read just after reading the last skip tower. */
0177: private long readBitsAtLastSkipTower;
0178: /** The document pointer corresponding to the last skip tower. */
0179: private int pointerAtLastSkipTower;
0180: /** The number of positions that should be skipped manually
0181: * once the last read tower has been reached by skipping {@link #positionsToReadToReachCurrentPosition}.
0182: */
0183: private int positionsToReadToReachCurrentPosition;
0184: /** The offset in the positions stream from the start of the current quantum
0185: * to the start of the last tower.
0186: */
0187: private long positionsBitsOffset;
0188: /** The last increment added to {@link #positionsBitSkip}. */
0189: private long lastPositionsIncrement;
0190: /** The current quantum bit length, as provided by the index. */
0191: private int quantumBitLength;
0192: /** The current positions quantum bit length, as provided by the index. */
0193: private int positionsQuantumBitLength;
0194: /** The current entry bit length, as provided by the index. */
0195: private int entryBitLength;
0196: /** This value of {@link #state} means that we are positioned just before a tower. */
0197: private static final int BEFORE_TOWER = 0;
0198: /**
0199: * This value of {@link #state} can be assumed only in indices that contain counts; it means
0200: * that we are positioned just before the count for the current document record.
0201: */
0202: private static final int BEFORE_COUNT = 1;
0203: /**
0204: * This value of {@link #state} means that we are at the start of a new document record,
0205: * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} ==
0206: * {@link #frequency}), in which case we are at the end of the inverted list, and
0207: * {@link #endOfList()} is true.
0208: */
0209: private static final int BEFORE_POINTER = 2;
0210: /** Whether the positions for the current document pointer have not been fetched yet. */
0211: private boolean positionsUnread;
0212: /** The cached position array. */
0213: protected int[] positionCache = new int[16];
0214: /** The offset of the positions of the current {@link #term}. */
0215: protected long lastPositionsOffset;
0216:
0217: public BitStreamHPIndexReaderIndexIterator(
0218: final GammaDeltaGammaDeltaBitStreamHPIndexReader parent,
0219: final InputBitStream ibs, final InputBitStream positions) {
0220: this .parent = parent;
0221: this .ibs = ibs;
0222: this .positions = positions;
0223: index = parent.index;
0224: keyIndex = index.keyIndex;
0225: pointerCoding = index.pointerCoding;
0226: countCoding = index.countCoding;
0227: positionCoding = index.positionCoding;
0228: intervalIterator = index.hasPositions ? new IndexIntervalIterator()
0229: : null;
0230: singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps
0231: .singleton(keyIndex,
0232: (IntervalIterator) intervalIterator)
0233: : null;
0234: quantum = index.quantum;
0235: quantumModuloMask = quantum - 1;
0236: quantumDivisionShift = Fast.mostSignificantBit(quantum);
0237: height = index.height;
0238: w = (1 << height) * quantum;
0239: wModuloMask = w - 1;
0240: wDivisionShift = Fast.mostSignificantBit(w);
0241: bitSkip = new long[height + 1];
0242: positionsBitSkip = new long[height + 1];
0243: pointerSkip = new int[height + 1];
0244: towerTopB = new int[height + 1];
0245: towerTopLog2B = new int[height + 1];
0246: towerLowerB = new int[height + 1];
0247: towerLowerLog2B = new int[height + 1];
0248: pointerPrediction = new int[height + 1];
0249: }
0250:
0251: /** Debug variable, usually unused, that keeps track of the end of the positions stream for the current term (but not for term 0!). */
0252: private long positionsLength;
0253:
0254: /**
0255: * Positions the index on the inverted list of a given term.
0256: *
0257: * <p>This method can be called at any time. Note that it is <em>always</em> possible to
0258: * call this method with argument 0, even if offsets have not been loaded.
0259: *
0260: * @param term a term.
0261: */
0262: protected void position(final int term) throws IOException {
0263: if (term == 0) {
0264: if (ASSERTS) {
0265: // Get end of positions
0266: ibs.position(index.offsets.getLong(1));
0267: positionsLength = ibs.readLongDelta();
0268: }
0269: ibs.position(0);
0270: ibs.readBits(0);
0271: lastPositionsOffset = ibs.readLongDelta(); // This is 0 for sure
0272: positions.position(0);
0273: } else {
0274: if (index.offsets == null)
0275: throw new IllegalStateException(
0276: "You cannot position an index without offsets");
0277: if (ASSERTS) {
0278: // Get end of positions
0279: if (term < index.numberOfTerms - 1) {
0280: ibs.position(index.offsets.getLong(term + 1));
0281: positionsLength = ibs.readLongDelta();
0282: } else
0283: positionsLength = Long.MAX_VALUE; // Presently, no way to check
0284: }
0285: ibs.position(index.offsets.getLong(term));
0286: ibs.readBits(index.offsets.getLong(term));
0287: // Let us position the positions bitstream
0288: lastPositionsOffset = ibs.readLongDelta();
0289: positions.position(lastPositionsOffset);
0290: if (ASSERTS && positionsLength != Long.MAX_VALUE)
0291: positionsLength -= lastPositionsOffset;
0292: if (DEBUG && ASSERTS)
0293: System.err.println(this + ": positions for term "
0294: + term + " start at bit "
0295: + lastPositionsOffset + " ("
0296: + positionsLength + " bits)");
0297: }
0298: positions.readBits(0);
0299: this .term = term;
0300: readFrequency();
0301: }
0302:
0303: public int termNumber() {
0304: return term;
0305: }
0306:
0307: protected IndexIterator advance() throws IOException {
0308: if (term == index.numberOfTerms - 1)
0309: return null;
0310: if (term != -1) {
0311: skipTo(Integer.MAX_VALUE);
0312: // This guarantees we have no garbage before the frequency
0313: nextDocument();
0314: positionsUnread = false;
0315: }
0316: // We must skip the offset into the positions bitstream
0317: final long nextPositionsOffset = ibs.readLongDelta();
0318: positions.skip(nextPositionsOffset - lastPositionsOffset
0319: - positions.readBits());
0320: lastPositionsOffset = nextPositionsOffset;
0321: positions.readBits(0);
0322: term++;
0323: if (ASSERTS)
0324: positionsLength = -1; // Invalidate
0325: readFrequency();
0326: return this ;
0327: }
0328:
0329: private void readFrequency() throws IOException {
0330: // Read the frequency
0331: frequency = ibs.readGamma() + 1;
0332: if (DEBUG)
0333: System.err.println(this + ": Frequency for term "
0334: + term + " is " + frequency);
0335: hasPointers = frequency < index.numberOfDocuments;
0336: quantumBitLength = positionsQuantumBitLength = entryBitLength = -1;
0337: lowest = Integer.MAX_VALUE;
0338: if (ASSERTS)
0339: for (int i = height; i > Math
0340: .min(
0341: height,
0342: Fast
0343: .mostSignificantBit(frequency >> quantumDivisionShift)); i--)
0344: towerTopB[i] = towerLowerB[i] = pointerPrediction[i] = -1;
0345: final long pointerQuantumSigma = BitStreamIndex
0346: .quantumSigma(frequency, index.numberOfDocuments,
0347: quantum);
0348: for (int i = Math
0349: .min(
0350: height,
0351: Fast
0352: .mostSignificantBit(frequency >> quantumDivisionShift)); i >= 0; i--) {
0353: towerTopB[i] = BitStreamIndex.gaussianGolombModulus(
0354: pointerQuantumSigma, i + 1);
0355: towerTopLog2B[i] = Fast
0356: .mostSignificantBit(towerTopB[i]);
0357: towerLowerB[i] = BitStreamIndex.gaussianGolombModulus(
0358: pointerQuantumSigma, i);
0359: towerLowerLog2B[i] = Fast
0360: .mostSignificantBit(towerLowerB[i]);
0361: pointerPrediction[i] = (int) ((quantum * (1L << i)
0362: * index.numberOfDocuments + frequency / 2) / frequency);
0363: }
0364: count = -1;
0365: currentDocument = -1;
0366: numberOfDocumentRecord = -1;
0367: positionsBitsOffset = 0;
0368: positionsBitSkip[0] = 0; // To avoid spurious tower updates on the first tower
0369: positionsToReadToReachCurrentPosition = 0;
0370: lastPositionsIncrement = 0;
0371: state = BEFORE_POINTER;
0372: }
0373:
0374: public Index index() {
0375: return keyIndex;
0376: }
0377:
0378: public int frequency() {
0379: return frequency;
0380: }
0381:
0382: private void ensureCurrentDocument() {
0383: if (currentDocument < 0)
0384: throw new IllegalStateException(
0385: "nextDocument() has never been called for (term="
0386: + term + ")");
0387: if (currentDocument == Integer.MAX_VALUE)
0388: throw new IllegalStateException(
0389: "This reader is positioned beyond the end of list of (term="
0390: + term + ")");
0391: }
0392:
0393: /**
0394: * Returns whether there are no more document records in the current inverted list.
0395: *
0396: * <p>This method returns true if the last document pointer of the current inverted list
0397: * has been read. It makes no distinction as to where (inside the last document record) this
0398: * reader is currently positioned. In particular, this method will return true independently
0399: * of whether count and positions have been read or not (we note by passing that this is the
0400: * only sensible behaviour, as you can build indices with or without counts/positions).
0401: *
0402: * <p>This method will return true also when this reader is positioned <em>beyond</em>
0403: * the last document pointer. In this case, {@link #currentDocumentPointer()} will return
0404: * {@link Integer#MAX_VALUE}.
0405: *
0406: * @return true whether there are no more document records in the current inverted list.
0407: */
0408: private boolean endOfList() {
0409: if (ASSERTS)
0410: assert numberOfDocumentRecord <= frequency;
0411: return numberOfDocumentRecord >= frequency - 1;
0412: }
0413:
0414: public int document() {
0415: if (ASSERTS)
0416: ensureCurrentDocument();
0417: return currentDocument;
0418: }
0419:
0420: public Payload payload() throws IOException {
0421: throw new UnsupportedOperationException("This index ("
0422: + index + ") does not contain payloads");
0423: }
0424:
0425: public int count() throws IOException {
0426: if (DEBUG)
0427: System.err.println(this + ".count()");
0428: if (count != -1)
0429: return count;
0430: if (ASSERTS)
0431: ensureCurrentDocument();
0432: if (state == BEFORE_TOWER)
0433: readTower();
0434: if (ASSERTS && state != BEFORE_COUNT)
0435: throw new IllegalStateException();
0436: state = BEFORE_POINTER;
0437: count = ibs.readGamma() + 1;
0438: return count;
0439: }
0440:
0441: protected void updatePositionCache() throws IOException {
0442: if (DEBUG)
0443: System.err.println(this + ".updatePositionCache()");
0444: positionsUnread = false;
0445: count(); // This will force reading the tower and updating positionsBitsOffset, if necessary
0446: if (positionsBitsOffset > positions.readBits()) {
0447: if (DEBUG)
0448: System.err.println(this
0449: + ": positionsBitsOffset="
0450: + positionsBitsOffset
0451: + ", positions.readBits()="
0452: + positions.readBits()
0453: + ", skipping by "
0454: + (positionsBitsOffset - positions
0455: .readBits()));
0456: positions.skip(positionsBitsOffset
0457: - positions.readBits());
0458: }
0459: if (ASSERTS)
0460: assert positionsToReadToReachCurrentPosition >= 0 : positionsToReadToReachCurrentPosition
0461: + " < 0";
0462: if (positionsToReadToReachCurrentPosition > 0) {
0463: if (DEBUG)
0464: System.err.println(this + ":Skipping sequentially "
0465: + positionsToReadToReachCurrentPosition
0466: + " positions...");
0467: // We skip, inside the current quantum, the positions we haven't read
0468: if (COOKIES) {
0469: positionsToReadToReachCurrentPosition--;
0470: if (positions.readDelta() != Integer.MAX_VALUE)
0471: throw new AssertionError();
0472: }
0473: positions
0474: .skipDeltas(positionsToReadToReachCurrentPosition);
0475: }
0476: // We must fix it so that nextDocument() will restore it to 0
0477: positionsToReadToReachCurrentPosition = -count;
0478: if (COOKIES)
0479: positionsToReadToReachCurrentPosition--;
0480: if (count > positionCache.length)
0481: positionCache = new int[Math.max(
0482: positionCache.length * 2, count)];
0483: final int[] occ = positionCache;
0484: if (COOKIES && positions.readDelta() != Integer.MAX_VALUE)
0485: throw new AssertionError();
0486: positions.readDeltas(occ, count);
0487: for (int i = 1; i < count; i++)
0488: occ[i] += occ[i - 1] + 1;
0489: }
0490:
0491: public IntIterator positions() throws IOException {
0492: if (ASSERTS)
0493: ensureCurrentDocument();
0494: if (positionsUnread)
0495: updatePositionCache();
0496: return IntIterators.wrap(positionCache, 0, count);
0497: }
0498:
0499: public int[] positionArray() throws IOException {
0500: if (ASSERTS)
0501: ensureCurrentDocument();
0502: if (positionsUnread)
0503: updatePositionCache();
0504: return positionCache;
0505: }
0506:
0507: // TODO: check who's using this (positionArray() is actually faster now)
0508: public int positions(final int[] position) throws IOException {
0509: if (ASSERTS)
0510: ensureCurrentDocument();
0511: if (positionsUnread)
0512: updatePositionCache(); // And also that positions have
0513: // been read
0514: if (position.length < count)
0515: return -count;
0516: for (int i = count; i-- != 0;)
0517: position[i] = this .positionCache[i];
0518: return count;
0519: }
0520:
0521: public int nextDocument() throws IOException {
0522: if (DEBUG)
0523: System.err.println("{" + this + "} nextDocument()");
0524: if (state != BEFORE_POINTER) {
0525: if (state == BEFORE_TOWER)
0526: readTower();
0527: if (state == BEFORE_COUNT) {
0528: count = ibs.readGamma() + 1;
0529: state = BEFORE_POINTER;
0530: }
0531: }
0532: if (endOfList())
0533: return -1;
0534: if (hasPointers) {// We do not write pointers for everywhere occurring terms.
0535: currentDocument += ibs.readDelta() + 1;
0536: } else
0537: currentDocument++;
0538: numberOfDocumentRecord++;
0539: if (ASSERTS && numberOfDocumentRecord > quantum)
0540: assert positionsBitsOffset > 0;
0541: if ((numberOfDocumentRecord & quantumModuloMask) == 0) {
0542: state = BEFORE_TOWER;
0543: positionsToReadToReachCurrentPosition = 0;
0544: } else {
0545: state = BEFORE_COUNT;
0546: if (ASSERTS)
0547: assert count > 0 : count + " <= " + 0;
0548: positionsToReadToReachCurrentPosition += count;
0549: if (COOKIES)
0550: positionsToReadToReachCurrentPosition++;
0551: }
0552: count = -1;
0553: positionsUnread = true;
0554: return currentDocument;
0555: }
0556:
0557: /**
0558: * Reads the entire skip tower for the current position. This method assumes that the tower
0559: * has been passed over sequentially, and correpondingly sets {@link #lastPositionsIncrement}
0560: * to the number of positions bits of the last quantum.
0561: */
0562: private void readTower() throws IOException {
0563: lastPositionsIncrement = maxh >= 0 ? positionsBitSkip[0]
0564: : 0;
0565: if (DEBUG)
0566: System.err.println(this
0567: + ": Setting lastPositionsIncrement to "
0568: + lastPositionsIncrement + " in readTower()");
0569: readTower(-1);
0570: if (DEBUG)
0571: System.err.println(this
0572: + ": Incrementing positionsBitsOffset by "
0573: + lastPositionsIncrement + " in readTower()");
0574: // TODO: this should be moved into readTower(int)
0575: positionsBitsOffset += lastPositionsIncrement;
0576: }
0577:
0578: /**
0579: * Reads the skip tower for the current position, possibly skipping part of the tower.
0580: *
0581: * <P>Note that this method will update {@link #state} only if it reads the entire tower,
0582: * otherwise the state remains {@link #BEFORE_TOWER}.
0583: *
0584: * @param pointer the tower will be read up to the first entry smaller than or equal to this
0585: * pointer; use -1 to guarantee that the entire tower will be read.
0586: */
0587: private void readTower(final int pointer) throws IOException {
0588: int i, j, k, cacheOffset, cache, towerLength = 0;
0589: long bitsAtTowerStart = 0;
0590: boolean truncated = false;
0591: if (ASSERTS)
0592: assert numberOfDocumentRecord % quantum == 0;
0593: if (ASSERTS && state != BEFORE_TOWER)
0594: throw new IllegalStateException(
0595: "readTower() called in state " + state);
0596: cacheOffset = (numberOfDocumentRecord & wModuloMask);
0597: k = cacheOffset >> quantumDivisionShift;
0598: if (ASSERTS && k == 0) { // Invalidate current tower data
0599: it.unimi.dsi.fastutil.ints.IntArrays.fill(pointerSkip,
0600: Integer.MAX_VALUE);
0601: it.unimi.dsi.fastutil.longs.LongArrays.fill(bitSkip,
0602: Integer.MAX_VALUE);
0603: it.unimi.dsi.fastutil.longs.LongArrays.fill(
0604: positionsBitSkip, Integer.MAX_VALUE);
0605: }
0606: // Compute the height of the current skip tower.
0607: s = (k == 0) ? height : Fast.leastSignificantBit(k);
0608: cache = frequency - w
0609: * (numberOfDocumentRecord >> wDivisionShift);
0610: if (cache < w) {
0611: maxh = Fast
0612: .mostSignificantBit((cache >> quantumDivisionShift)
0613: - k);
0614: if (maxh < s) {
0615: s = maxh;
0616: truncated = true;
0617: } else
0618: truncated = false;
0619: } else {
0620: cache = w;
0621: maxh = height;
0622: truncated = k == 0;
0623: }
0624: // assert w == cache || k == 0 || lastMaxh == Fast.mostSignificantBit( k ^ (
0625: // cache/quantum ) ) : lastMaxh +","+ (Fast.mostSignificantBit( k ^ ( cache/quantum )
0626: // ));
0627: i = s;
0628: if (s >= 0) {
0629: if (k == 0) {
0630: if (quantumBitLength < 0) {
0631: quantumBitLength = ibs.readDelta();
0632: positionsQuantumBitLength = ibs.readDelta();
0633: entryBitLength = ibs.readDelta();
0634: } else {
0635: quantumBitLength += Fast.nat2int(ibs
0636: .readDelta());
0637: positionsQuantumBitLength += Fast.nat2int(ibs
0638: .readDelta());
0639: entryBitLength += Fast.nat2int(ibs.readDelta());
0640: }
0641: if (DEBUG)
0642: System.err
0643: .println("{"
0644: + this
0645: + "} quantum bit length="
0646: + quantumBitLength
0647: + " positions quantum bit length="
0648: + positionsQuantumBitLength
0649: + " entry bit length="
0650: + entryBitLength);
0651: }
0652: if (DEBUG)
0653: System.err.println("{" + this
0654: + "} Reading tower; pointer=" + pointer
0655: + " maxh=" + maxh + " s=" + s);
0656: if (s > 0) {
0657: towerLength = entryBitLength * (s + 1)
0658: + Fast.nat2int(ibs.readDelta());
0659: if (DEBUG)
0660: System.err.println("{" + this
0661: + "} Tower length=" + towerLength);
0662: }
0663: // We store the number of bits read at the start of the tower (just after the
0664: // length).
0665: bitsAtTowerStart = ibs.readBits();
0666: if (truncated) {
0667: if (DEBUG)
0668: System.err.println("{" + this
0669: + "} Truncated--reading tops");
0670: // We read the tower top.
0671: pointerSkip[s] = Fast.nat2int(ibs.readGolomb(
0672: towerTopB[s], towerTopLog2B[s]))
0673: + pointerPrediction[s];
0674: bitSkip[s] = quantumBitLength * (1 << s)
0675: + entryBitLength * ((1 << s + 1) - s - 2)
0676: + Fast.nat2int(ibs.readLongDelta());
0677: positionsBitSkip[s] = positionsQuantumBitLength
0678: * (1 << s)
0679: + Fast.nat2int(ibs.readLongDelta());
0680: } else {
0681: // We copy the tower top from the lowest inherited entry suitably updated.
0682: pointerSkip[s] = pointerSkip[s + 1]
0683: - (currentDocument - pointerAtLastSkipTower);
0684: bitSkip[s] = bitSkip[s + 1]
0685: - (bitsAtTowerStart - readBitsAtLastSkipTower)
0686: - towerLength;
0687: // TODO: note that we could use the same method for pointers and bit skips!
0688: positionsBitSkip[s] = positionsBitSkip[s + 1]
0689: - positionsBitSkip[s];
0690: if (ASSERTS)
0691: assert positionsBitSkip[s] > 0 : positionsBitSkip[s]
0692: + " <= " + 0;
0693: }
0694: // We read the remaining part of the tower, at least until we point after pointer.
0695: if (currentDocument + pointerSkip[i] > pointer) {
0696: for (i = s - 1; i >= 0; i--) {
0697: pointerSkip[i] = Fast.nat2int(ibs.readGolomb(
0698: towerLowerB[i], towerLowerLog2B[i]))
0699: + pointerSkip[i + 1] / 2;
0700: bitSkip[i] = (bitSkip[i + 1] - entryBitLength
0701: * (i + 1))
0702: / 2 - Fast.nat2int(ibs.readLongDelta());
0703: positionsBitSkip[i] = positionsBitSkip[i + 1]
0704: / 2 - Fast.nat2int(ibs.readLongDelta());
0705: if (DEBUG
0706: && currentDocument + pointerSkip[i] <= pointer)
0707: System.err
0708: .println("{"
0709: + this
0710: + "} stopping reading at i="
0711: + i
0712: + " as currentDocument ("
0713: + currentDocument
0714: + ") plus pointer skip ("
0715: + pointerSkip[i]
0716: + ") is smaller than or equal target ("
0717: + pointer + ")");
0718: if (currentDocument + pointerSkip[i] <= pointer)
0719: break;
0720: }
0721: }
0722: }
0723: /*
0724: * If we did not read the entire tower, we need to fix the skips we read (as they are
0725: * offsets from the *end* of the tower) and moreover we must make unusable the rest of
0726: * the tower (for asserts).
0727: */
0728: if (i > 0) {
0729: final long fix = ibs.readBits() - bitsAtTowerStart;
0730: for (j = s; j >= i; j--)
0731: bitSkip[j] += towerLength - fix;
0732: if (ASSERTS)
0733: for (; j >= 0; j--)
0734: pointerSkip[j] = Integer.MAX_VALUE;
0735: } else
0736: state = BEFORE_COUNT;
0737: // We update the inherited tower.
0738: final long deltaBits = ibs.readBits()
0739: - readBitsAtLastSkipTower;
0740: final int deltaPointers = currentDocument
0741: - pointerAtLastSkipTower;
0742: if (ASSERTS)
0743: assert lastPositionsIncrement >= 0 : lastPositionsIncrement
0744: + " < " + 0;
0745: for (j = Fast.mostSignificantBit(k
0746: ^ (cache >> quantumDivisionShift)); j >= s + 1; j--) {
0747: bitSkip[j] -= deltaBits;
0748: positionsBitSkip[j] -= lastPositionsIncrement;
0749: pointerSkip[j] -= deltaPointers;
0750: }
0751: readBitsAtLastSkipTower = ibs.readBits();
0752: pointerAtLastSkipTower = currentDocument;
0753: lowest = i < 0 ? 0 : i;
0754: if (DEBUG) {
0755: System.err.println("{" + this + "} "
0756: + "Computed skip tower (lowest: " + lowest
0757: + ") for document record number "
0758: + numberOfDocumentRecord + " (pointer "
0759: + currentDocument + ") from " + Math.max(i, 0)
0760: + ": ");
0761: System.err.print("% ");
0762: for (j = 0; j <= s; j++)
0763: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
0764: + ":" + positionsBitSkip[j] + " ");
0765: System.err.print(" [");
0766: for (; j <= height; j++)
0767: System.err.print(pointerSkip[j] + ":" + bitSkip[j]
0768: + ":" + positionsBitSkip[j] + " ");
0769: System.err.print("]");
0770: System.err.println();
0771: }
0772: if (ASSERTS) {
0773: for (j = Fast.mostSignificantBit(k
0774: ^ (cache >> quantumDivisionShift)); j >= s + 1; j--) {
0775: assert positionsBitSkip[j] >= 0 : "positionsBitSkip["
0776: + j
0777: + "] = "
0778: + positionsBitSkip[j]
0779: + " < "
0780: + 0;
0781: assert positionsBitSkip[j] >= positionsBitSkip[j - 1] : "positionsBitSkip["
0782: + j
0783: + "] = "
0784: + positionsBitSkip[j]
0785: + " < "
0786: + positionsBitSkip[j - 1]
0787: + " = positionsBitSkip[" + (j - 1) + "]";
0788: }
0789: }
0790: }
0791:
0792: public int skipTo(final int p) throws IOException {
0793: if (DEBUG)
0794: System.err.println(this + ".skipTo(" + p
0795: + ") [currentDocument=" + currentDocument
0796: + ", numberOfDocumentRecord="
0797: + numberOfDocumentRecord
0798: + ", positionsBitsOffset="
0799: + positionsBitsOffset + "]");
0800: // If we are just at the start of a list, let us read the first pointer.
0801: if (numberOfDocumentRecord == -1)
0802: nextDocument(); // TODO: shouldn't we just read the
0803: // tower?
0804: if (currentDocument >= p) {
0805: if (DEBUG)
0806: System.err.println(this
0807: + ": No skip necessary, returning "
0808: + currentDocument);
0809: return currentDocument;
0810: }
0811: if (state == BEFORE_TOWER) {
0812: lastPositionsIncrement = maxh >= 0 ? positionsBitSkip[0]
0813: : 0;
0814: readTower(p);
0815: if (DEBUG)
0816: System.err.println(this
0817: + ": Incrementing positionsBitsOffset by "
0818: + lastPositionsIncrement
0819: + " in skipTo() initial phase");
0820: positionsBitsOffset += lastPositionsIncrement;
0821: }
0822: final int[] pointerSkip = this .pointerSkip;
0823: for (;;) {
0824: if (ASSERTS)
0825: assert maxh < 0 || lowest > 0
0826: || pointerSkip[0] != Integer.MAX_VALUE;
0827: // If on a defective quantum (no tower) or p is inside the current quantum (no
0828: // need to scan the tower) we bail out.
0829: if (maxh < 0 || lowest == 0
0830: && pointerAtLastSkipTower + pointerSkip[0] > p)
0831: break;
0832: if (DEBUG)
0833: System.err.println(this
0834: + ": In for loop, currentDocument="
0835: + currentDocument + ", maxh=" + maxh
0836: + ", numberOfDocumentRecord="
0837: + numberOfDocumentRecord + ", p=" + p);
0838: final int cacheOffset = (numberOfDocumentRecord & wModuloMask);
0839: final int k = cacheOffset >> quantumDivisionShift;
0840: final int top = Fast
0841: .mostSignificantBit(k
0842: ^ (Math.min(w, frequency
0843: - numberOfDocumentRecord
0844: + cacheOffset) >> quantumDivisionShift));
0845: int i;
0846: for (i = lowest; i <= top; i++) {
0847: if (ASSERTS && (k & 1 << i) != 0)
0848: assert pointerSkip[i] == pointerSkip[i + 1];
0849: if (ASSERTS)
0850: assert pointerSkip[i] != Integer.MAX_VALUE : "Invalid pointer skip "
0851: + i
0852: + " (lowest="
0853: + lowest
0854: + ", top="
0855: + top + ")";
0856: if (pointerAtLastSkipTower + pointerSkip[i] > p)
0857: break;
0858: }
0859: if (--i < 0)
0860: break;
0861: if (ASSERTS)
0862: assert i >= lowest : i + " < " + lowest;
0863: if (DEBUG)
0864: System.err.println(this
0865: + ": Safely after for with i=" + i
0866: + ", P[i]=" + pointerSkip[i] + ", A[i]="
0867: + bitSkip[i] + ", B[i]="
0868: + positionsBitSkip[i]);
0869: if (DEBUG)
0870: System.err
0871: .println(this
0872: + ": ["
0873: + ibs.readBits()
0874: + "] Skipping "
0875: + (bitSkip[i] - (ibs.readBits() - readBitsAtLastSkipTower))
0876: + " bits ("
0877: + (((k & -(1 << i)) + (1 << i))
0878: * quantum - cacheOffset)
0879: + " records) to get to document pointer "
0880: + (currentDocument + pointerSkip[i]));
0881: ibs.skip(bitSkip[i]
0882: - (ibs.readBits() - readBitsAtLastSkipTower));
0883: /* We accumulate the number of bits that must be skipped in the positions stream to reach the position
0884: * corresponding to the current tower. */
0885: if (DEBUG)
0886: System.err.println(this
0887: + ": Incrementing positionsBitsOffset by "
0888: + positionsBitSkip[i] + " (height " + i
0889: + ")");
0890: lastPositionsIncrement = positionsBitSkip[i];
0891: positionsBitsOffset += lastPositionsIncrement;
0892: if (ASSERTS && positionsLength != -1)
0893: assert positionsBitsOffset <= positionsLength : positionsBitsOffset
0894: + " > " + positionsLength;
0895: state = BEFORE_TOWER;
0896: currentDocument = pointerSkip[i]
0897: + pointerAtLastSkipTower;
0898: numberOfDocumentRecord += ((k & -(1 << i)) + (1 << i))
0899: * quantum - cacheOffset;
0900: // If we skipped beyond the end of the list, we invalidate the current document.
0901: if (numberOfDocumentRecord == frequency) {
0902: currentDocument = Integer.MAX_VALUE;
0903: positionsUnread = false;
0904: state = BEFORE_POINTER; // We are actually before a frequency, but we must
0905: // avoid that calls to nextDocument() read anything
0906: } else {
0907: positionsUnread = true;
0908: readTower(p); // Note that if we are exactly on the destination pointer, we will read the entire tower.
0909: }
0910: count = -1; // We must invalidate count as readDocumentPointer() would do.
0911: positionsToReadToReachCurrentPosition = 0;
0912: if (endOfList()) {
0913: if (DEBUG)
0914: System.err.println(this
0915: + ".toSkip(): end-of-list, returning "
0916: + currentDocument + ", state=" + state);
0917: // Note that if p == Integer.MAX_VALUE, we are certainly beyond end-of-list
0918: return p == Integer.MAX_VALUE ? Integer.MAX_VALUE
0919: : currentDocument;
0920: }
0921: }
0922: if (DEBUG)
0923: System.err.println(this
0924: + ": Completing sequentially, currentDocument="
0925: + currentDocument + ", numberOfDocumentRecord="
0926: + numberOfDocumentRecord + ", p=" + p);
0927: while (currentDocument < p) {
0928: if (DEBUG)
0929: System.err
0930: .println(this
0931: + ": Skipping sequentially (second), currentDocument="
0932: + currentDocument
0933: + ", numberOfDocumentRecord="
0934: + numberOfDocumentRecord + ", p="
0935: + p);
0936: if (nextDocument() == -1) {
0937: if (DEBUG)
0938: System.err.println(this
0939: + ": end-of-list, returning MAX_VALUE");
0940: return Integer.MAX_VALUE;
0941: }
0942: }
0943: if (DEBUG)
0944: System.err.println(this + ".toSkip(): Returning "
0945: + currentDocument);
0946: return currentDocument;
0947: }
0948:
0949: public void dispose() throws IOException {
0950: parent.close();
0951: }
0952:
0953: public boolean hasNext() {
0954: return !endOfList();
0955: }
0956:
0957: public int nextInt() {
0958: if (!hasNext())
0959: throw new NoSuchElementException();
0960: try {
0961: return nextDocument();
0962: } catch (IOException e) {
0963: throw new RuntimeException(e);
0964: }
0965: }
0966:
0967: public String toString() {
0968: return index + " [" + term + "]";
0969: }
0970:
0971: /**
0972: * An interval iterator returning the positions of the current document as singleton
0973: * intervals.
0974: */
0975: private final class IndexIntervalIterator extends
0976: AbstractObjectIterator<Interval> implements
0977: IntervalIterator {
0978: int pos = -1;
0979:
0980: public void reset() throws IOException {
0981: pos = -1;
0982: if (positionsUnread)
0983: updatePositionCache(); // This guarantees the position cache is ok
0984: }
0985:
0986: public void intervalTerms(final IntSet terms) {
0987: terms
0988: .add(BitStreamHPIndexReaderIndexIterator.this .term);
0989: }
0990:
0991: public boolean hasNext() {
0992: return pos < count - 1;
0993: }
0994:
0995: public Interval next() {
0996: if (!hasNext())
0997: throw new NoSuchElementException();
0998: return Interval.valueOf(positionCache[++pos]);
0999: }
1000:
1001: public Interval nextInterval() {
1002: return pos < count - 1 ? Interval
1003: .valueOf(positionCache[++pos]) : null;
1004: }
1005:
1006: public int extent() {
1007: return 1;
1008: }
1009:
1010: public String toString() {
1011: return index + ": " + term + "[doc=" + currentDocument
1012: + ", count=" + count + ", pos=" + pos + "]";
1013: }
1014: };
1015:
1016: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
1017: throws IOException {
1018: intervalIterator();
1019: return singletonIntervalIterator;
1020: }
1021:
1022: public IntervalIterator intervalIterator() throws IOException {
1023: return intervalIterator(keyIndex);
1024: }
1025:
1026: public IntervalIterator intervalIterator(final Index index)
1027: throws IOException {
1028: if (ASSERTS)
1029: ensureCurrentDocument();
1030: // TODO: this was if ( index != keyIndex || hasPayloads )
1031: if (index != keyIndex)
1032: return IntervalIterators.TRUE;
1033: if (ASSERTS)
1034: assert intervalIterator != null;
1035: intervalIterator.reset();
1036: return intervalIterator;
1037: }
1038:
1039: public ReferenceSet<Index> indices() {
1040: return index.singletonSet;
1041: }
1042: }
1043:
1044: private IndexIterator documents(final CharSequence term,
1045: final int termNumber) throws IOException {
1046: indexIterator.term(term);
1047: indexIterator.position(termNumber);
1048: return indexIterator;
1049: }
1050:
1051: public IndexIterator documents(final int term) throws IOException {
1052: return documents(null, term);
1053: }
1054:
1055: public IndexIterator documents(final CharSequence term)
1056: throws IOException {
1057: if (closed)
1058: throw new IllegalStateException("This "
1059: + getClass().getSimpleName() + " has been closed");
1060: if (index.termMap != null) {
1061: final int termIndex = (int) index.termMap.getLong(term);
1062: if (termIndex == -1)
1063: return index.emptyIndexIterator;
1064: return documents(term, termIndex);
1065: }
1066: throw new UnsupportedOperationException("Index " + index
1067: + " has no term map");
1068: }
1069:
1070: @Override
1071: public IndexIterator nextIterator() throws IOException {
1072: return indexIterator.advance();
1073: }
1074:
1075: public String toString() {
1076: return getClass().getSimpleName() + "[" + index + "]";
1077: }
1078:
1079: public void close() throws IOException {
1080: super.close();
1081: indexIterator.ibs.close();
1082: indexIterator.positions.close();
1083: }
1084: }
|