001: package it.unimi.dsi.mg4j.index.wired;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2006 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023: import it.unimi.dsi.fastutil.ints.IntIterator;
024: import it.unimi.dsi.fastutil.ints.IntIterators;
025: import it.unimi.dsi.fastutil.ints.IntSet;
026: import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
027: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
028: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
029: import it.unimi.dsi.fastutil.objects.ReferenceSet;
030: import it.unimi.dsi.mg4j.index.AbstractIndexIterator;
031: import it.unimi.dsi.mg4j.index.AbstractIndexReader;
032: import it.unimi.dsi.mg4j.index.BitStreamIndex;
033: import it.unimi.dsi.mg4j.index.Index;
034: import it.unimi.dsi.mg4j.index.IndexIterator;
035: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
036: import it.unimi.dsi.mg4j.index.payload.Payload;
037: import it.unimi.dsi.io.InputBitStream;
038: import it.unimi.dsi.util.Interval;
039: import it.unimi.dsi.mg4j.search.IntervalIterator;
040: import it.unimi.dsi.mg4j.search.IntervalIterators;
041: import it.unimi.dsi.bits.Fast;
042: import it.unimi.dsi.Util;
043: import java.io.IOException;
044: import java.util.NoSuchElementException;
045: import org.apache.log4j.Logger;
046:
047: public class GammaDeltaGammaDeltaBitStreamIndexReader extends
048: AbstractIndexReader {
049: @SuppressWarnings("unused")
050: private static final Logger LOGGER = Util
051: .getLogger(GammaDeltaGammaDeltaBitStreamIndexReader.class);
052: /** The reference index. */
053: protected final BitStreamIndex index;
054: private final static boolean ASSERTS = false;
055: private final static boolean DEBUG = false;
056: /** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */
057: protected final BitStreamIndexReaderIndexIterator indexIterator;
058:
059: /** Creates a new skip index reader, with the specified underlying {@link Index} and input bit stream.
060: *
061: * @param index the index.
062: * @param ibs the underlying bit stream.
063: */
064: public GammaDeltaGammaDeltaBitStreamIndexReader(
065: final BitStreamIndex index, final InputBitStream ibs) {
066: this .index = index;
067: this .indexIterator = new BitStreamIndexReaderIndexIterator(
068: this , ibs);
069: }
070:
071: protected static final class BitStreamIndexReaderIndexIterator
072: extends AbstractIndexIterator implements IndexIterator {
073: /** The enclosing instance. */
074: private final GammaDeltaGammaDeltaBitStreamIndexReader parent;
075: /** The reference index. */
076: protected final BitStreamIndex index;
077: /** The underlying input bit stream. */
078: protected final InputBitStream ibs;
079: /** The enclosed interval iterator. */
080: private final IndexIntervalIterator intervalIterator;
081: /** A singleton set containing the enclosed interval iterator. */
082: private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator;
083: /** The key index. */
084: private final Index keyIndex;
085: /** The cached copy of {@link #index index.pointerCoding}. */
086: protected final Coding pointerCoding;
087: /** The cached copy of {@link #index index.countCoding}. */
088: protected final Coding countCoding;
089: /** The cached copy of {@link #index index.positionCoding}. */
090: protected final Coding positionCoding;
091: /** The current term. */
092: protected int currentTerm = -1;
093: /** The current frequency. */
094: protected int frequency;
095: /** Whether the current terms has pointers at all (this happens when the {@link #frequency} is smaller than the number of documents). */
096: protected boolean hasPointers;
097: /** The current count (if this index contains counts). */
098: protected int count;
099: /** The last document pointer we read from current list, -1 if we just read the frequency,
100: * {@link Integer#MAX_VALUE} if we are beyond the end of list. */
101: protected int currentDocument;
102: /** The number of the document record we are going to read inside the current inverted list. */
103: protected int numberOfDocumentRecord;
104: /** This variable tracks the current state of the reader. */
105: protected int state;
106: /** This value of {@link #state} can be assumed only in indices that contain a payload; it
107: * means that we are positioned just before the payload for the current document record. */
108: private static final int BEFORE_PAYLOAD = 1;
109: /** This value of {@link #state} can be assumed only in indices that contain counts; it
110: * means that we are positioned just before the count for the current document record. */
111: private static final int BEFORE_COUNT = 2;
112: /** This value of {@link #state} can be assumed only in indices that contain document positions;
113: * it means that we are positioned just before the position list of the current document record. */
114: private static final int BEFORE_POSITIONS = 3;
115: /** This value of {@link #state} means that we are at the start of a new document record,
116: * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} == {@link #frequency}),
117: * in which case we are at the end of the inverted list, and {@link #endOfList()} is true. */
118: private static final int BEFORE_POINTER = 4;
119: /** The cached position array. */
120: protected int[] positionCache = new int[16];
121:
122: public BitStreamIndexReaderIndexIterator(
123: final GammaDeltaGammaDeltaBitStreamIndexReader parent,
124: final InputBitStream ibs) {
125: this .parent = parent;
126: this .ibs = ibs;
127: index = parent.index;
128: keyIndex = index.keyIndex;
129: pointerCoding = index.pointerCoding;
130: if (index.hasPayloads)
131: throw new IllegalStateException();
132: if (!index.hasCounts)
133: throw new IllegalStateException();
134: countCoding = index.countCoding;
135: if (!index.hasPositions)
136: throw new IllegalStateException();
137: positionCoding = index.positionCoding;
138: intervalIterator = index.hasPositions ? new IndexIntervalIterator()
139: : null;
140: singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps
141: .singleton(keyIndex,
142: (IntervalIterator) intervalIterator)
143: : null;
144: }
145:
146: /** Positions the index on the inverted list of a given term.
147: *
148: * <p>This method can be called at any time. Note that it is <em>always</em> possible
149: * to call this method with argument 0, even if offsets have not been loaded.
150: *
151: * @param term a term.
152: */
153: protected void position(final int term) throws IOException {
154: if (term == 0) {
155: ibs.position(0);
156: ibs.readBits(0);
157: } else {
158: if (index.offsets == null)
159: throw new IllegalStateException(
160: "You cannot position an index without offsets");
161: final long offset = index.offsets.getLong(term);
162: ibs.position(offset);
163: // TODO: Can't we set this to 0?
164: ibs.readBits(offset);
165: }
166: currentTerm = term;
167: readFrequency();
168: }
169:
170: public int termNumber() {
171: return currentTerm;
172: }
173:
174: protected IndexIterator advance() throws IOException {
175: if (currentTerm == index.numberOfTerms - 1)
176: return null;
177: if (currentTerm != -1) {
178: skipTo(Integer.MAX_VALUE);
179: nextDocument(); // This guarantees we have no garbage before the frequency
180: }
181: currentTerm++;
182: readFrequency();
183: return this ;
184: }
185:
186: private void readFrequency() throws IOException {
187: // Read the frequency
188: frequency = ibs.readGamma() + 1;
189: hasPointers = frequency < index.numberOfDocuments;
190: count = -1;
191: currentDocument = -1;
192: numberOfDocumentRecord = -1;
193: state = BEFORE_POINTER;
194: }
195:
196: public Index index() {
197: return keyIndex;
198: }
199:
200: public int frequency() {
201: return frequency;
202: }
203:
204: private void ensureCurrentDocument() {
205: if (currentDocument < 0)
206: throw new IllegalStateException(
207: "nextDocument() has never been called for (term="
208: + currentTerm + ")");
209: if (currentDocument == Integer.MAX_VALUE)
210: throw new IllegalStateException(
211: "This reader is positioned beyond the end of list of (term="
212: + currentTerm + ")");
213: }
214:
215: /** Returns whether there are no more document records in the current inverted list.
216: *
217: * <p>This method returns true if the last document pointer of the current inverted
218: * list has been read. It makes no distinction as to where (inside the last document
219: * record) this reader is currently positioned. In particular, this method will
220: * return true independently of whether count and positions have been read or not (we
221: * note by passing that this is the only sensible behaviour, as you can build indices
222: * with or without counts/positions).
223: *
224: * <p>This method will return true also when this reader is positioned <em>beyond</em>
225: * the last document pointer. In this case, {@link #currentDocumentPointer()} will
226: * return {@link Integer#MAX_VALUE}.
227: *
228: * @return true whether there are no more document records in the current inverted list.
229: */
230: private boolean endOfList() {
231: if (ASSERTS)
232: assert numberOfDocumentRecord <= frequency;
233: return numberOfDocumentRecord >= frequency - 1;
234: }
235:
236: public int document() {
237: if (ASSERTS)
238: ensureCurrentDocument();
239: return currentDocument;
240: }
241:
242: public Payload payload() throws IOException {
243: if (DEBUG)
244: System.err.println(this + ".payload()");
245: if (ASSERTS)
246: ensureCurrentDocument();
247: throw new UnsupportedOperationException("This index ("
248: + index + ") does not contain payloads");
249: }
250:
251: public int count() throws IOException {
252: if (DEBUG)
253: System.err.println(this + ".count()");
254: if (count != -1)
255: return count;
256: if (ASSERTS)
257: ensureCurrentDocument();
258: {
259: state = BEFORE_POSITIONS;
260: count = ibs.readGamma() + 1;
261: }
262: return count;
263: }
264:
265: /** We read positions, assuming state <= BEFORE_POSITIONS */
266: @SuppressWarnings("unused")
267: protected void updatePositionCache() throws IOException {
268: if (ASSERTS)
269: assert state <= BEFORE_POSITIONS;
270: if (state < BEFORE_POSITIONS) {
271: if (state == BEFORE_COUNT) {
272: state = BEFORE_POSITIONS;
273: count = ibs.readGamma() + 1;
274: }
275: }
276: if (count > positionCache.length)
277: positionCache = new int[Math.max(
278: positionCache.length * 2, count)];
279: final int[] occ = positionCache;
280: state = BEFORE_POINTER;
281: ibs.readDeltas(occ, count);
282: for (int i = 1; i < count; i++)
283: occ[i] += occ[i - 1] + 1;
284: }
285:
286: public IntIterator positions() throws IOException {
287: if (ASSERTS)
288: ensureCurrentDocument();
289: if (state <= BEFORE_POSITIONS)
290: updatePositionCache();
291: return IntIterators.wrap(positionCache, 0, count);
292: }
293:
294: public int[] positionArray() throws IOException {
295: if (ASSERTS)
296: ensureCurrentDocument();
297: if (state <= BEFORE_POSITIONS)
298: updatePositionCache();
299: return positionCache;
300: }
301:
302: // TODO: check who's using this (positionArray() is actually faster now)
303: public int positions(final int[] position) throws IOException {
304: if (ASSERTS)
305: ensureCurrentDocument();
306: if (state <= BEFORE_POSITIONS)
307: updatePositionCache(); // And also that positions have been read
308: if (position.length < count)
309: return -count;
310: for (int i = count; i-- != 0;)
311: position[i] = this .positionCache[i];
312: return count;
313: }
314:
315: public int nextDocument() throws IOException {
316: if (DEBUG)
317: System.err.println("{" + this + "} nextDocument()");
318: if (state != BEFORE_POINTER) {
319: if (state == BEFORE_COUNT) {
320: state = BEFORE_POSITIONS;
321: count = ibs.readGamma() + 1;
322: }
323: if (state == BEFORE_POSITIONS) {
324: // Here we just skip; note that the state change is necessary if endOfList() is true
325: state = BEFORE_POINTER;
326: ibs.skipDeltas(count);
327: }
328: }
329: if (endOfList())
330: return -1;
331: if (hasPointers) {// We do not write pointers for everywhere occurring terms.
332: currentDocument += ibs.readDelta() + 1;
333: } else
334: currentDocument++;
335: numberOfDocumentRecord++;
336: state = BEFORE_COUNT;
337: count = -1;
338: return currentDocument;
339: }
340:
341: public int skipTo(final int p) throws IOException {
342: if (DEBUG)
343: System.err.println(this + ".skipTo(" + p
344: + ") [currentDocument=" + currentDocument
345: + ", numberOfDocumentRecord="
346: + numberOfDocumentRecord + ", endOfList()="
347: + endOfList());
348: // If we are just at the start of a list, let us read the first pointer.
349: if (numberOfDocumentRecord == -1)
350: nextDocument(); // TODO: shouldn't we just read the tower?
351: if (currentDocument >= p) {
352: if (DEBUG)
353: System.err.println(this
354: + ": No skip necessary, returning "
355: + currentDocument);
356: return currentDocument;
357: }
358: while (currentDocument < p) {
359: if (DEBUG)
360: System.err
361: .println(this
362: + ": Skipping sequentially (second), currentDocument="
363: + currentDocument
364: + ", numberOfDocumentRecord="
365: + numberOfDocumentRecord + ", p="
366: + p);
367: if (nextDocument() == -1) {
368: if (DEBUG)
369: System.err.println(this
370: + ": end-of-list, returning MAX_VALUE");
371: return Integer.MAX_VALUE;
372: }
373: }
374: if (DEBUG)
375: System.err.println(this + ".toSkip(): Returning "
376: + currentDocument);
377: return currentDocument;
378: }
379:
380: public void dispose() throws IOException {
381: parent.close();
382: }
383:
384: public boolean hasNext() {
385: return !endOfList();
386: }
387:
388: public int nextInt() {
389: if (!hasNext())
390: throw new NoSuchElementException();
391: try {
392: return nextDocument();
393: } catch (IOException e) {
394: throw new RuntimeException(e);
395: }
396: }
397:
398: public String toString() {
399: return index + " [" + currentTerm + "]";
400: }
401:
402: /** An interval iterator returning the positions of the current document as singleton intervals. */
403: private final class IndexIntervalIterator extends
404: AbstractObjectIterator<Interval> implements
405: IntervalIterator {
406: int pos = -1;
407:
408: public void reset() throws IOException {
409: pos = -1;
410: if (state <= BEFORE_POSITIONS)
411: updatePositionCache(); // This guarantees the position cache is ok
412: }
413:
414: public void intervalTerms(final IntSet terms) {
415: terms
416: .add(BitStreamIndexReaderIndexIterator.this .currentTerm);
417: }
418:
419: public boolean hasNext() {
420: return pos < count - 1;
421: }
422:
423: public Interval next() {
424: if (!hasNext())
425: throw new NoSuchElementException();
426: return Interval.valueOf(positionCache[++pos]);
427: }
428:
429: public Interval nextInterval() {
430: return pos < count - 1 ? Interval
431: .valueOf(positionCache[++pos]) : null;
432: }
433:
434: public int extent() {
435: return 1;
436: }
437:
438: public String toString() {
439: return index + ": " + term + "[doc=" + currentDocument
440: + ", count=" + count + ", pos=" + pos + "]";
441: }
442: };
443:
444: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
445: throws IOException {
446: intervalIterator();
447: return singletonIntervalIterator;
448: }
449:
450: public IntervalIterator intervalIterator() throws IOException {
451: return intervalIterator(keyIndex);
452: }
453:
454: public IntervalIterator intervalIterator(final Index index)
455: throws IOException {
456: if (ASSERTS)
457: ensureCurrentDocument();
458: // TODO: this was if ( index != keyIndex || hasPayloads )
459: if (index != keyIndex)
460: return IntervalIterators.TRUE;
461: if (ASSERTS)
462: assert intervalIterator != null;
463: intervalIterator.reset();
464: return intervalIterator;
465: }
466:
467: public ReferenceSet<Index> indices() {
468: return index.singletonSet;
469: }
470: }
471:
472: private IndexIterator documents(final CharSequence term,
473: final int termNumber) throws IOException {
474: indexIterator.term(term);
475: indexIterator.position(termNumber);
476: return indexIterator;
477: }
478:
479: public IndexIterator documents(final int term) throws IOException {
480: return documents(null, term);
481: }
482:
483: public IndexIterator documents(final CharSequence term)
484: throws IOException {
485: if (closed)
486: throw new IllegalStateException("This "
487: + getClass().getSimpleName() + " has been closed");
488: if (index.termMap != null) {
489: final int termIndex = (int) index.termMap.getLong(term);
490: if (termIndex == -1)
491: return index.emptyIndexIterator;
492: return documents(term, termIndex);
493: }
494: throw new UnsupportedOperationException("Index " + index
495: + " has no term map");
496: }
497:
498: @Override
499: public IndexIterator nextIterator() throws IOException {
500: return indexIterator.advance();
501: }
502:
503: public String toString() {
504: return getClass().getSimpleName() + "[" + index + "]";
505: }
506:
507: public void close() throws IOException {
508: super.close();
509: indexIterator.ibs.close();
510: }
511: }
|