001: package it.unimi.dsi.mg4j.util;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023: import it.unimi.dsi.fastutil.longs.AbstractLongList;
024: import it.unimi.dsi.io.InputBitStream;
025:
026: import java.io.IOException;
027:
028: /** Provides semi-external random access to offsets of an {@link it.unimi.dsi.mg4j.index.Index index}.
029: *
030: * <p>This class is a semi-external {@link it.unimi.dsi.fastutil.longs.LongList} that
031: * MG4J uses as default for accessing term offsets.
032: *
033: * <p>When the number of terms in the index grows, storing each offset as a long in an
034: * array can consume hundred of megabytes of memory, and most of this memory is wasted,
035: * as it is occupied by offsets of <i>hapax legomena</i> (terms occurring just once in the
036: * collection). Instead, this class accesses offsets in their
037: * compressed forms, and provides entry points for random access to each offset. At construction
038: * time, entry points are computed with a certain <em>step</em>, which is the number of offsets
039: * accessible from each entry point, or, equivalently, the maximum number of offsets that will
040: * be necessary to read to access a given offset.
041: *
042: * <p><strong>Warning:</strong> This class is not thread safe, and needs to be synchronised to be used in a
043: * multithreaded environment.
044: *
045: * @author Fabien Campagne
046: * @author Sebastiano Vigna
047: */
048: public class SemiExternalOffsetList extends AbstractLongList {
049: /** Position in the offset stream for each random access entry point (one each {@link #offsetStep} elements). */
050: private final long[] position;
051: /** An array parallel to {@link #position} recording the value of the offset for each random access entry point. */
052: private final long[] startValue;
053: /** Stream over the compressed offset information. */
054: private final InputBitStream ibs;
055: /** Maximum number of times {@link InputBitStream#readLongGamma()} will be called to access an offset. */
056: private final int offsetStep;
057: /** The number of offsets. */
058: private final int numOffsets;
059:
060: /** Creates a new semi-external list.
061: *
062: * @param offsetRawData a bit stream containing the offsets in compressed form (γ-encoded deltas).
063: * @param offsetStep the step used to build random-access entry points.
064: * @param numOffsets the overall number of offsets (i.e., the number of terms).
065: */
066:
067: public SemiExternalOffsetList(final InputBitStream offsetRawData,
068: final int offsetStep, final int numOffsets)
069: throws IOException {
070: int slots = (numOffsets + offsetStep - 1) / offsetStep;
071: this .position = new long[slots];
072: this .startValue = new long[slots];
073: this .offsetStep = offsetStep;
074: this .numOffsets = numOffsets;
075: this .ibs = offsetRawData;
076: prepareRandomAccess(numOffsets);
077: }
078:
079: /** Scans {@link #ibs} and fills the necessary data in {@link #position} and {@link #startValue}.
080: *
081: * @param numOffsets the number of offsets.
082: */
083:
084: private void prepareRandomAccess(final int numOffsets)
085: throws IOException {
086: long offset = 0;
087: ibs.position(0);
088:
089: int k = 0;
090: int slotIndex = 0;
091:
092: for (int i = numOffsets; i-- != 0;) {
093: offset += ibs.readLongGamma();
094:
095: if (k-- == 0) {
096: k = offsetStep - 1;
097:
098: startValue[slotIndex] = offset;
099: position[slotIndex] = ibs.readBits();
100: slotIndex++;
101: }
102: }
103: }
104:
105: public final long getLong(final int index) {
106: if (index < 0)
107: throw new IndexOutOfBoundsException();
108: final int slotNumber = index / offsetStep;
109: final int k = index % offsetStep;
110: if (k == 0) {
111: // exact match to an index in startValue:
112: return startValue[slotNumber];
113: } else {
114: try {
115: long value = startValue[slotNumber];
116: ibs.position(position[slotNumber]);
117: for (int i = k; i-- != 0;) {
118: final long diff = ibs.readLongGamma();
119: // System.out.println("diff: " + diff);
120: value += diff;
121: }
122: return value;
123: } catch (IOException e) {
124: throw new RuntimeException(e);
125: }
126: }
127: }
128:
129: public int size() {
130: return numOffsets;
131: }
132: }
|