0001: package it.unimi.dsi.mg4j.index;
0002:
0003: /*
0004: * MG4J: Managing Gigabytes for Java
0005: *
0006: * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna
0007: *
0008: * This library is free software; you can redistribute it and/or modify it
0009: * under the terms of the GNU Lesser General Public License as published by the Free
0010: * Software Foundation; either version 2.1 of the License, or (at your option)
0011: * any later version.
0012: *
0013: * This library is distributed in the hope that it will be useful, but
0014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
0015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
0016: * for more details.
0017: *
0018: * You should have received a copy of the GNU Lesser General Public License
0019: * along with this program; if not, write to the Free Software
0020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0021: *
0022: */
0023:
0024: import it.unimi.dsi.bits.Fast;
0025: import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;
0026: import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
0027: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
0028: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
0029: import it.unimi.dsi.mg4j.index.payload.Payload;
0030: import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;
0031: import it.unimi.dsi.io.NullOutputStream;
0032: import it.unimi.dsi.io.OutputBitStream;
0033: import it.unimi.dsi.mg4j.search.score.VignaScorer;
0034: import it.unimi.dsi.Util;
0035: import it.unimi.dsi.lang.MutableString;
0036: import it.unimi.dsi.util.Properties;
0037:
0038: import java.io.File;
0039: import java.io.FileInputStream;
0040: import java.io.FileOutputStream;
0041: import java.io.IOException;
0042: import java.io.PrintStream;
0043: import java.util.Map;
0044:
0045: /** Writes a bitstream-based high-performance index. The comments about
0046: * offsets in the documentation of {@link BitStreamIndexWriter} apply here, too.
0047: *
0048: * <p>The difference between indices generated by this class and those generated
0049: * by {@link BitStreamIndexWriter} lie in the level
0050: * of interleaving. Indices generated by this class have positions in a separate stream (similarly to Lucene), and
0051: * a compulsory skip structure (an extension of that used by a {@link BitStreamIndexWriter})
0052: * that indexes both the main index file and the positions file. This can result in major performance
0053: * improvement in the resolution of position-based operators (e.g., phrases) and in the evaluation
0054: * of {@linkplain VignaScorer proximity-based scorers}. Since the overhead due to the additional
0055: * skip structure and to the separate positions stream is negligible, indices generated by
0056: * this class are the default in MG4J.
0057: *
0058: * <p>Presently, indices generated by this class cannot carry payloads: you must use a {@link BitStreamIndexWriter}
0059: * in that case. Moreover, only nonparametric indices can be used for positions
0060: * (this limitation rules out {@link Coding#GOLOMB}, {@link Coding#SKEWED_GOLOMB}, and {@link Coding#INTERPOLATIVE}).
0061: *
0062: * @author Sebastiano Vigna
0063: * @since 1.2
0064: */
0065:
0066: public class BitStreamHPIndexWriter extends
0067: AbstractBitStreamIndexWriter implements IndexWriter {
0068: private static final boolean ASSERTS = false;
0069: private static final boolean DEBUG = false;
0070: private static final boolean COOKIES = false;
0071:
0072: /** The size of the buffer for the temporary file used to build an inverted list. Inverted lists
0073: * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */
0074: public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024;
0075:
0076: /** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/
0077: protected static final int BEFORE_INVERTED_LIST = 0;
0078:
0079: /** This value of {@link #state} means that we are positioned at the start of an inverted list,
0080: * and we should call {@link #writeFrequency(int)}.*/
0081: protected static final int BEFORE_FREQUENCY = 1;
0082:
0083: /** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */
0084: protected static final int BEFORE_DOCUMENT_RECORD = 2;
0085:
0086: /** This value of {@link #state} means that we just started a new document record, and we
0087: * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */
0088: protected static final int BEFORE_POINTER = 3;
0089:
0090: /** This value of {@link #state} can be assumed only in indices that contain payloads; it
0091: * means that we are positioned just before the payload for the current document record. */
0092: protected static final int BEFORE_PAYLOAD = 4;
0093:
0094: /** This value of {@link #state} can be assumed only in indices that contain counts; it
0095: * means that we are positioned just before the count for the current document record. */
0096: protected static final int BEFORE_COUNT = 5;
0097:
0098: /** This value of {@link #state} can be assumed only in indices that contain document positions;
0099: * it means that we are positioned just before the position list of the current document record. */
0100: protected static final int BEFORE_POSITIONS = 6;
0101:
0102: /** This is the first unused state. Subclasses may start from this value to define new states. */
0103: protected static final int FIRST_UNUSED_STATE = 7;
0104:
0105: /** The underlying index {@link OutputBitStream}. */
0106: protected OutputBitStream obs;
0107: /** The underlying positions {@link OutputBitStream}. */
0108: protected OutputBitStream positions;
0109: /** The offset {@link OutputBitStream}. */
0110: private OutputBitStream offset;
0111: /** The current state of the writer. */
0112: protected int state;
0113: /** The number of document records that the current inverted list will contain. */
0114: protected int frequency;
0115: /** The number of document records already written for the current inverted list. */
0116: protected int writtenDocuments;
0117: /** The current document pointer. */
0118: protected int currentDocument;
0119: /** The last document pointer in the current list. */
0120: protected int lastDocument;
0121: /** The position (in bytes) where the last inverted list started. */
0122: private long lastInvertedListPos;
0123: /** The parameter <code>b</code> for Golomb coding of pointers. */
0124: protected int b;
0125: /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
0126: protected int log2b;
0127: /** The maximum number of positions in a document record so far. */
0128: public int maxCount;
0129: /** The number of bits written for offsets in the file of positions. */
0130: public long bitsForPositionsOffsets;
0131: /** Maximum number of trials when optimising the entry bit length. */
0132: private final static int MAX_TRY = 32;
0133:
0134: /** The parameter <code>h</code> (the maximum height of a skip tower). */
0135: private final int h;
0136:
0137: /** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */
0138: private final int q;
0139:
0140: /** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */
0141: private final int w;
0142:
0143: /** The number of document records written in the cache containing the current block. */
0144: private int cache;
0145:
0146: /** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th
0147: * skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code>
0148: * contains the first document pointer within the next block. */
0149: private final int[] skipPointer;
0150:
0151: /** The {@link OutputBitStream}s where cached document pointers are written. */
0152: private final OutputBitStream[] cachePointer;
0153:
0154: /** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */
0155: private final FastByteArrayOutputStream[] cachePointerByte;
0156:
0157: /** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip
0158: * indices. */
0159: private final OutputBitStream[] cacheSkip;
0160:
0161: /** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */
0162: private final OutputBitStream[] cacheSkipBitCount;
0163:
0164: /** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip
0165: * indices. */
0166: private final FastByteArrayOutputStream[] cacheSkipByte;
0167:
0168: /** The {@link OutputBitStream} where cached document data are written. */
0169: private final CachingOutputBitStream cacheDataOut;
0170:
0171: /** The {@link FastBufferedInputStream} from which cached document data are read. */
0172: private final FastBufferedInputStream cacheDataIn;
0173:
0174: /** The length of the data segment for each quantum. */
0175: private final int[] cacheDataLength;
0176:
0177: /** The length of the positions bitstream for each quantum. */
0178: private final long[] cachePositionsLength;
0179:
0180: /** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */
0181: private final OutputBitStream bitCount;
0182:
0183: /** The sum of all tower data computed so far. */
0184: public final TowerData towerData;
0185:
0186: /** The number of bits written to the positions stream at the start of the current quantum. */
0187: private long writtenPositionsBitsAtLastQuantum;
0188:
0189: /** The number of bits written for quantum lengths. */
0190: public long bitsForQuantumBitLengths;
0191:
0192: /** The number of bits written for quantum lengths in the positions stream. */
0193: public long bitsForPositionsQuantumBitLengths;
0194:
0195: /** The number of bits written for entry lenghts. */
0196: public long bitsForEntryBitLengths;
0197:
0198: /** The number of written blocks. */
0199: public long numberOfBlocks;
0200:
0201: /** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been
0202: * written for the current inverted list. */
0203: public int prevEntryBitLength;
0204:
0205: /** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been
0206: * written for the current inverted list. */
0207: public int prevQuantumBitLength;
0208:
0209: /** An estimate on the number of bits occupied per quantum in the positions stream in the last written cache, or -1 if no cache has been
0210: * written for the current inverted list. */
0211: public int prevPositionsQuantumBitLength;
0212:
0213: /** The Golomb modulus for a top pointer skip, for each level. */
0214: private final int[] towerTopB;
0215:
0216: /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */
0217: private final int[] towerTopLog2B;
0218:
0219: /** The Golomb modulus for a lower pointer skip, for each level. */
0220: private final int[] towerLowerB;
0221:
0222: /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
0223: private final int[] towerLowerLog2B;
0224:
0225: /** The prediction for a pointer skip, for each level. */
0226: private final int[] pointerPrediction;
0227:
0228: /** The <var>k</var>-th entry of this array contains the number of bits from the start of
0229: * the <var>k</var>-th skip tower up to the end of the current block (more precisely,
0230: * to the point that should be reached via skipping, which is just after the document pointer).
0231: * Indices are skip indices. It is used just by {@link #tryTower(int, int, long, OutputBitStream[], TowerData, boolean)},
0232: * but it is declared here for efficiency.
0233: */
0234: final private long[] distance;
0235:
0236: /** The temporary file dumping the index data contained in a block. */
0237: final private File tempFile;
0238:
0239: /** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).
0240: * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>).
0241: *
0242: * @param basename the basename.
0243: * @param numberOfDocuments the number of documents in the collection to be indexed.
0244: * @param writeOffsets if <code>true</code>, the offset file will also be produced.
0245: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
0246: */
0247: public BitStreamHPIndexWriter(final CharSequence basename,
0248: final int numberOfDocuments, final boolean writeOffsets,
0249: int tempBufferSize, final Map<Component, Coding> flags,
0250: final int q, final int h) throws IOException {
0251: this (new OutputBitStream(new FileOutputStream(basename
0252: + DiskBasedIndex.INDEX_EXTENSION)),
0253: new OutputBitStream(new FileOutputStream(basename
0254: + DiskBasedIndex.POSITIONS_EXTENSION)),
0255: writeOffsets ? new OutputBitStream(
0256: new FileOutputStream(basename
0257: + DiskBasedIndex.OFFSETS_EXTENSION))
0258: : null, numberOfDocuments, tempBufferSize,
0259: flags, q, h);
0260: }
0261:
0262: /** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.
0263: *
0264: * @param obs the underlying output bit stream.
0265: * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.
0266: * @param numberOfDocuments the number of documents in the collection to be indexed.
0267: * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
0268: * @throws IOException
0269: */
0270: public BitStreamHPIndexWriter(final OutputBitStream obs,
0271: final OutputBitStream positions,
0272: final OutputBitStream offset, final int numberOfDocuments,
0273: int tempBufferSize, final Map<Component, Coding> flags,
0274: final int q, final int h) throws IOException {
0275: super (numberOfDocuments, flags);
0276: this .obs = obs;
0277: this .positions = positions;
0278: this .offset = offset;
0279: this .frequency = -1;
0280: this .currentTerm = -1;
0281: this .maxCount = 0;
0282:
0283: if (!hasCounts && hasPositions)
0284: throw new IllegalArgumentException(
0285: "Index would have positions but no counts (this can't happen)");
0286: if (h < 0)
0287: throw new IllegalArgumentException("Illegal height " + h);
0288: if (q <= 0 || (q & -q) != q)
0289: throw new IllegalArgumentException("Illegal quantum " + q);
0290: this .h = h;
0291: this .q = q;
0292:
0293: int two2h = 1 << h;
0294: w = two2h * q;
0295:
0296: if (DEBUG) {
0297: System.err.println("Cache will contain at most " + w
0298: + " records (q=" + q + ",h=" + h + ")");
0299: System.err.print("Skip records will be ");
0300: for (int i = 0; i < two2h; i++)
0301: System.err.print((i * q) + " ");
0302: System.err.println();
0303: }
0304:
0305: towerData = new TowerData();
0306: tempFile = File.createTempFile("MG4J", ".data");
0307: cacheDataIn = new FastBufferedInputStream(new FileInputStream(
0308: tempFile));
0309: cacheDataOut = new CachingOutputBitStream(tempFile,
0310: tempBufferSize);
0311: cacheDataLength = new int[two2h];
0312: cachePositionsLength = new long[two2h + 1];
0313: cachePointer = new OutputBitStream[two2h];
0314: cachePointerByte = new FastByteArrayOutputStream[two2h];
0315:
0316: for (int i = 0; i < two2h; i++)
0317: cachePointer[i] = new OutputBitStream(
0318: cachePointerByte[i] = new FastByteArrayOutputStream(),
0319: 0);
0320:
0321: cacheSkip = new OutputBitStream[two2h];
0322: cacheSkipBitCount = new OutputBitStream[two2h];
0323: cacheSkipByte = new FastByteArrayOutputStream[two2h];
0324:
0325: for (int i = 0; i < two2h; i++) {
0326: cacheSkip[i] = new OutputBitStream(
0327: cacheSkipByte[i] = new FastByteArrayOutputStream(),
0328: 0);
0329: cacheSkipBitCount[i] = new OutputBitStream(NullOutputStream
0330: .getInstance(), 0);
0331: }
0332:
0333: skipPointer = new int[two2h + 1];
0334: distance = new long[two2h + 1];
0335:
0336: bitCount = new OutputBitStream(NullOutputStream.getInstance(),
0337: 0);
0338:
0339: towerTopB = new int[h + 1];
0340: towerTopLog2B = new int[h + 1];
0341: towerLowerB = new int[h + 1];
0342: towerLowerLog2B = new int[h + 1];
0343: pointerPrediction = new int[h + 1];
0344:
0345: }
0346:
0347: private int writeOutPointer(final OutputBitStream out,
0348: final int pointer) throws IOException {
0349: if (frequency == numberOfDocuments)
0350: return 0; // We do not write pointers for everywhere occurring terms.
0351:
0352: switch (pointerCoding) {
0353: case GAMMA:
0354: return out.writeGamma(pointer - lastDocument - 1);
0355: case DELTA:
0356: return out.writeDelta(pointer - lastDocument - 1);
0357: case GOLOMB:
0358: return out
0359: .writeGolomb(pointer - lastDocument - 1, b, log2b);
0360: default:
0361: throw new IllegalStateException(
0362: "The required pointer coding (" + pointerCoding
0363: + ") is not supported.");
0364: }
0365: }
0366:
0367: /** A structure maintaining statistical data about tower construction. */
0368:
0369: public static class TowerData {
0370: /** The number of bits written for bit skips at the top of a tower. */
0371: public long bitsForTopBitSkips;
0372:
0373: /** The number of bits written for positions bit skips at the top of a tower. */
0374: public long bitsForTopPositionsBitSkips;
0375:
0376: /** The number of bits written for skip pointers at the top of a tower. */
0377: public long bitsForTopSkipPointers;
0378:
0379: /** The number of bits written for bit skips in the lower part of a tower. */
0380: public long bitsForLowerBitSkips;
0381:
0382: /** The number of bits written for positions bit skips in the lower part of a tower. */
0383: public long bitsForLowerPositionsBitSkips;
0384:
0385: /** The number of bits written for skip pointers in the lower part of a tower. */
0386: public long bitsForLowerSkipPointers;
0387:
0388: /** The number of bits written for tower lengths. */
0389: public long bitsForTowerLengths;
0390:
0391: /** The number of written skip towers. */
0392: public long numberOfSkipTowers;
0393:
0394: /** The number of written top skip entries. */
0395: public long numberOfTopEntries;
0396:
0397: /** The number of written lower skip entries. */
0398: public long numberOfLowerEntries;
0399:
0400: /** Clear all fields of this tower data. */
0401:
0402: void clear() {
0403: bitsForTopBitSkips = 0;
0404: bitsForTopPositionsBitSkips = 0;
0405: bitsForTopSkipPointers = 0;
0406: bitsForLowerBitSkips = 0;
0407: bitsForLowerPositionsBitSkips = 0;
0408: bitsForLowerSkipPointers = 0;
0409: bitsForTowerLengths = 0;
0410: numberOfSkipTowers = 0;
0411: numberOfTopEntries = 0;
0412: numberOfLowerEntries = 0;
0413: }
0414:
0415: /** Returns the overall number of bits used for skip pointers.
0416: * @return the overall number of bits used for skip pointers.
0417: */
0418: public long bitsForSkipPointers() {
0419: return bitsForTopSkipPointers + bitsForLowerSkipPointers;
0420: }
0421:
0422: /** Returns the overall number of bits used for bit skips.
0423: * @return the overall number of bits used for bit skips.
0424: */
0425: public long bitsForBitSkips() {
0426: return bitsForTopBitSkips + bitsForLowerBitSkips;
0427: }
0428:
0429: /** Returns the overall number of bits used for bit skips.
0430: * @return the overall number of bits used for bit skips.
0431: */
0432: public long bitsForPositionsBitSkips() {
0433: return bitsForTopPositionsBitSkips
0434: + bitsForLowerPositionsBitSkips;
0435: }
0436:
0437: /** Returns the overall number of bits used for tower entries (bits for tower lengths are not included).
0438: * @return the overall number of bits used for tower entries.
0439: */
0440: public long bitsForEntries() {
0441: return bitsForSkipPointers() + bitsForBitSkips()
0442: + bitsForPositionsBitSkips();
0443: }
0444:
0445: /** Returns the overall number of bits used for towers.
0446: * @return the overall number of bits used for towers.
0447: */
0448: public long bitsForTowers() {
0449: return bitsForTowerLengths + bitsForEntries();
0450: }
0451:
0452: /** Returns the overall number of entries.
0453: * @return the overall number of entries.
0454: */
0455: public long numberOfEntries() {
0456: return numberOfTopEntries + numberOfLowerEntries;
0457: }
0458: }
0459:
0460: public long newInvertedList() throws IOException {
0461: if (cache != 0)
0462: writeOutCache(-1);
0463: if (frequency >= 0 && frequency != writtenDocuments)
0464: throw new IllegalStateException(
0465: "The number of document records ("
0466: + this .writtenDocuments
0467: + ") does not match the frequency ("
0468: + this .frequency + ")");
0469: if (state != BEFORE_INVERTED_LIST
0470: && state != BEFORE_DOCUMENT_RECORD)
0471: throw new IllegalStateException(
0472: "Trying to start new inverted list in state "
0473: + state);
0474:
0475: // The position (in bits) where the new inverted list starts
0476: long pos = obs.writtenBits();
0477: // Reset variables
0478: writtenDocuments = 0;
0479: currentTerm++;
0480: currentDocument = -1;
0481:
0482: // If needed, write the offset
0483: if (offset != null)
0484: offset.writeLongGamma(pos - lastInvertedListPos);
0485: // Write the offset for positions
0486: bitsForPositionsOffsets += obs.writeLongDelta(positions
0487: .writtenBits());
0488: lastInvertedListPos = pos;
0489: state = BEFORE_FREQUENCY;
0490: return pos;
0491: }
0492:
0493: public int writeFrequency(final int frequency) throws IOException {
0494: if (state != BEFORE_FREQUENCY)
0495: throw new IllegalStateException(
0496: "Trying to write frequency in state " + state);
0497:
0498: int bitCount;
0499: // Write the frequency
0500: switch (frequencyCoding) {
0501: case SHIFTED_GAMMA:
0502: bitCount = obs.writeShiftedGamma(frequency - 1); // frequency cannot be 0
0503: break;
0504: case GAMMA:
0505: bitCount = obs.writeGamma(frequency - 1); // frequency cannot be 0
0506: break;
0507: case DELTA:
0508: bitCount = obs.writeDelta(frequency - 1); // frequency cannot be 0
0509: break;
0510: default:
0511: throw new IllegalStateException(
0512: "The required frequency coding (" + frequencyCoding
0513: + ") is not supported.");
0514: }
0515:
0516: this .frequency = frequency;
0517:
0518: // We compute the modulus used for pointer Golomb coding
0519: if (pointerCoding == Coding.GOLOMB) {
0520: b = BitStreamIndex.golombModulus(frequency,
0521: numberOfDocuments);
0522: log2b = Fast.mostSignificantBit(b);
0523: }
0524:
0525: prevQuantumBitLength = prevEntryBitLength = prevPositionsQuantumBitLength = -1;
0526:
0527: if (DEBUG)
0528: System.err.println("----------- " + currentTerm + " ("
0529: + frequency + ")");
0530:
0531: final long pointerQuantumSigma = BitStreamIndex.quantumSigma(
0532: frequency, numberOfDocuments, q);
0533: for (int i = Math
0534: .min(h, Fast.mostSignificantBit(frequency / q)); i >= 0; i--) {
0535: towerTopB[i] = BitStreamIndex.gaussianGolombModulus(
0536: pointerQuantumSigma, i + 1);
0537: towerTopLog2B[i] = Fast.mostSignificantBit(towerTopB[i]);
0538: towerLowerB[i] = BitStreamIndex.gaussianGolombModulus(
0539: pointerQuantumSigma, i);
0540: towerLowerLog2B[i] = Fast
0541: .mostSignificantBit(towerLowerB[i]);
0542: pointerPrediction[i] = (int) ((q * (1L << i)
0543: * numberOfDocuments + frequency / 2) / frequency);
0544: }
0545:
0546: state = BEFORE_DOCUMENT_RECORD;
0547: bitsForFrequencies += bitCount;
0548: return bitCount;
0549: }
0550:
0551: public OutputBitStream newDocumentRecord() throws IOException {
0552: if (frequency == writtenDocuments)
0553: throw new IllegalStateException(
0554: "Document record overflow (written "
0555: + this .frequency + " already)");
0556: if (state != BEFORE_DOCUMENT_RECORD)
0557: throw new IllegalStateException(
0558: "Trying to start new document record in state "
0559: + state);
0560:
0561: writtenDocuments++;
0562: numberOfPostings++;
0563: lastDocument = currentDocument;
0564: state = BEFORE_POINTER;
0565: return cacheDataOut;
0566: }
0567:
0568: public int writeDocumentPointer(@SuppressWarnings("unused")
0569: final OutputBitStream unused, final int pointer) throws IOException {
0570: if (state != BEFORE_POINTER)
0571: throw new IllegalStateException(
0572: "Trying to write pointer in state " + state);
0573:
0574: // If the previous block is over, write it out!
0575:
0576: if (cache == w)
0577: writeOutCache(pointer);
0578:
0579: final OutputBitStream out;
0580:
0581: // Record data pointer if we are on a skip; otherwise, write it to the cache.
0582: if (cache % q == 0) {
0583: if (cache / q > 0) {
0584: cacheDataLength[cache / q - 1] = (int) cacheDataOut
0585: .writtenBits();
0586: if (ASSERTS)
0587: assert positions.writtenBits()
0588: - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : (positions
0589: .writtenBits() - writtenPositionsBitsAtLastQuantum)
0590: + " > " + Integer.MAX_VALUE;
0591: cachePositionsLength[cache / q - 1] = (int) (positions
0592: .writtenBits() - writtenPositionsBitsAtLastQuantum);
0593: writtenPositionsBitsAtLastQuantum = positions
0594: .writtenBits();
0595: }
0596: cacheDataOut.align();
0597: cacheDataOut.writtenBits(0);
0598: skipPointer[cache / q] = pointer;
0599: out = cachePointer[cache++ / q];
0600: } else {
0601: cache++;
0602: out = cacheDataOut;
0603: }
0604:
0605: currentDocument = pointer;
0606: int bitCount = 0;
0607:
0608: if (frequency != numberOfDocuments) { // We do not write pointers for everywhere occurring documents.
0609: switch (pointerCoding) {
0610: case SHIFTED_GAMMA:
0611: bitCount = out.writeShiftedGamma(pointer - lastDocument
0612: - 1);
0613: break;
0614: case GAMMA:
0615: bitCount = out.writeGamma(pointer - lastDocument - 1);
0616: break;
0617: case DELTA:
0618: bitCount = out.writeDelta(pointer - lastDocument - 1);
0619: break;
0620: case GOLOMB:
0621: bitCount = out.writeGolomb(pointer - lastDocument - 1,
0622: b, log2b);
0623: break;
0624: default:
0625: throw new IllegalStateException(
0626: "The required pointer coding (" + pointerCoding
0627: + ") is not supported.");
0628: }
0629: } else if (pointer - lastDocument != 1)
0630: throw new IllegalStateException(
0631: "Term "
0632: + currentTerm
0633: + " has frequency equal to the number of documents, but pointers are not consecutive integers");
0634:
0635: state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT
0636: : BEFORE_DOCUMENT_RECORD;
0637: bitsForPointers += bitCount;
0638: return bitCount;
0639: }
0640:
0641: public int writePayload(final OutputBitStream out,
0642: final Payload payload) throws IOException {
0643: throw new IllegalStateException(
0644: "High-performance indices do not support payloads");
0645: /*if ( frequency < 0 ) throw new IllegalStateException( "Trying to write payload without calling newInvertedList" );
0646: if ( state != BEFORE_PAYLOAD ) throw new IllegalStateException( "Trying to write payload in state " + state );
0647: final int count = payload.write( out );
0648: bitsForPayloads += count;
0649: state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
0650: return count;*/
0651: }
0652:
0653: public int writePositionCount(final OutputBitStream out,
0654: final int count) throws IOException {
0655: if (frequency < 0)
0656: throw new IllegalStateException(
0657: "Trying to write count without calling newInvertedList");
0658: if (state != BEFORE_COUNT)
0659: throw new IllegalStateException(
0660: "Trying to write count in state " + state);
0661: final int bitCount;
0662:
0663: numberOfOccurrences += count;
0664: switch (countCoding) {
0665: case SHIFTED_GAMMA:
0666: bitCount = out.writeShiftedGamma(count - 1);
0667: break;
0668: case GAMMA:
0669: bitCount = out.writeGamma(count - 1);
0670: break;
0671: case UNARY:
0672: bitCount = out.writeUnary(count - 1);
0673: break;
0674: case DELTA:
0675: bitCount = out.writeDelta(count - 1);
0676: break;
0677: default:
0678: throw new IllegalStateException(
0679: "The required count coding (" + countCoding
0680: + ") is not supported.");
0681: }
0682:
0683: state = hasPositions ? BEFORE_POSITIONS
0684: : BEFORE_DOCUMENT_RECORD;
0685: bitsForCounts += bitCount;
0686: return bitCount;
0687: }
0688:
0689: public int writeDocumentPositions(@SuppressWarnings("unused")
0690: final OutputBitStream unused, final int[] occ, final int offset,
0691: final int len, final int docSize) throws IOException {
0692: if (frequency < 0)
0693: throw new IllegalStateException(
0694: "Trying to write occurrences without calling newInvertedList");
0695: if (state != BEFORE_POSITIONS)
0696: throw new IllegalStateException(
0697: "Trying to write positions in state " + state);
0698:
0699: if (ASSERTS && docSize > 0)
0700: for (int i = 0; i < len; i++)
0701: assert occ[offset + i] < docSize : "Position "
0702: + occ[offset + i] + " for document "
0703: + currentDocument + " is too large; size is "
0704: + docSize;
0705:
0706: int i;
0707: int prev = -1;
0708: int bitCount = 0;
0709: final int end = offset + len;
0710: final OutputBitStream positions = this .positions;
0711:
0712: switch (positionCoding) {
0713: case GAMMA:
0714: if (COOKIES)
0715: bitCount += positions.writeGamma(Integer.MAX_VALUE);
0716: for (i = offset; i < end; i++) {
0717: bitCount += positions.writeGamma(occ[i] - prev - 1);
0718: prev = occ[i];
0719: }
0720: break;
0721: case DELTA:
0722: if (COOKIES)
0723: bitCount += positions.writeDelta(Integer.MAX_VALUE);
0724: for (i = offset; i < end; i++) {
0725: bitCount += positions.writeDelta(occ[i] - prev - 1);
0726: prev = occ[i];
0727: }
0728: break;
0729: case SHIFTED_GAMMA:
0730: if (COOKIES)
0731: bitCount += positions
0732: .writeShiftedGamma(Integer.MAX_VALUE);
0733: for (i = offset; i < end; i++) {
0734: bitCount += positions.writeShiftedGamma(occ[i] - prev
0735: - 1);
0736: prev = occ[i];
0737: }
0738: break;
0739: default:
0740: throw new IllegalStateException(
0741: "The required position coding (" + positionCoding
0742: + ") is not supported.");
0743: }
0744:
0745: state = BEFORE_DOCUMENT_RECORD;
0746: bitsForPositions += bitCount;
0747: if (len > maxCount)
0748: maxCount = len;
0749: return bitCount;
0750: }
0751:
0752: public void close() throws IOException {
0753: if (cache != 0)
0754: writeOutCache(-1);
0755:
0756: if (state != BEFORE_DOCUMENT_RECORD
0757: && state != BEFORE_INVERTED_LIST)
0758: throw new IllegalStateException(
0759: "Trying to close index in state " + state);
0760: if (frequency >= 0 && frequency != writtenDocuments)
0761: throw new IllegalStateException(
0762: "The number of document records ("
0763: + this .writtenDocuments
0764: + ") does not match the frequency ("
0765: + this .frequency + ")");
0766:
0767: if (writtenBits() != obs.writtenBits()
0768: + positions.writtenBits())
0769: throw new IllegalStateException(
0770: "Written bits count mismatch: we say "
0771: + writtenBits()
0772: + ", the streams say "
0773: + (obs.writtenBits() + positions
0774: .writtenBits()));
0775:
0776: if (offset != null) {
0777: offset.writeLongGamma(obs.writtenBits()
0778: - lastInvertedListPos);
0779: offset.close();
0780: }
0781:
0782: obs.close();
0783: positions.close();
0784: cacheDataIn.close();
0785: cacheDataOut.close();
0786: tempFile.delete();
0787: }
0788:
0789: /** Computes the towers.
0790: *
0791: * @param quantumBitLength the length in bits of a quantum.
0792: * @param positionsQuantumBitLength the length in bits of a quantum in the positions stream.
0793: * @param entryBitLength the estimated length in bits of a tower entry.
0794: * @param toTheEnd the number of bits that must be skipped to reach the next tower (usually,
0795: * the length of the first pointer of the next block or 0 if this is to be the last block).
0796: * @param skip an array of output bit stream where the data related to each tower will be written.
0797: * @param towerData will be filled with statistical date about the towers.
0798: * @param doinIt if true, we are actually writing a tower, not just trying.
0799: */
0800: private void tryTower(final int quantumBitLength,
0801: final int positionsQuantumBitLength,
0802: final int entryBitLength, long toTheEnd,
0803: final OutputBitStream[] skip, final TowerData towerData,
0804: final boolean doinIt) throws IOException {
0805: int i, k, s;
0806: long d;
0807: int basePointer;
0808: // truncated is true only for those towers (in defective blocks) whose height is strictly smaller than the height they should have
0809:
0810: boolean truncated = false;
0811:
0812: if (DEBUG && doinIt)
0813: System.err.println("Writing out tower for term "
0814: + currentTerm + "; quantumBitLength="
0815: + quantumBitLength + " entryBitLength="
0816: + entryBitLength);
0817:
0818: for (k = (cache - 1) / q; k >= 0; k--) {
0819: // Where are we? At the end of the k-th quantum. So toTheEnd must be increased by
0820: // the length of the data contained in the same quantum, moving us...
0821: toTheEnd += cacheDataLength[k];
0822:
0823: // ...just after the k-th skip tower.
0824: // We compute the maximum valid index of the skip tower (*MUST* be kept in sync with the subsequent loop).
0825: s = (k == 0) ? h : Fast.leastSignificantBit(k);
0826:
0827: // This test handles defective blocks. In particular, for defective quanta s=-1,
0828: // yielding no skipping data at all for such quanta. truncated is true if the
0829: // current tower is truncated w.r.t. the infinite skip list.
0830: if (cache < w) {
0831: final int upperBound = Fast
0832: .mostSignificantBit((cache / q) - k);
0833: if (s > upperBound) {
0834: s = upperBound;
0835: truncated = true;
0836: } else
0837: truncated = false;
0838: } else
0839: truncated = k == 0;
0840:
0841: skip[k].writtenBits(0);
0842:
0843: if (s >= 0) {
0844: if (DEBUG && doinIt)
0845: System.err.print("% (" + k + ") [" + skipPointer[k]
0846: + "] ");
0847:
0848: basePointer = skipPointer[k];
0849:
0850: /* If the current tower is truncated, we must actually write the top of the tower.
0851: * The top must be forecast in a Bernoullian way: we write it as a difference from the average pointer skip,
0852: * which is q 2^s / relativeFrequency. */
0853: if (truncated) {
0854: towerData.numberOfTopEntries++;
0855: // TODO: prediction should be based not on 1<<s, but rather on the actual number of skipped quanta, which could be smaller (because of end-of-list)
0856: towerData.bitsForTopSkipPointers += skip[k]
0857: .writeGolomb(Fast.int2nat(skipPointer[k
0858: + (1 << s)]
0859: - basePointer
0860: - pointerPrediction[s]),
0861: towerTopB[s], towerTopLog2B[s]);
0862: towerData.bitsForTopBitSkips += skip[k]
0863: .writeLongDelta(Fast
0864: .int2nat((int) ((toTheEnd - distance[k
0865: + (1 << s)]) - (quantumBitLength
0866: * (1 << s) + entryBitLength
0867: * ((1 << s + 1) - s - 2)))));
0868: towerData.bitsForTopPositionsBitSkips += skip[k]
0869: .writeLongDelta(Fast
0870: .int2nat((cachePositionsLength[k] - cachePositionsLength[k
0871: + (1 << s)])
0872: - positionsQuantumBitLength
0873: * (1 << s)));
0874: }
0875:
0876: if (DEBUG && doinIt)
0877: System.err.print((truncated ? "" : "(")
0878: + (skipPointer[k + (1 << s)] - basePointer)
0879: + ":" + (toTheEnd - distance[k + (1 << s)])
0880: + (truncated ? " " : ") "));
0881:
0882: // Produce a (single) tower of height s
0883: for (i = s - 1; i >= 0; i--) {
0884: towerData.bitsForLowerSkipPointers += skip[k]
0885: .writeGolomb(
0886: Fast
0887: .int2nat((skipPointer[k
0888: + (1 << i)] - basePointer)
0889: - ((skipPointer[k
0890: + (1 << i + 1)] - basePointer) / 2)),
0891: towerLowerB[i], towerLowerLog2B[i]);
0892:
0893: towerData.bitsForLowerBitSkips += skip[k]
0894: .writeLongDelta(Fast
0895: .int2nat((int) (((toTheEnd
0896: - distance[k
0897: + (1 << (i + 1))] - entryBitLength
0898: * (i + 1)) / 2) - (toTheEnd - distance[k
0899: + (1 << i)]))));
0900: towerData.bitsForLowerPositionsBitSkips += skip[k]
0901: .writeLongDelta(Fast
0902: .int2nat((cachePositionsLength[k] - cachePositionsLength[k
0903: + (1 << i + 1)])
0904: / 2
0905: - (cachePositionsLength[k] - cachePositionsLength[k
0906: + (1 << i)])));
0907:
0908: if (DEBUG && doinIt)
0909: System.err
0910: .print((skipPointer[k + (1 << i)] - basePointer)
0911: + ":"
0912: + (toTheEnd - distance[k
0913: + (1 << i)]) + " ");
0914: }
0915:
0916: if (s > 0) { // No length for single-entry towers.
0917: d = bitCount.writeDelta(Fast.int2nat((int) skip[k]
0918: .writtenBits()
0919: - (s + 1) * entryBitLength));
0920: towerData.bitsForTowerLengths += d;
0921: toTheEnd += d;
0922: }
0923:
0924: toTheEnd += skip[k].writtenBits();
0925:
0926: if (DEBUG && doinIt)
0927: System.err.print(" (" + (int) skip[k].writtenBits()
0928: + " bits)");
0929:
0930: towerData.numberOfLowerEntries += s;
0931: towerData.numberOfSkipTowers++;
0932:
0933: if (DEBUG && doinIt)
0934: System.err.println();
0935: }
0936:
0937: distance[k] = toTheEnd;
0938:
0939: // Where are we? Just before the beginning of the k-th skip tower
0940: toTheEnd += cachePointer[k].writtenBits();
0941:
0942: // Where are we? Just before the beginning of the k-th document record
0943: }
0944: }
0945:
0946: /** Write out the cache content.
0947: *
0948: * @param nextPointer the first pointer of the next block, or -1 if this is the last block.
0949: */
0950: private void writeOutCache(final int nextPointer)
0951: throws IOException {
0952: if (DEBUG)
0953: System.err.println("Entered writeOutCache() with cache="
0954: + cache + " (H is " + (1 << h) + ", B is " + w
0955: + ")");
0956:
0957: cacheDataLength[(cache + q - 1) / q - 1] = (int) cacheDataOut
0958: .writtenBits();
0959: if (ASSERTS)
0960: assert positions.writtenBits()
0961: - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : (positions
0962: .writtenBits() - writtenPositionsBitsAtLastQuantum)
0963: + " > " + Integer.MAX_VALUE;
0964: cachePositionsLength[(cache + q - 1) / q - 1] = (int) (positions
0965: .writtenBits() - writtenPositionsBitsAtLastQuantum);
0966: cachePositionsLength[(cache + q - 1) / q] = 0;
0967: writtenPositionsBitsAtLastQuantum = positions.writtenBits();
0968:
0969: /* We cumulate the position lengths so to obtain the actual skips. */
0970: for (int i = (cache + q - 1) / q; i-- != 0;)
0971: cachePositionsLength[i] += cachePositionsLength[i + 1];
0972:
0973: //System.err.print( basename ); for( int i = 0; i < ( cache + q - 1 ) / q; i++ ) System.err.print( " " + cachePositionsLength[ i ] ); System.err.println();
0974:
0975: /* Number of bits to go after the first pointer of the first record of the next block (or, if there
0976: is no other block in the current list, to go to the end of the list). */
0977: long toTheEnd;
0978:
0979: // Record the new document pointer for the highest tower
0980: int nextAfter = ((cache + q) - 1) / q; // This is ceil( cache / q )
0981:
0982: if (nextPointer >= 0) {
0983: skipPointer[nextAfter] = nextPointer;
0984: toTheEnd = writeOutPointer(bitCount, nextPointer);
0985: } else {
0986: skipPointer[nextAfter] = currentDocument + 1; // Fake: just for the last block
0987: toTheEnd = 0;
0988: }
0989:
0990: distance[nextAfter] = 0;
0991:
0992: int k, s;
0993: long d;
0994:
0995: // Compute quantum length in bits (without towers)
0996: int quantumBitLength = 0, entryBitLength = 0, positionsQuantumBitLength = (int) ((cachePositionsLength[0]
0997: * q + (cache - 1)) / cache);
0998:
0999: for (d = k = 0; k <= ((cache - 1) / q); k++)
1000: d += (cachePointer[k].writtenBits() + cacheDataLength[k]);
1001: quantumBitLength = (int) (((d * q) + (cache - 1)) / cache);
1002:
1003: final TowerData td = new TowerData();
1004: final Int2IntRBTreeMap candidates = new Int2IntRBTreeMap();
1005:
1006: /* As a first try, we compute the tower costs using 0 as average entry bit length. */
1007: tryTower(quantumBitLength, positionsQuantumBitLength, 0,
1008: toTheEnd, cacheSkipBitCount, td, false);
1009:
1010: if (td.numberOfSkipTowers > 0) { // There actually is at least a tower.
1011: /* Now we repeat this operation, trying to obtain the best value for the
1012: * average entry bit length.
1013: */
1014:
1015: while (candidates.size() < MAX_TRY
1016: && !candidates
1017: .containsValue(entryBitLength = (int) (td
1018: .bitsForTowers() / td
1019: .numberOfEntries()))) {
1020: td.clear();
1021: tryTower(quantumBitLength, positionsQuantumBitLength,
1022: entryBitLength, toTheEnd, cacheSkipBitCount,
1023: td, false);
1024: candidates.put((int) (td.bitsForTowers() / td
1025: .numberOfEntries()), entryBitLength);
1026: }
1027:
1028: if (ASSERTS)
1029: assert candidates.size() < MAX_TRY;
1030:
1031: entryBitLength = candidates.get(candidates.firstIntKey());
1032:
1033: if (DEBUG)
1034: System.err.println("Going to write tower at position "
1035: + obs.writtenBits());
1036: tryTower(quantumBitLength, positionsQuantumBitLength,
1037: entryBitLength, toTheEnd, cacheSkip, towerData,
1038: true);
1039: }
1040:
1041: // Ready to write out cache
1042: int maxCacheDataLength = 0;
1043: for (k = 0; k <= ((cache - 1) / q); k++)
1044: if (cacheDataLength[k] > maxCacheDataLength)
1045: maxCacheDataLength = cacheDataLength[k];
1046:
1047: /* We have two ways of writing out cached data. If all the data is still in the output bit
1048: * stream buffer, we just read it directly. Otherwise, we have to pour it into a temporary buffer. */
1049:
1050: final byte[] buffer;
1051: final boolean direct;
1052: int pos = 0;
1053:
1054: cacheDataOut.align();
1055:
1056: if (cacheDataOut.buffer() != null) {
1057: buffer = cacheDataOut.buffer();
1058: direct = true;
1059: } else {
1060: cacheDataOut.flush();
1061: buffer = new byte[(maxCacheDataLength + 7) / 8];
1062: direct = false;
1063: cacheDataIn.flush();
1064: cacheDataIn.position(0);
1065: }
1066:
1067: for (k = 0; k <= ((cache - 1) / q); k++) {
1068:
1069: /* See comments above. */
1070: s = (k == 0) ? h : Fast.leastSignificantBit(k);
1071:
1072: if (cache < w)
1073: s = Math.min(s, Fast
1074: .mostSignificantBit((cache / q) - k));
1075:
1076: d = cachePointer[k].writtenBits();
1077: cachePointer[k].flush();
1078: obs.write(cachePointerByte[k].array, d);
1079:
1080: d = cacheSkip[k].writtenBits();
1081: cacheSkip[k].flush();
1082:
1083: if (s >= 0) {
1084: if (k == 0) {
1085: if (prevQuantumBitLength < 0) {
1086: bitsForQuantumBitLengths += obs
1087: .writeLongDelta(quantumBitLength);
1088: bitsForPositionsQuantumBitLengths += obs
1089: .writeLongDelta(positionsQuantumBitLength);
1090: bitsForEntryBitLengths += obs
1091: .writeLongDelta(entryBitLength);
1092: } else {
1093: bitsForQuantumBitLengths += obs
1094: .writeLongDelta(Fast
1095: .int2nat(quantumBitLength
1096: - prevQuantumBitLength));
1097: bitsForPositionsQuantumBitLengths += obs
1098: .writeLongDelta(Fast
1099: .int2nat(positionsQuantumBitLength
1100: - prevPositionsQuantumBitLength));
1101: bitsForEntryBitLengths += obs
1102: .writeLongDelta(Fast
1103: .int2nat(entryBitLength
1104: - prevEntryBitLength));
1105: }
1106:
1107: prevQuantumBitLength = quantumBitLength;
1108: prevPositionsQuantumBitLength = positionsQuantumBitLength;
1109: prevEntryBitLength = entryBitLength;
1110:
1111: numberOfBlocks++;
1112: }
1113:
1114: if (s > 0)
1115: obs.writeDelta(Fast.int2nat((int) d
1116: - entryBitLength * (s + 1))); // No length for single-entry towers.
1117: } else if (ASSERTS)
1118: assert d == 0;
1119:
1120: obs.write(cacheSkipByte[k].array, d);
1121:
1122: if (direct) {
1123: obs.write(buffer, pos * 8, cacheDataLength[k]);
1124: pos += (cacheDataLength[k] + 7) / 8;
1125: } else {
1126: cacheDataIn.read(buffer, 0,
1127: (cacheDataLength[k] + 7) / 8);
1128: obs.write(buffer, cacheDataLength[k]);
1129: }
1130: }
1131:
1132: // Clean used caches
1133: for (k = 0; k <= ((cache - 1) / q); k++) {
1134: cachePointerByte[k].reset();
1135: cachePointer[k].writtenBits(0);
1136:
1137: cacheSkipByte[k].reset();
1138: cacheSkip[k].writtenBits(0);
1139:
1140: cacheDataOut.position(0);
1141: cacheDataOut.writtenBits(0);
1142: }
1143:
1144: cache = 0;
1145:
1146: if (ASSERTS)
1147: assert obs.writtenBits() + positions.writtenBits() == writtenBits();
1148: }
1149:
1150: public long writtenBits() {
1151: return bitsForFrequencies + bitsForPointers + bitsForPayloads
1152: + bitsForCounts + bitsForPositions
1153: + bitsForPositionsOffsets + towerData.bitsForTowers()
1154: + bitsForQuantumBitLengths
1155: + bitsForPositionsQuantumBitLengths
1156: + bitsForEntryBitLengths;
1157: }
1158:
1159: public Properties properties() {
1160: Properties result = new Properties();
1161: result.setProperty(Index.PropertyKeys.DOCUMENTS,
1162: numberOfDocuments);
1163: result.setProperty(Index.PropertyKeys.TERMS, currentTerm + 1);
1164: result.setProperty(Index.PropertyKeys.POSTINGS,
1165: numberOfPostings);
1166: result.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
1167: result.setProperty(Index.PropertyKeys.INDEXCLASS,
1168: FileHPIndex.class.getName());
1169: result.setProperty(BitStreamIndex.PropertyKeys.SKIPQUANTUM, q);
1170: result.setProperty(BitStreamIndex.PropertyKeys.SKIPHEIGHT, h);
1171: if (COOKIES)
1172: result.setProperty("cookies", true);
1173: // We save all flags, except for the PAYLOAD component, which is just used internally.
1174: for (Map.Entry<Component, Coding> e : flags.entrySet())
1175: if (e.getKey() != Component.PAYLOADS)
1176: result.addProperty(Index.PropertyKeys.CODING,
1177: new MutableString().append(e.getKey()).append(
1178: ':').append(e.getValue()));
1179: return result;
1180: }
1181:
1182: public void printStats(final PrintStream stats) {
1183: super .printStats(stats);
1184: stats.println("Skip towers: "
1185: + Util.format(towerData.numberOfSkipTowers)
1186: + " ("
1187: + Util.format(towerData.bitsForTowers())
1188: + " bits ["
1189: + Util.format(towerData.bitsForTowers() * 100.0
1190: / writtenBits())
1191: + "%], "
1192: + Util.format(towerData.bitsForTowers()
1193: / (double) towerData.numberOfSkipTowers)
1194: + " bits/tower)");
1195: stats.println("Skip entries: "
1196: + Util.format(towerData.numberOfEntries())
1197: + " ("
1198: + Util.format(towerData.bitsForEntries()
1199: / (double) towerData.numberOfEntries())
1200: + " bits/entry)");
1201: // Note that lengths are written approximately every other tower.
1202: stats.println("Skip tower lengths: "
1203: + Util.format(towerData.bitsForTowerLengths)
1204: + " bits ("
1205: + Util.format(2.0 * towerData.bitsForTowerLengths
1206: / towerData.numberOfSkipTowers)
1207: + " bits/tower)");
1208: stats.println("Quantum bit lengths: "
1209: + Util.format(bitsForQuantumBitLengths)
1210: + " bits ("
1211: + Util.format(bitsForQuantumBitLengths
1212: / (double) numberOfBlocks) + " bits/block)");
1213: stats.println("Positions quantum bit lengths: "
1214: + Util.format(bitsForPositionsQuantumBitLengths)
1215: + " bits ("
1216: + Util.format(bitsForPositionsQuantumBitLengths
1217: / (double) numberOfBlocks) + " bits/block)");
1218: stats.println("Entry bit lengths: "
1219: + Util.format(bitsForEntryBitLengths)
1220: + " bits ("
1221: + Util.format(bitsForEntryBitLengths
1222: / (double) numberOfBlocks) + " bits/block)");
1223:
1224: stats.println("Top bit skips: "
1225: + Util.format(towerData.bitsForTopBitSkips)
1226: + " bits ("
1227: + Util.format(towerData.bitsForTopBitSkips
1228: / (double) towerData.numberOfTopEntries)
1229: + " bits/skip)");
1230: stats.println("Top positions bit skips: "
1231: + Util.format(towerData.bitsForTopPositionsBitSkips)
1232: + " bits ("
1233: + Util.format(towerData.bitsForTopPositionsBitSkips
1234: / (double) towerData.numberOfTopEntries)
1235: + " bits/skip)");
1236: stats.println("Top pointer skips: "
1237: + Util.format(towerData.bitsForTopSkipPointers)
1238: + " bits ("
1239: + Util.format(towerData.bitsForTopSkipPointers
1240: / (double) towerData.numberOfTopEntries)
1241: + " bits/skip)");
1242: stats.println("Lower bit skips: "
1243: + Util.format(towerData.bitsForLowerBitSkips)
1244: + " bits ("
1245: + Util.format(towerData.bitsForLowerBitSkips
1246: / (double) towerData.numberOfLowerEntries)
1247: + " bits/skip)");
1248: stats.println("Lower positions bit skips: "
1249: + Util.format(towerData.bitsForLowerPositionsBitSkips)
1250: + " bits ("
1251: + Util.format(towerData.bitsForLowerPositionsBitSkips
1252: / (double) towerData.numberOfLowerEntries)
1253: + " bits/skip)");
1254: stats.println("Lower pointer skips: "
1255: + Util.format(towerData.bitsForLowerSkipPointers)
1256: + " bits ("
1257: + Util.format(towerData.bitsForLowerSkipPointers
1258: / (double) towerData.numberOfLowerEntries)
1259: + " bits/skip)");
1260: stats.println("Bit skips: "
1261: + Util.format(towerData.bitsForBitSkips())
1262: + " bits ("
1263: + Util.format(towerData.bitsForBitSkips()
1264: / (double) towerData.numberOfEntries())
1265: + " bits/skip)");
1266: stats.println("Positions bit skips: "
1267: + Util.format(towerData.bitsForPositionsBitSkips())
1268: + " bits ("
1269: + Util.format(towerData.bitsForPositionsBitSkips()
1270: / (double) towerData.numberOfEntries())
1271: + " bits/skip)");
1272: stats.println("Pointer skips: "
1273: + Util.format(towerData.bitsForSkipPointers())
1274: + " bits ("
1275: + Util.format(towerData.bitsForSkipPointers()
1276: / (double) towerData.numberOfEntries())
1277: + " bits/skip)");
1278: }
1279: }
|