001: package it.unimi.dsi.mg4j.index;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
025: import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
026: import it.unimi.dsi.Util;
027:
028: import java.io.PrintStream;
029: import java.util.Map;
030:
031: /** An abstract bitstream-based index writer, providing common variables and a basic {@link #printStats(PrintStream)} implementation.
032: *
033: * <H2>Compression flags</H2>
034: *
035: * <P>Implementing subclasses need to know the compression method that they should use
036: * to write frequencies, pointers, payloads, counts and positions (and whether to write any of them).
037: * This information is passed to the {@linkplain #AbstractBitStreamIndexWriter(int, Map) constructor}
038: * using a suitable <em>flag map</em> (see {@link CompressionFlags}).
039: *
040: * @author Sebastiano Vigna
041: * @since 1.2
042: */
043:
044: public abstract class AbstractBitStreamIndexWriter implements
045: IndexWriter {
046:
047: /** The number of documents of the collection to be indexed. */
048: protected final int numberOfDocuments;
049: /** The flag map. */
050: public Map<Component, Coding> flags;
051: /** The coding for frequencies. */
052: protected Coding frequencyCoding;
053: /** The coding for pointers. */
054: protected Coding pointerCoding;
055: /** The coding for counts. */
056: protected Coding countCoding;
057: /** The coding for positions. */
058: protected Coding positionCoding;
059: /** Whether this index contains payloads. */
060: protected final boolean hasPayloads;
061: /** Whether this index contains counts. */
062: protected final boolean hasCounts;
063: /** Whether this index contains positions. */
064: protected final boolean hasPositions;
065:
066: /** The number of indexed postings (pairs term/document). */
067: protected long numberOfPostings;
068: /** The number of indexed occurrences. */
069: protected long numberOfOccurrences;
070: /** The current term. */
071: protected int currentTerm;
072: /** The number of bits written for frequencies. */
073: public long bitsForFrequencies;
074: /** The number of bits written for document pointers. */
075: public long bitsForPointers;
076: /** The number of bits written for counts. */
077: public long bitsForCounts;
078: /** The number of bits written for payloads. */
079: public long bitsForPayloads;
080: /** The number of bits written for positions. */
081: public long bitsForPositions;
082:
083: public AbstractBitStreamIndexWriter(final int numberOfDocuments,
084: final Map<Component, Coding> flags) {
085: this .numberOfDocuments = numberOfDocuments;
086: this .flags = flags;
087: frequencyCoding = flags.get(Component.FREQUENCIES);
088: pointerCoding = flags.get(Component.POINTERS);
089: countCoding = flags.get(Component.COUNTS);
090: positionCoding = flags.get(Component.POSITIONS);
091:
092: hasPayloads = flags.containsKey(Component.PAYLOADS);
093: hasCounts = countCoding != null;
094: hasPositions = positionCoding != null;
095: }
096:
097: public void printStats(PrintStream stats) {
098: stats.println("Number of documents: "
099: + Util.format(numberOfDocuments));
100: stats.println("Number of terms: "
101: + Util.format(currentTerm + 1));
102:
103: stats.println("Frequencies: " + Util.format(bitsForFrequencies)
104: + " bits, "
105: + Util.format(bitsForFrequencies / (currentTerm + 1.0))
106: + " bits/frequency.");
107: stats.println("Document pointers: "
108: + Util.format(numberOfPostings)
109: + " ("
110: + Util.format(bitsForPointers)
111: + " bits, "
112: + Util.format(bitsForPointers
113: / (double) numberOfPostings)
114: + " bits/pointer).");
115:
116: if (hasCounts)
117: stats.println("Counts: "
118: + Util.format(numberOfPostings)
119: + " ("
120: + Util.format(bitsForCounts)
121: + " bits, "
122: + Util.format(bitsForCounts
123: / (double) numberOfPostings)
124: + " bits/count).");
125: if (hasPositions)
126: stats.println("Occurrences: "
127: + Util.format(numberOfOccurrences)
128: + " ("
129: + Util.format(bitsForPositions)
130: + " bits, "
131: + Util.format(bitsForPositions
132: / (double) numberOfOccurrences)
133: + " bits/occurrence).");
134: if (hasPayloads)
135: stats.println("Payloads: "
136: + Util.format(numberOfPostings)
137: + " ("
138: + Util.format(bitsForPayloads)
139: + " bits, "
140: + Util.format(bitsForPayloads
141: / (double) numberOfPostings)
142: + " bits/payload).");
143: if (hasPositions)
144: stats.println("Total: "
145: + Util.format(writtenBits())
146: + " bits, "
147: + Util.format(writtenBits()
148: / (double) numberOfOccurrences)
149: + " bits/occurrence");
150: else
151: stats.println("Total: "
152: + Util.format(writtenBits())
153: + " bits, "
154: + Util.format(writtenBits()
155: / (double) numberOfPostings)
156: + " bits/posting");
157: }
158: }
|