001: package it.unimi.dsi.mg4j.index;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.bits.Fast;
025: import it.unimi.dsi.fastutil.ints.IntIterator;
026: import it.unimi.dsi.fastutil.ints.IntList;
027: import it.unimi.dsi.fastutil.longs.LongList;
028: import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
029: import it.unimi.dsi.mg4j.index.payload.Payload;
030: import it.unimi.dsi.io.InputBitStream;
031: import it.unimi.dsi.util.Interval;
032: import it.unimi.dsi.util.Intervals;
033: import it.unimi.dsi.util.StringMap;
034: import it.unimi.dsi.util.PrefixMap;
035: import it.unimi.dsi.Util;
036: import it.unimi.dsi.util.Properties;
037:
038: import java.io.IOException;
039: import java.io.InputStream;
040: import java.lang.reflect.Constructor;
041:
042: import org.apache.commons.lang.StringUtils;
043: import org.apache.log4j.Logger;
044:
045: /** A {@linkplain BitStreamIndexWriter bitstream-based} index. Instances of this class contains additional
046: * index data related to compression, such as the codes used for each part of the index.
047: *
048: * <P>Implementing subclasses must provide access to the index bitstream
049: * both at {@linkplain #getInputStream() byte} and {@linkplain #getInputBitStream(int) bit} level.
050: *
051: * <A>A bitstream-based index usually exposes {@linkplain #termMap term}
052: * or {@linkplain #prefixMap prefix} maps, but this is not compulsory. Additionally,
053: * the index could also expose the {@linkplain #offsets offset list}
054: * and the {@linkplain #sizes size list}; the latter, in particular, is compulsory
055: * with certain codings.
056: *
057: * <h2>Wired implementations</h2>
058: *
059: * <p>The standard readers associated to an instance of this class are of type {@link BitStreamIndexReader}.
060: * Nonetheless, it is possible to generate automatically sources for wired classes that
061: * work only for a particular set of codings and flags. The wired classes will be fetched
062: * automagically by reflection, if available. Please read the section about performance in the MG4J manual.
063: *
064: * @author Sebastiano Vigna
065: * @since 1.1
066: */
067:
068: public abstract class BitStreamIndex extends Index {
069: private static final long serialVersionUID = 0;
070: private static final Logger LOGGER = Util
071: .getLogger(BitStreamIndex.class);
072: private static final boolean ASSERTS = false;
073:
074: /** Symbolic names for additional properties of a {@link BitStreamIndex}. */
075: public static enum PropertyKeys {
076: /** The skip quantum. */
077: SKIPQUANTUM,
078: /** The skip height. */
079: SKIPHEIGHT,
080: /** The size of the buffer used to read the bit stream. */
081: BUFFERSIZE
082: }
083:
084: /** The default height (fairly low, due to memory consumption). */
085: public final static int DEFAULT_HEIGHT = 8;
086:
087: /** The default quantum. */
088: public final static int DEFAULT_QUANTUM = 64;
089:
090: /** The default buffer size. */
091: public final static int DEFAULT_BUFFER_SIZE = 4 * 1024;
092:
093: /** The coding for frequencies. See {@link CompressionFlags}. */
094: public final Coding frequencyCoding;
095: /** The coding for pointers. See {@link CompressionFlags}. */
096: public final Coding pointerCoding;
097: /** The coding for counts. See {@link CompressionFlags}. */
098: public final Coding countCoding;
099: /** The coding for positions. See {@link CompressionFlags}. */
100: public final Coding positionCoding;
101: /** The offset of each term, if offsets were loaded or specified at creation time, or <code>null</code>. */
102: public final LongList offsets;
103: /** The term map for this index, or <code>null</code> if the term map was not loaded. */
104: public final StringMap<? extends CharSequence> termMap;
105: /** The prefix map for this index, or <code>null</code> if the prefix map was not loaded. */
106: public final PrefixMap<? extends CharSequence> prefixMap;
107: /** The parameter <code>h</code> (the maximum height of a skip tower), or -1 if this index has no skips. */
108: public final int height;
109: /** The quantum, or -1 if this index has no skips. */
110: public final int quantum;
111: /** The size of the buffer used to read the bit stream. */
112: public final int bufferSize;
113: /** The constructor that will be used to create new index readers. */
114: public final Constructor<? extends IndexReader> readerConstructor;
115:
116: public BitStreamIndex(final int numberOfDocuments,
117: final int numberOfTerms, final long numberOfPostings,
118: final long numberOfOccurrences, final int maxCount,
119: final Payload payload, final Coding frequencyCoding,
120: final Coding pointerCoding, final Coding countCoding,
121: final Coding positionCoding, final int quantum,
122: final int height, final int bufferSize,
123: final TermProcessor termProcessor, final String field,
124: final Properties properties,
125: final StringMap<? extends CharSequence> termMap,
126: final PrefixMap<? extends CharSequence> prefixMap,
127: final IntList sizes, final LongList offsets) {
128: super (numberOfDocuments, numberOfTerms, numberOfPostings,
129: numberOfOccurrences, maxCount, payload,
130: countCoding != null, positionCoding != null,
131: termProcessor, field, sizes, properties);
132: this .frequencyCoding = frequencyCoding;
133: this .pointerCoding = pointerCoding;
134: this .countCoding = countCoding;
135: this .positionCoding = positionCoding;
136: this .termMap = termMap;
137: this .prefixMap = prefixMap;
138: this .offsets = offsets;
139: this .quantum = quantum;
140: this .height = height;
141: this .bufferSize = bufferSize;
142:
143: if (quantum != -1) {
144: if (height < 0)
145: throw new IllegalArgumentException("Illegal height "
146: + height);
147: if (quantum <= 0 || (quantum & -quantum) != quantum)
148: throw new IllegalArgumentException("Illegal quantum "
149: + quantum);
150: }
151:
152: readerConstructor = getConstructor();
153: }
154:
155: @SuppressWarnings("unchecked")
156: protected Constructor<? extends IndexReader> getConstructor() {
157: Class<? extends IndexReader> readerClass = BitStreamIndexReader.class;
158: String className = BitStreamIndexReader.class.getPackage()
159: .getName()
160: + ".wired."
161: + (quantum != -1 ? "Skip" : "")
162: + featureName(frequencyCoding)
163: + featureName(pointerCoding)
164: + (hasPayloads ? "Payloads " : featureName(countCoding)
165: + featureName(positionCoding))
166: + BitStreamIndexReader.class.getSimpleName();
167:
168: try {
169: readerClass = (Class<? extends IndexReader>) Class
170: .forName(className);
171: LOGGER.info("Dynamically fetched reader class "
172: + readerClass.getSimpleName());
173: } catch (Exception e) {
174: LOGGER.info("Cannot fetch dynamically class " + className
175: + "; falling back to generic (slower) class "
176: + BitStreamIndexReader.class.getSimpleName());
177: }
178:
179: try {
180: return readerClass.getConstructor(BitStreamIndex.class,
181: InputBitStream.class);
182: } catch (Exception shouldntReallyHappen) {
183: throw new RuntimeException(
184: "Cannot find suitable constructor in "
185: + readerClass.getSimpleName());
186: }
187: }
188:
189: protected static String featureName(final Coding coding) {
190: return StringUtils
191: .capitalize((coding == null ? CompressionFlags.NONE
192: : coding.toString()).toLowerCase());
193: }
194:
195: /** Returns an input bit stream over the index.
196: *
197: * @param bufferSize a suggested buffer size.
198: * @return an input bit stream over the index.
199: */
200: public abstract InputBitStream getInputBitStream(
201: final int bufferSize) throws IOException;
202:
203: /** Returns an input stream over the index.
204: *
205: * @return an input stream over the index.
206: */
207: public abstract InputStream getInputStream() throws IOException;
208:
209: public IndexReader getReader(final int bufferSize)
210: throws IOException {
211: try {
212: return readerConstructor
213: .newInstance(
214: this ,
215: getInputBitStream(bufferSize == -1 ? this .bufferSize
216: : bufferSize));
217: } catch (IOException e) {
218: throw e;
219: } catch (Exception e) {
220: throw new RuntimeException(e);
221: }
222: }
223:
224: /** Returns a {@link MultiTermIndexIterator} over all terms starting with the given prefix,
225: * provided their number does not exceed the given limit and that this index has a {@link #prefixMap}.
226: */
227:
228: public IndexIterator documents(final CharSequence prefix,
229: final int limit) throws IOException, TooManyTermsException {
230: if (prefixMap != null) {
231: final Interval interval = prefixMap.rangeMap().get(prefix);
232: if (interval == Intervals.EMPTY_INTERVAL)
233: return emptyIndexIterator;
234: final IndexIterator result;
235:
236: if (interval.length() > limit)
237: throw new TooManyTermsException(interval.length());
238:
239: if (interval.length() == 1)
240: result = documents(interval.left);
241: else {
242: IndexIterator[] baseIterator = new IndexIterator[interval
243: .length()];
244: int k = 0;
245: for (IntIterator i = interval.iterator(); i.hasNext();)
246: baseIterator[k++] = documents(i.nextInt());
247:
248: result = MultiTermIndexIterator.getInstance(this ,
249: baseIterator);
250: }
251: result.term(prefix + "*");
252: return result;
253: } else
254: throw new UnsupportedOperationException("Index " + this
255: + " has no prefix map");
256: }
257:
258: /** Fixed number of fractional binary digits used in fixed-point computation of Golomb moduli. */
259: public final static int FIXED_POINT_BITS = 31;
260: /** <code>1L << {@link #FIXED_POINT_BITS}</code>. */
261: public final static long FIXED_POINT_MULTIPLIER = (1L << FIXED_POINT_BITS);
262:
263: private final static int[] GOLOMB_STEP = {
264: (int) (.3819660112501052 * FIXED_POINT_MULTIPLIER),
265: (int) (.245122333753307 * FIXED_POINT_MULTIPLIER),
266: (int) (.1808274866038356 * FIXED_POINT_MULTIPLIER),
267: (int) (.1433251161454971 * FIXED_POINT_MULTIPLIER),
268: (int) (.1187285383664304 * FIXED_POINT_MULTIPLIER),
269: (int) (.1013462873713007 * FIXED_POINT_MULTIPLIER),
270: (int) (.0884076465179451 * FIXED_POINT_MULTIPLIER),
271: (int) (.0784006803660170 * FIXED_POINT_MULTIPLIER),
272: (int) (.0704298717679771 * FIXED_POINT_MULTIPLIER),
273: (int) (.0639308889222416 * FIXED_POINT_MULTIPLIER),
274: (int) (.0585303826783648 * FIXED_POINT_MULTIPLIER),
275: (int) (.0539714717143864 * FIXED_POINT_MULTIPLIER),
276: (int) (.0500716000363801 * FIXED_POINT_MULTIPLIER),
277: (int) (.0466974625983358 * FIXED_POINT_MULTIPLIER),
278: (int) (.0437494423620109 * FIXED_POINT_MULTIPLIER), };
279:
280: private final static int GOLOMB_STEP_LENGTH = GOLOMB_STEP.length;
281: private final static int GOLOMB_THRESHOLD = (int) (.0411515989924386 * FIXED_POINT_MULTIPLIER);
282: private final static long GOLOMB_ADD = (long) ((-(1 + Math.log(2)) / 2) * FIXED_POINT_MULTIPLIER),
283: GOLOMB_MULT = (int) (Math.log(2) * FIXED_POINT_MULTIPLIER);
284:
285: /** Computes the Golomb modulus for a given fraction using
286: * fixed-point arithmetic and a precomputed table for
287: * small values. This gives results that are
288: * extremely close to ⌈ log( 2 - <code>p</code>/<code>q</code> ) / log( 1 - <code>p</code>/<code>q</code> ) ⌉,
289: * but the computation is orders of magnitude quicker.
290: *
291: * @param p the numerator.
292: * @param q the denominator (larger than or equal to <code>p</code>).
293: * @return the Golomb modulus for <code>p</code>/<code>q</code>.
294: */
295: public static int golombModulus(final int p, final int q) {
296: if (ASSERTS)
297: assert p <= q;
298: final int f = (int) ((p * FIXED_POINT_MULTIPLIER) / q);
299: if (f < GOLOMB_THRESHOLD)
300: return (int) ((GOLOMB_ADD + (GOLOMB_MULT * q) / p
301: + FIXED_POINT_MULTIPLIER - 1) >> FIXED_POINT_BITS);
302: int i = GOLOMB_STEP_LENGTH;
303: while (i-- != 0)
304: if (f < GOLOMB_STEP[i])
305: return i + 2;
306: return 1;
307: }
308:
309: /** Fixed-point reprentation of the constant part of the formula for Gaussian Golomb codes. */
310: private final static long GOLOMB_GAUSSIAN = (long) ((2 * Math
311: .sqrt(2 / Math.PI) * Math.log(2)) * FIXED_POINT_MULTIPLIER);
312: /** Fixed-point representation of the square root of 2<sup>i-1</sup>.*/
313: private final static long[] SQRT_2_TO = new long[32];
314: static {
315: for (int i = SQRT_2_TO.length; i-- != 0;)
316: SQRT_2_TO[i] = (long) ((Math.sqrt((1L << i) / 2.0) * FIXED_POINT_MULTIPLIER));
317: }
318:
319: /** Computes the Gaussian Golomb modulus for a given standard deviation
320: * and shift using fixed-point arithmetic.
321: *
322: * <p>The Golomb modulus for (positive and negative)
323: * integers normally distributed with standard deviation σ can be computed using
324: * the formula ⌈ 2 sqrt( 2 / π ) ln(2) σ ⌉.
325: *
326: * <P>The resulting Golomb modulus is near to optimal for coding such
327: * integers after they have been passed through {@link Fast#int2nat(int)}. Note,
328: * however, that Golomb coding is <em>not</em> optimal for a normal distribution.
329: *
330: * <p>This function is used to compute the correct Golomb modulus for skip towers.
331: *
332: * @param quantumSigma the standard deviation of a quantum as returned by {@link #quantumSigma(int, int, int)}.
333: * @param shift a shift parameter.
334: * @return the Golomb modulus for the standard deviation obtained multiplying <code>quantumSigma</code> by
335: * the square root of 2<sup><code>shift</code>-1</sup>.
336: */
337: public static int gaussianGolombModulus(final long quantumSigma,
338: final int shift) {
339: return (int) (((((GOLOMB_GAUSSIAN >> FIXED_POINT_BITS / 2) * (quantumSigma >> FIXED_POINT_BITS
340: - FIXED_POINT_BITS / 2)) >> FIXED_POINT_BITS / 2) * (SQRT_2_TO[shift] >> FIXED_POINT_BITS
341: - FIXED_POINT_BITS / 2))
342: + FIXED_POINT_MULTIPLIER - 1 >> FIXED_POINT_BITS);
343: }
344:
345: /** Computes the standard deviation associated to a given quantum and document frequency.
346: *
347: * @param frequency the document frequency.
348: * @param numberOfDocuments the overall number of documents.
349: * @param quantum the quantum.
350: * @return a long representing in fixed-point arithmetic the value <code>Math.sqrt( quantum * ( 1 - p ) ) / p</code>, where
351: * <code>p</code> is the relative frequency.
352: */
353: public static long quantumSigma(final int frequency,
354: final int numberOfDocuments, final int quantum) {
355: return (long) (((Math.sqrt(quantum
356: * (1 - (double) frequency / numberOfDocuments)) * numberOfDocuments) / frequency) * FIXED_POINT_MULTIPLIER);
357: }
358: }
|