001: package it.unimi.dsi.mg4j.index;
002:
003: import it.unimi.dsi.mg4j.index.payload.Payload;
004: import it.unimi.dsi.io.OutputBitStream;
005: import it.unimi.dsi.util.Properties;
006:
007: import java.io.IOException;
008: import java.io.PrintStream;
009:
010: /** An interface for classes that generate indices.
011: *
012: * <p>Implementations of this interface are used to write inverted lists in
013: * sequential order, as follows:
014: * <ul>
015: * <li>to create a new inverted list, you must call {@link #newInvertedList()};
016: * <li>then, you must specify the frequency using {@link #writeFrequency(int)};
017: * <li>the document records follow; before writing a new document record, you must call {@link #newDocumentRecord()};
018: * note that, all in all, the number of calls to {@link #newDocumentRecord()} must be equal to the frequency;
019: * <li>for each document record, you must supply the information needed for the index you are building
020: * ({@linkplain #writeDocumentPointer(OutputBitStream, int) pointer},
021: * {@linkplain #writePayload(OutputBitStream, Payload) payload},
022: * {@linkplain #writePositionCount(OutputBitStream, int) count}, and
023: * {@linkplain #writeDocumentPositions(OutputBitStream, int[], int, int, int) positions}, in this order).
024: * </ul>
025: *
026: * <p>{@link #newDocumentRecord()} returns an {@link OutputBitStream} that must be used to write the document-record data.
027: * Note that there is no guarantee that the returned {@link OutputBitStream} coincides with the
028: * underlying bit stream. Moreover, there is no guarantee as to when the bits will be actually
029: * written on the underlying stream, except that when starting a new inverted list, the previous
030: * inverted list, if any, will be written onto the underlying stream.
031: *
032: * @author Paolo Boldi
033: * @author Sebastiano Vigna
034: * @since 1.2
035: */
036:
037: public interface IndexWriter {
038:
039: /** Starts a new inverted list. The previous inverted list, if any, is actually written
040: * to the underlying bit stream.
041: *
042: * @return the position (in bytes) of the underlying bit stream where the new inverted
043: * list starts.
044: * @throws IllegalStateException if too few records were written for the previous inverted
045: * list.
046: */
047: long newInvertedList() throws IOException;
048:
049: /** Writes the frequency.
050: *
051: * @param frequency the (positive) number of document records that this inverted list will contain.
052: * @return the number of bits written.
053: */
054: int writeFrequency(final int frequency) throws IOException;
055:
056: /** Starts a new document record.
057: *
058: * <p>This method must be called exactly <var>f</var> times, where <var>f</var> is the frequency specified with
059: * {@link #writeFrequency(int)}.
060: *
061: * @return the output bit stream where the next document record data should be written.
062: * @throws IllegalStateException if too many records were written for the current inverted list,
063: * or if there is no current inverted list.
064: */
065: OutputBitStream newDocumentRecord() throws IOException;
066:
067: /** Writes a document pointer.
068: *
069: * <p>This method must be called immediately after {@link #newDocumentRecord()}.
070: *
071: * @param out the output bit stream where the pointer will be written.
072: * @param pointer the document pointer.
073: * @return the number of bits written.
074: */
075: int writeDocumentPointer(final OutputBitStream out,
076: final int pointer) throws IOException;
077:
078: /** Writes the payload for the current document.
079: *
080: * <p>This method must be called immediately after {@link #writeDocumentPointer(OutputBitStream, int)}.
081: *
082: * @param out the output bit stream where the payload will be written.
083: * @param payload the payload.
084: * @return the number of bits written.
085: */
086: int writePayload(final OutputBitStream out, final Payload payload)
087: throws IOException;
088:
089: /** Writes the count of the occurrences of the current term in the current document to the given {@link OutputBitStream}.
090: * @param out the output bit stream where the count should be written.
091: * @param count the count.
092: * @return the number of bits written.
093: */
094: int writePositionCount(final OutputBitStream out, final int count)
095: throws IOException;
096:
097: /** Writes the positions of the occurrences of the current term in the current document to the given {@link OutputBitStream}.
098: *
099: * @param out the output bit stream where the positions should be written.
100: * @param occ the position vector (a sequence of strictly increasing natural numbers).
101: * @param offset the first valid entry in <code>occ</code>.
102: * @param len the number of valid entries in <code>occ</code>.
103: * @param docSize the size of the current document (only for Golomb and interpolative coding; you can safely pass -1 otherwise).
104: * @return the number of bits written.
105: * @throws IllegalStateException if there is no current inverted list.
106: */
107: int writeDocumentPositions(final OutputBitStream out,
108: final int[] occ, final int offset, final int len,
109: final int docSize) throws IOException;
110:
111: /** Returns the overall number of bits written onto the underlying stream(s).
112: *
113: * @return the number of bits written, according to the variables keeping statistical records.
114: */
115: long writtenBits();
116:
117: /** Returns properties of the index generated by this index writer.
118: *
119: * <p>This method should only be called after {@link #close()}.
120: * It returns a new {@linkplain Properties property object}
121: * containing values for (whenever appropriate)
122: * {@link Index.PropertyKeys#DOCUMENTS}, {@link Index.PropertyKeys#TERMS},
123: * {@link Index.PropertyKeys#POSTINGS}, {@link Index.PropertyKeys#MAXCOUNT},
124: * {@link Index.PropertyKeys#INDEXCLASS}, {@link Index.PropertyKeys#CODING}, {@link Index.PropertyKeys#PAYLOADCLASS},
125: * {@link BitStreamIndex.PropertyKeys#SKIPQUANTUM}, and {@link BitStreamIndex.PropertyKeys#SKIPHEIGHT}.
126: *
127: * @return a new set of properties for the just created index.
128: */
129: Properties properties();
130:
131: /** Closes this index writer, completing the index creation process and releasing all resources.
132: *
133: * @throws IllegalStateException if too few records were written for the last inverted list.
134: */
135: void close() throws IOException;
136:
137: /** Writes to the given print stream statistical information about the index just built.
138: * This method must be called after {@link #close()}.
139: *
140: * @param stats a print stream where statistical information will be written.
141: */
142: void printStats(final PrintStream stats);
143:
144: }
|