001: package it.unimi.dsi.mg4j.search;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntIterator;
025: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
026: import it.unimi.dsi.fastutil.objects.ReferenceSet;
027: import it.unimi.dsi.mg4j.index.Index;
028: import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
029: import it.unimi.dsi.util.Interval;
030:
031: import java.io.IOException;
032: import java.util.Map;
033: import java.util.NoSuchElementException;
034:
035: /** An iterator over documents (pointers) and their intervals.
036: *
037: * <p><strong>Warning:</strong> the semantics of {@link #nextDocument()} has changed significantly
038: * in MG4J 1.2.
039: *
040: * <p><strong>Warning</strong>: from MG4J 1.2, most methods throw an {@link IOException}
041: * (such exceptions used to be catched and wrapped into a {@link RuntimeException}).
042: *
043: * <p><strong>Warning:</strong> the semantics of {@link #skipTo(int)} has changed significantly
044: * in MG4J 1.1.
045: *
046: * <P>Each call to {@link #nextDocument()}
047: * will return a document pointer, or -1 if no more documents are available. Just
048: * after the call to {@link #nextDocument()}, {@link #intervalIterator(Index)} will return an interval iterator
049: * enumerating intervals in the last returned document for the specified index. The latter method may return, as a special result, a
050: * special {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE} value: this means that
051: * albeit the current document satisfies the query, there is only a generic
052: * empty witness to prove it (see {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE} for some elaboration).
053: *
054: * <p>Note that this class implements {@link IntIterator}. Nonetheless, for performance reasons,
055: * the preferred access to the document pointers is {@link #nextDocument()}.
056: *
057: * <P>The {@link #iterator()} method <strong>must</strong> be an alias for {@link #intervalIterator()}, and shares
058: * the same limitations.
059: *
060: * <p>A document iterator is usually structured as composite,
061: * with operators as internal nodes and {@link it.unimi.dsi.mg4j.index.IndexIterator}s
062: * as leaves. The methods {@link #accept(DocumentIteratorVisitor)}
063: * and {@link #acceptOnTruePaths(DocumentIteratorVisitor)} implement the visitor pattern.
064: *
065: * <p>The {@link #dispose()} method is intended to recursively release all resources associated
066: * to a composite document iterator. Note that this is not always what you want, as you might
067: * be, say, pooling {@linkplain it.unimi.dsi.mg4j.index.IndexReader index readers} to reduce the number
068: * of file open/close operations. For this reason, we intentionally avoid calling the method “close”.
069: *
070: * <p><strong>Warning:</strong> the interval enumeration can be carried out only just after a call
071: * to {@link #nextDocument()}. Subsequent calls to {@link #nextDocument()} <em>or even to {@link java.util.Iterator#hasNext()}</em>
072: * will reset the internal state of the iterator. In particular, trying to enumerate intervals after a call
073: * to {@link java.util.Iterator#hasNext()} will usually throw an {@link java.lang.IllegalStateException}.
074: */
075:
076: public interface DocumentIterator extends IntIterator,
077: Iterable<Interval> {
078:
079: /** Returns the interval iterator of this document iterator for single-index queries.
080: *
081: * <P>This is a commodity method that can be used only for queries
082: * built over a single index.
083: *
084: * @return an interval iterator.
085: * @see #intervalIterator(Index)
086: * @throws IllegalStateException if this document iterator is not built on a single index.
087: */
088: public IntervalIterator intervalIterator() throws IOException;
089:
090: /** Returns the interval iterator of this document iterator for the given index.
091: *
092: * <P>After a call to {@link #nextDocument()}, this iterator
093: * can be used to retrieve the intervals in the current document (the
094: * one returned by {@link #nextDocument()}) for
095: * the index <code>index</code>.
096: *
097: * <P>Note that if all indices have positions,
098: * it is guaranteed that at least one index will return an interval.
099: * However, for disjunctive queries it cannot be guaranteed that <em>all</em>
100: * indices will return an interval.
101: *
102: * <p>Indices without positions always return {@link IntervalIterators#TRUE}.
103: * Thus, in presence of indices without positions it is possible that no
104: * intervals at all are available.
105: *
106: * @param index an index (must be one over which the query was built).
107: * @return an interval iterator over the current document in <code>index</code>.
108: */
109:
110: public IntervalIterator intervalIterator(Index index)
111: throws IOException;
112:
113: /** Returns an unmodifiable map from indices to interval iterators.
114: *
115: * <P>After a call to {@link #nextDocument()}, this map
116: * can be used to retrieve the intervals in the current document. An invocation of {@link Map#get(java.lang.Object)}
117: * on this map with argument <code>index</code> yields the same result as
118: * {@link #intervalIterator(Index) intervalIterator(index)}.
119: *
120: * @return a map from indices to interval iterators over the current document.
121: * @throws UnsupportedOperationException if this index does not contain positions.
122: * @see #intervalIterator(Index)
123: */
124:
125: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
126: throws IOException;
127:
128: /** Returns the set of indices over which this iterator is built.
129: *
130: * @return the set of indices over which this iterator is built.
131: */
132:
133: public ReferenceSet<Index> indices();
134:
135: /** Returns the next document.
136: *
137: * @deprecated As of MG4J 1.2, the suggested way of iterating over document iterators
138: * is {@link #nextDocument()}, which has been modified so to provide fully lazy
139: * iteration. After a couple of releases, however, this annotation will be removed, as it
140: * is very practical to have document iterators implementing {@link IntIterator}. Its
141: * main purpose is to warn people about performance issues solved by {@link #nextDocument()}.
142: * @see #nextDocument()
143: */
144: @Deprecated
145: public int nextInt();
146:
147: /** Returns the next document provided by this document iterator, or -1 if no more documents are available.
148: *
149: * <p><strong>Warning</strong>: the specification of this method has significantly changed as of MG4J 1.2.
150: * The special return value -1 is used to mark the end of iteration (a {@link NoSuchElementException}
151: * would have been thrown before in that case, so ho harm should be caused by this change). The reason
152: * for this change is providing <em>fully lazy</em> iteration over documents. Fully lazy iteration
153: * does not provide an <code>hasNext()</code> method—you have to actually ask for the next
154: * element and check the return value. Fully lazy iteration is much lighter on method calls (half) and
155: * in most (if not all) MG4J classes leads to a much simpler logic. Moreover, {@link #nextDocument()}
156: * can be specified as throwing an {@link IOException}, which avoids the pernicious proliferation
157: * of try/catch blocks in very short, low-level methods (it was having a detectable impact on performance).
158: *
159: * @return the next document, or -1 if no more documents are available.
160: */
161: public int nextDocument() throws IOException;
162:
163: /** Returns the last document returned by {@link #nextDocument()}.
164: *
165: * @return the last document returned by {@link #nextDocument()}, or -1 if no document has been returned yet.
166: */
167:
168: public int document();
169:
170: /** Skips all documents smaller than <code>n</code>.
171: *
172: * <P>Define the <em>current document</em> <code>k</code> associated with this document iterator
173: * as follows:
174: * <ul>
175: * <li>-1, if {@link #nextDocument()} and this method have never been called;
176: * <li>{@link Integer#MAX_VALUE}, if a call to this method returned {@link Integer#MAX_VALUE};
177: * <li>the last value returned by a call to {@link #nextDocument()} or this method, otherwise.
178: * </ul>
179: *
180: * <p>If <code>k</code> is larger than or equal to <code>n</code>, then
181: * this method does nothing and returns <code>k</code>. Otherwise, a
182: * call to this method is equivalent to
183: * <pre>
184: * while( ( k = nextDocument() ) < n && k != -1 );
185: * return k == -1 ? Integer.MAX_VALUE : k;
186: * </pre>
187: *
188: * <P>Thus, when a result <code>k</code> ≠ {@link Integer#MAX_VALUE}
189: * is returned, the state of this iterator
190: * will be exactly the same as after a call to {@link #nextDocument()}
191: * that returned <code>k</code>.
192: * In particular, the first document larger than or equal to <code>n</code> (when returned
193: * by this method) will <em>not</em> be returned by the next call to
194: * {@link #nextDocument()}.
195: *
196: * @param n a document pointer.
197: * @return a document pointer larger than or equal to <code>n</code> if available, {@link Integer#MAX_VALUE}
198: * otherwise.
199: */
200:
201: int skipTo(int n) throws IOException;
202:
203: /** Accepts a visitor.
204: *
205: * <p>A document iterator is usually structured as composite,
206: * with operators as internal nodes and {@link it.unimi.dsi.mg4j.index.IndexIterator}s
207: * as leaves. This method implements the visitor pattern.
208: *
209: * @param visitor the visitor.
210: * @return true if the visit should continue.
211: */
212: boolean accept(DocumentIteratorVisitor visitor) throws IOException;
213:
214: /** Accepts a visitor after a call to {@link #nextDocument()},
215: * limiting recursion to true paths.
216: *
217: * <p>After a call to {@link #nextDocument()}, a document iterator
218: * is positioned over a document. This call is equivalent to {@link #accept(DocumentIteratorVisitor)},
219: * but visits only along <em>true paths</em>.
220: *
221: * <p>We define a <em>true path</em> as a path from the root of the composite that passes only through
222: * nodes whose associated subtree is positioned on the same document of the root. Note that {@link OrDocumentIterator}s
223: * detach exhausted iterators from the composite tree, so true paths define the subtree that is causing
224: * the current document to satisfy the query represented by this document iterator.
225: *
226: * <p>For more elaboration, and the main application of this method, see {@link it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor}.
227: *
228: * @param visitor the visitor.
229: * @return true if the visit should continue.
230: * @see #accept(DocumentIteratorVisitor)
231: * @see it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor
232: */
233: boolean acceptOnTruePaths(DocumentIteratorVisitor visitor)
234: throws IOException;
235:
236: /** Disposes this document iterator, releasing all resources.
237: *
238: * <p>This method should propagate down to the underlying index iterators, where it should release resources
239: * such as open files and network connections. If you're doing your own resource tracking and pooling,
240: * then you do not need to call this method.
241: */
242: void dispose() throws IOException;
243:
244: /** An alias for {@link #intervalIterator()}, that has the same limitations (i.e., it will work only if
245: * there is just one index), and that catches {@link IOException}s.
246: *
247: * @return an interval iterator.
248: */
249: IntervalIterator iterator();
250:
251: }
|