001: package it.unimi.dsi.mg4j.search;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
025: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
026: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
027: import it.unimi.dsi.mg4j.index.Index;
028: import it.unimi.dsi.mg4j.index.IndexIterator;
029:
030: import java.io.IOException;
031: import java.util.Arrays;
032: import java.util.Comparator;
033: import java.util.Iterator;
034:
035: /** An abstract iterator on documents, generating the intersection of the documents returned by
036: * a number of document iterators.
037: *
038: * <P>To be usable, this class must be subclassed so to provide also an iterator on intervals.
039: * Such iterators must be instantiable using {@link #getComposedIntervalIterator(Index)}.
040: * The latter is an example of a <em>non-static factory method</em>, that is, a factory method which
041: * depends on the enclosing instance. This pattern allows to easily specialise this class to iterators
042: * that do different things, such as {@link it.unimi.dsi.mg4j.search.AndDocumentIterator} and
043: * {@link it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator}, but that have a similar semantics at the document
044: * level (the semantics may in fact be slightly different: for instance, not all document belonging
045: * to all components will actually appear in a consecutive iterator, as there may be documents filtered
046: * at the interval level).
047: *
048: * <P>The important invariant is that <em>only</em> after a call to {@link #nextDocument()}, a call
049: * to {@link #intervalIterator(Index)} will return an interval iterator over the document
050: * just returned, and that for at least one index in {@link #indices()} the iterator will not be empty
051: * or {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE}.
052: *
053: * <h2>The intersection algorithm</h2>
054: *
055: * <p>Since MG4J 1.1, this class implements a new intersection algorithm that should be significantly
056: * faster than the previous one. The main idea is that of letting sparser iterator interact as much
057: * as possible to obtain a candidate common document, and then trying to align the others. At construction
058: * time, the component iterators are sorted so that index iterators are separated, and sorted by frequency.
059: * Then, each time we have to align the iterators we align them greedily starting from the index
060: * iterators, in frequency order. This has the effect of skipping very quickly (and usually by
061: * large jumps, which are handled nicely by indices with skips),
062: * as the main interaction happens between low-frequency index iterators.
063: *
064: * <p>Moreover, this class treats in a special way
065: * {@linkplain PayloadPredicateDocumentIterator index iterators coming from payload-based indices}. Such
066: * iterators are checked at the end of the alignment process,
067: * after all standard index iterators (and general document iterators)
068: * are aligned. At that point, the special method {@link PayloadPredicateDocumentIterator#skipUnconditionallyTo(int)}
069: * is used to position unconditionally such iterators and check whether the payload predicate is satisfied.
070: * If this doesn't happen, the current candidate (obtained by alignment of standard iterators) is increased and the
071: * whole process is restarted. This procedure guarantees that we will never search exhaustively in a
072: * payload-based index a document record satisfying the predicate (unless, of course, we have a query
073: * containing just {@link PayloadPredicateDocumentIterator}s), which is very efficient if the payload-based
074: * index uses skipping.
075: *
076: */
077:
078: public abstract class AbstractIntersectionDocumentIterator extends
079: AbstractCompositeDocumentIterator {
080: private final static boolean DEBUG = false;
081: private final static boolean ASSERTS = false;
082:
083: /** A map from indices to interval iterators. */
084: final private Reference2ReferenceArrayMap<Index, IntervalIterator> intervalIterators;
085: /** A map from indices to the iterators returned for the current document. The key set may
086: * not contain an index because the related iterator has never been requested. Moreover,
087: * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
088: * because it could be {@link IntervalIterators#TRUE TRUE} (in fact, in that case it may even
089: * happen that {@link #intervalIterators} does not contain the index). */
090: final private Reference2ReferenceArrayMap<Index, IntervalIterator> currentIterators;
091: /** An unmodifiable wrapper around {@link #currentIterators}. */
092: final private Reference2ReferenceMap<Index, IntervalIterator> unmodifiableCurrentIterators;
093: /** The provided document iterators, suitably sorted. */
094: final private DocumentIterator[] sortedIterator;
095: /** The last element of {@link #sortedIterator}. */
096: final private DocumentIterator lastIterator;
097: /** The prefix of {@link #sortedIterator} made of {@link PayloadPredicateDocumentIterator}s. */
098: final private PayloadPredicateDocumentIterator[] payloadPredicateDocumentIterator;
099: /** Iterators in {@link #sortedIterator} up to this position (exclusive) are instances of {@link PayloadPredicateDocumentIterator}. */
100: final private int predicateStart;
101: /** When true, the intersection list has been exhausted (of course, if {@link #next} is not -1 there is still an element to be returned). */
102: private boolean exhausted;
103: /** The current element on which all iterators are aligned, unless {@link #exhausted} is true, in which case the value is undefined. */
104: private int curr;
105:
106: /** Creates a new intersection iterator using a given array of iterators.
107: * @param documentIterator the iterators to be insersected (at least one).
108: * @throws IOException
109: */
110: protected AbstractIntersectionDocumentIterator(
111: final DocumentIterator[] documentIterator)
112: throws IOException {
113: super (documentIterator);
114: if (documentIterator.length == 0)
115: throw new IllegalArgumentException();
116: sortedIterator = documentIterator.clone(); // We need a copy to reorder iterators
117:
118: intervalIterators = new Reference2ReferenceArrayMap<Index, IntervalIterator>(
119: indices.size());
120: currentIterators = new Reference2ReferenceArrayMap<Index, IntervalIterator>(
121: indices.size());
122: unmodifiableCurrentIterators = Reference2ReferenceMaps
123: .unmodifiable(currentIterators);
124:
125: // We now reorder iterators putting in the back the index iterators of smallest frequency and moving to front payload-predicate iterators.
126: Arrays.sort(sortedIterator, new Comparator<DocumentIterator>() {
127: public int compare(final DocumentIterator d0,
128: final DocumentIterator d1) {
129: final PayloadPredicateDocumentIterator p0 = d0 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator) d0
130: : null;
131: final PayloadPredicateDocumentIterator p1 = d1 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator) d1
132: : null;
133:
134: if (p0 != null && p1 != null)
135: return 0;
136: if (p0 != null)
137: return -1;
138: if (p1 != null)
139: return 1;
140:
141: final IndexIterator i0 = d0 instanceof IndexIterator ? (IndexIterator) d0
142: : null;
143: final IndexIterator i1 = d1 instanceof IndexIterator ? (IndexIterator) d1
144: : null;
145: if (i0 == null && i1 == null)
146: return 0;
147: if ((i0 != null) != (i1 != null))
148: return (i0 != null) ? 1 : -1;
149: try {
150: return i1.frequency() - i0.frequency();
151: } catch (IOException e) {
152: throw new RuntimeException(e);
153: }
154: }
155: });
156:
157: lastIterator = sortedIterator[n - 1];
158: int i;
159: for (i = n; i-- != 0;)
160: if (sortedIterator[i] instanceof PayloadPredicateDocumentIterator)
161: break;
162: predicateStart = i + 1;
163: payloadPredicateDocumentIterator = new PayloadPredicateDocumentIterator[predicateStart];
164: for (i = predicateStart; i-- != 0;)
165: payloadPredicateDocumentIterator[i] = (PayloadPredicateDocumentIterator) sortedIterator[i];
166:
167: if (DEBUG)
168: System.err.println("Sorted iterators: "
169: + Arrays.toString(sortedIterator));
170:
171: /* We now advance all iterators to their first common pointer, if any. Note
172: * that the difference between documentIterator and this.documentIterator
173: * is immaterial here. */
174:
175: for (i = n; i-- != 0;)
176: if (!sortedIterator[i].hasNext()) {
177: // If any of the iterators is empty, we're over.
178: exhausted = true;
179: return;
180: }
181:
182: if (align())
183: next = curr;
184: else
185: exhausted = true;
186: }
187:
188: /** Align all iterators on the first common document pointer after {@link #curr}.
189: *
190: * <P>After a call to this method, all component iterators are positioned
191: * on the same document (and {@link #curr} is set to that document),
192: * unless the method returns false, in which case there
193: * are no more document pointer making alignment possible (and {@link #exhausted} is true).
194: *
195: * @return true if all component iterators are aligned, false if
196: * no alignment position could be found.
197: */
198: private boolean align() throws IOException {
199: if (ASSERTS)
200: assert !exhausted;
201: if (DEBUG)
202: System.err.println(this + ".align()...");
203:
204: int i, res;
205: int candidate = curr;
206:
207: for (;;) {
208: for (i = n; i-- != 0;) {
209: if (i < predicateStart)
210: res = payloadPredicateDocumentIterator[i]
211: .skipUnconditionallyTo(candidate);
212: else
213: res = sortedIterator[i].skipTo(candidate);
214:
215: if (res == Integer.MAX_VALUE)
216: return false;
217:
218: if (res != candidate) {
219: // Note that for payload-predicate document iterators res might be negative
220: if (res > candidate)
221: candidate = res;
222: else {
223: if (ASSERTS)
224: assert res < 0;
225: candidate++;
226: }
227: break;
228: }
229: }
230:
231: if (i == -1) {
232: curr = candidate;
233: return true;
234: }
235: }
236: }
237:
238: public int skipTo(final int n) throws IOException {
239: if (last >= n)
240: return last;
241: last = next = -1;
242: currentIterators.clear();
243: if (curr < n) {
244: if ((curr = lastIterator.skipTo(n)) == Integer.MAX_VALUE) {
245: exhausted = true;
246: return Integer.MAX_VALUE;
247: } else
248: exhausted = !align();
249: }
250: return exhausted ? Integer.MAX_VALUE : (last = curr);
251: }
252:
253: public int nextDocument() throws IOException {
254: if (DEBUG)
255: System.err.println(this + ".hasNext()");
256:
257: if (next >= 0) {
258: // We already know what to return
259: last = next;
260: next = -1;
261: return last;
262: }
263:
264: last = next = -1;
265: currentIterators.clear();
266: if ((curr = lastIterator.nextDocument()) == -1)
267: exhausted = true;
268: else
269: exhausted = !align();
270: if (exhausted)
271: return -1;
272: return last = curr;
273: }
274:
275: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
276: throws IOException {
277: final Iterator<Index> i = indices.iterator();
278: while (i.hasNext())
279: intervalIterator(i.next());
280: return unmodifiableCurrentIterators;
281: }
282:
283: public IntervalIterator intervalIterator(final Index index)
284: throws IOException {
285: if (DEBUG)
286: System.err.println(this + ".intervalIterator(" + index
287: + ")");
288: if (last == -1)
289: throw new IllegalStateException();
290: if (!indices.contains(index))
291: return IntervalIterators.TRUE;
292: IntervalIterator intervalIterator;
293:
294: // If the iterator has been created and it's ready, we just return it.
295: if ((intervalIterator = currentIterators.get(index)) != null)
296: return intervalIterator;
297:
298: int i, c;
299:
300: /* None of the iterators may be FALSE. Otherwise, if all
301: * iterators are TRUE, we return TRUE. Otherwise, we return
302: * (possibly after creation) the underlying interval iterator.
303: */
304:
305: for (i = c = 0; i < n; i++) {
306: intervalIterator = documentIterator[i]
307: .intervalIterator(index);
308: // We cannot be on a document one of whose iterators if FALSE.
309: if (intervalIterator == IntervalIterators.FALSE)
310: break;
311: if (intervalIterator != IntervalIterators.TRUE)
312: c++;
313: }
314:
315: // Note that we cannot optimise the case c == 1 because of gaps in ConsecutiveDocumentIterator.
316: if (i < n)
317: intervalIterator = IntervalIterators.FALSE;
318: else if (c == 0)
319: intervalIterator = IntervalIterators.TRUE;
320: else {
321: intervalIterator = intervalIterators.get(index);
322: if (intervalIterator == null)
323: intervalIterators
324: .put(
325: index,
326: intervalIterator = getComposedIntervalIterator(index));
327: intervalIterator.reset();
328: }
329:
330: currentIterators.put(index, intervalIterator);
331: return intervalIterator;
332: }
333:
334: abstract protected IntervalIterator getComposedIntervalIterator(
335: Index index);
336: }
|