001: package it.unimi.dsi.mg4j.search;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntArrays;
025: import it.unimi.dsi.fastutil.ints.IntSet;
026: import it.unimi.dsi.mg4j.index.Index;
027: import it.unimi.dsi.util.Interval;
028: import it.unimi.dsi.util.Intervals;
029:
030: import java.io.IOException;
031:
032: /** An iterator returning documents containing consecutive intervals (in query order)
033: * satisfying the underlying queries.
034: *
035: * <p>As an additional service, this class makes it possible to specify <em>gaps</em> between
036: * intervals. If gaps are specified, a match will satisfy the condition
037: * that the left extreme of the first interval is larger than or equal to the
038: * first gap, the left extreme of the second interval is larger than
039: * the right extreme of the first interval plus the second gap, and so on. The standard
040: * semantics corresponds thus to the everywhere zero gap array.
041: *
042: * <p>This semantics
043: * makes it possible to peform phrasal searches “with holes”, typically
044: * because of stopwords that have not been indexed. Note that it is possible to specify
045: * a gap <em>before</em> the first interval, but not <em>after</em> the last interval,
046: * as in general the document length is not known at this level of query resolution.
047: *
048: * <p>This class will handle correctly {@link IntervalIterators#TRUE TRUE} iterators; in this
049: * case, the semantics is defined as follows: an interval is in the output if it is formed by the union of disjoint intervals,
050: * one from each input list, and each gap of value <var>k</var> corresponds to <var>k</var> iterators
051: * returning all document positions as singleton intervals. Since {@link IntervalIterators#TRUE TRUE} represents a list containing just
052: * the empty interval, the result is equivalent to dropping {@link IntervalIterators#TRUE TRUE} iterators from the input; as
053: * a consequence, the gap of a {@link IntervalIterators#TRUE TRUE} iterator is merged with that of the following iterator.
054: *
055: * <p><strong>Warning</strong>: In case gaps are specified, the mathematically correct semantics would require that
056: * gaps before {@link IntervalIterators#TRUE TRUE} iterators that are not followed by any non-{@link IntervalIterators#TRUE TRUE} iterators
057: * have the effect of enlarging the resulting intervals <em>on the right side</em>. However,
058: * this behaviour is very difficult to implement at this level because document lengths are not known. For this
059: * reason, if one or more {@link IntervalIterators#TRUE TRUE} iterators appear a the end of the component iterator list they will be simply dropped.
060: */
061:
062: public class ConsecutiveDocumentIterator extends
063: AbstractOrderedIntervalDocumentIterator {
064: /** The gap array. This is essentially the array provided at construction time; however, if a {@link ConsecutiveIndexIntervalIterator}
065: * is requested by {@link #getComposedIntervalIterator(Index)} this array will be used to store <em>cumulative</em> gaps. */
066: private final int gap[];
067:
068: /** Returns a document iterator that computes the consecutive AND of the given array of iterators.
069: *
070: * <P>Note that the special case of the empty and of the singleton arrays
071: * are handled efficiently.
072: *
073: * @param numberOfDocuments the number of documents; relevant only if <code>it</code> has zero length.
074: * @param documentIterator the iterators to be composed.
075: * @return a document iterator that computes the consecutive AND of <code>it</code>.
076: * @throws IOException
077: */
078: public static DocumentIterator getInstance(
079: final int numberOfDocuments,
080: final DocumentIterator... documentIterator)
081: throws IOException {
082: if (documentIterator.length == 0)
083: return NotDocumentIterator
084: .getInstance(DocumentIterators.EMPTY_ITERATOR,
085: numberOfDocuments);
086: if (documentIterator.length == 1)
087: return documentIterator[0];
088: return new ConsecutiveDocumentIterator(documentIterator, null);
089: }
090:
091: /** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators.
092: *
093: * <P>Note that the special case of the singleton array is handled efficiently.
094: *
095: * @param documentIterator the iterators to be composed (at least one).
096: * @return a document iterator that computes the consecutive AND of <code>documentIterator</code>.
097: * @throws IOException
098: */
099: public static DocumentIterator getInstance(
100: final DocumentIterator... documentIterator)
101: throws IOException {
102: if (documentIterator.length == 1)
103: return documentIterator[0];
104: return getInstance(-1, documentIterator);
105: }
106:
107: /** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators, adding
108: * gaps between intervals.
109: *
110: * <p>A match will satisfy the condition
111: * that the left extreme of the first interval is larger than or equal to the
112: * first gap, the left extreme of the second interval is larger than
113: * the right extreme of the first interval plus the second gap, and so on. This semantics
114: * makes it possible to perform phrasal searches “with holes”, typically
115: * because of stopwords that have not been indexed.
116: *
117: * @param documentIterator the iterators to be composed (at least one).
118: * @param gap an array of gaps parallel to <code>documentIterator</code>, or <code>null</code> for no gaps.
119: * @return a document iterator that computes the consecutive AND of <code>documentIterator</code> using the given gaps.
120: * @throws IOException
121: */
122: public static DocumentIterator getInstance(
123: final DocumentIterator documentIterator[], final int gap[])
124: throws IOException {
125: if (gap != null && gap.length != documentIterator.length)
126: throw new IllegalArgumentException(
127: "The number of gaps ("
128: + gap.length
129: + ") is not equal to the number of document iterators ("
130: + documentIterator.length + ")");
131: if (documentIterator.length == 1
132: && (gap == null || gap[0] == 0))
133: return documentIterator[0];
134: return new ConsecutiveDocumentIterator(documentIterator, gap);
135: }
136:
137: protected ConsecutiveDocumentIterator(
138: final DocumentIterator[] documentIterator, final int[] gap)
139: throws IOException {
140: super (documentIterator);
141: if (gap == null)
142: this .gap = new int[n];
143: else
144: this .gap = gap.clone();
145: }
146:
147: protected IntervalIterator getComposedIntervalIterator(
148: final Index unused) {
149: if (ASSERTS)
150: assert unused == soleIndex;
151: if (indexIterator == null)
152: return new ConsecutiveIntervalIterator(gap);
153: // In this case, gap must be made cumulative.
154: for (int i = 1; i < n; i++)
155: gap[i] += gap[i - 1] + 1;
156: return new ConsecutiveIndexIntervalIterator(gap);
157: }
158:
159: /** An interval iterator returning the BLOCK of the component iterator
160: * (i.e., intervals made of sequences of consecutive intervals
161: * of the component iterator, in the given order).
162: *
163: * <p>In this implementation, {@link #advanced} is
164: * never true when {@link AbstractOrderedIntervalIterator#endOfProcess} is true.
165: */
166:
167: private class ConsecutiveIntervalIterator extends
168: AbstractOrderedIntervalIterator {
169: /** A cached reference to the gap array. */
170: @SuppressWarnings("hiding")
171: private final int[] gap;
172: /** The actual gaps. They depend on whether some {@link IntervalIterators#TRUE} iterator reduces the iterator array. */
173: private final int[] actualGap;
174: /** Whether the scan is over. */
175: private boolean endOfProcess;
176: /** The number of non-{@link IntervalIterators#TRUE} interval iterator. */
177: private int m;
178:
179: public ConsecutiveIntervalIterator(final int[] gap) {
180: this .gap = gap;
181: // The enlargement is made necessary by the filling long in reset().
182: this .actualGap = new int[n + 1];
183: }
184:
185: /** Loads {@link #curr} with the first interval from each non-{@link IntervalIterators#TRUE} iterator, leaving
186: * in {@link #m} the number of non-{@link IntervalIterators#TRUE} iterators, and in {@link #actualGap}
187: * the gaps to be used for those {@link #m} iterators.
188: */
189:
190: public void reset() throws IOException {
191: final int[] actualGap = this .actualGap;
192: final int[] gap = this .gap;
193: final IntervalIterator[] intervalIterator = this .intervalIterator;
194:
195: actualGap[m = 0] = 0;
196:
197: int i;
198: for (i = 0; i < n; i++) {
199: intervalIterator[m] = documentIterator[i]
200: .intervalIterator();
201: if (ASSERTS)
202: assert intervalIterator[m].hasNext();
203: actualGap[m] += gap[i];
204: curr[m] = Intervals.MINUS_INFINITY;
205: if (intervalIterator[m] != IntervalIterators.TRUE)
206: actualGap[++m] = 1;
207: }
208:
209: if (m == 0)
210: throw new IllegalStateException();
211:
212: next = null;
213: do
214: curr[0] = intervalIterator[0].nextInterval();
215: while (curr[0] != null && curr[0].left < actualGap[0]);
216: if (!(endOfProcess = curr[0] == null))
217: next = align();
218: }
219:
220: public void intervalTerms(final IntSet terms) {
221: for (int i = m; i-- != 0;)
222: intervalIterator[i].intervalTerms(terms);
223: }
224:
225: private Interval align() throws IOException {
226: if (DEBUG)
227: System.err.println(this + ".align()");
228:
229: final Interval[] curr = this .curr;
230: final int[] actualGap = this .actualGap;
231: final IntervalIterator[] intervalIterator = this .intervalIterator;
232:
233: if (DEBUG)
234: System.err.println(java.util.Arrays.asList(curr));
235: int k = 0;
236: Interval interval;
237:
238: while (k < m) {
239: for (k = 1; k < m; k++) {
240: while (curr[k].left < curr[k - 1].right
241: + actualGap[k])
242: if ((curr[k] = intervalIterator[k]
243: .nextInterval()) == null) {
244: endOfProcess = true;
245: return null;
246: }
247:
248: if (curr[k].left > curr[k - 1].right + actualGap[k]) {
249: final int limit = curr[k].left - actualGap[k];
250: /* Note that we are not interested in intervals whole right extreme is
251: * smaller than limit, as it is impossible to find a match in that case. */
252: while ((interval = intervalIterator[0]
253: .nextInterval()) != null
254: && interval.right < limit)
255: ;
256: if (endOfProcess = interval == null)
257: return null;
258: curr[0] = interval;
259: break;
260: }
261: }
262: }
263:
264: return Interval.valueOf(curr[0].left - actualGap[0],
265: curr[m - 1].right);
266: }
267:
268: public Interval nextInterval() throws IOException {
269: if (next != null) {
270: final Interval result = next;
271: next = null;
272: return result;
273: }
274:
275: if (endOfProcess)
276: return null;
277:
278: if ((curr[0] = intervalIterator[0].nextInterval()) == null) {
279: endOfProcess = true;
280: return null;
281: }
282:
283: return align();
284: }
285:
286: public int extent() {
287: int s = 0;
288: for (int i = m; i-- != 0;)
289: s += intervalIterator[i].extent() + actualGap[i];
290: return s;
291: }
292:
293: }
294:
295: private class ConsecutiveIndexIntervalIterator extends
296: AbstractOrderedIndexIntervalIterator {
297: /** A cached reference to the gap array. */
298: @SuppressWarnings("hiding")
299: private final int[] gap;
300: /** Whether the scan is over. */
301: private boolean endOfProcess;
302:
303: public ConsecutiveIndexIntervalIterator(final int[] gap) {
304: this .gap = gap;
305: }
306:
307: /** Resets the iterator by calling the superclass method, and then aligning all iterators.
308: *
309: * <p>Note that in this class {@link #curr} is used to denote the value of the current position
310: * minus the corresponding {@linkplain #gap cumulative gap}; this method updates {@link #curr} accordingly. */
311:
312: public void reset() throws IOException {
313: final int[][] position = this .position;
314: final int[] currPos = this .currPos;
315: final int[] curr = this .curr;
316: final int[] count = this .count;
317: final int[] gap = this .gap;
318:
319: IntArrays.fill(currPos, 0);
320: for (int i = n; i-- != 0;) {
321: count[i] = indexIterator[i].count();
322: position[i] = indexIterator[i].positionArray();
323: curr[i] = position[i][0];
324: }
325: endOfProcess = false;
326: for (int i = n; i-- != 0;)
327: curr[i] -= gap[i];
328:
329: if (gap[0] != 0) {
330: // Go beyond the 0-th gap. This must be done just once.
331: next = null;
332: while (curr[0] < 0 && ++currPos[0] < count[0])
333: curr[0] = position[0][currPos[0]] - gap[0];
334: endOfProcess = currPos[0] == count[0];
335: }
336: if (!endOfProcess)
337: next = align();
338: }
339:
340: public void intervalTerms(final IntSet terms) {
341: for (int i = n; i-- != 0;)
342: terms.add(indexIterator[i].termNumber());
343: }
344:
345: private Interval align() {
346: if (DEBUG)
347: System.err.println(this + ".align()");
348:
349: final int[][] position = this .position;
350: final int[] currPos = this .currPos;
351: final int[] curr = this .curr;
352: final int[] count = this .count;
353: final int[] gap = this .gap;
354:
355: int c, k = 1, l = n <= 2 ? 0 : 2; // This is actually 2 % n
356: boolean oneRoundDone = false;
357: int[] p;
358: int start = curr[0];
359:
360: for (;;) {
361: p = position[k];
362: c = currPos[k];
363: // First, we try to align the k-th term.
364: while (c < count[k] && p[c] - gap[k] < start)
365: c++;
366: // If we exhaust the term positions, it's all over.
367: if (c == count[k]) {
368: endOfProcess = true;
369: return null;
370: }
371: curr[k] = p[currPos[k] = c] - gap[k];
372: // If we went beyond start + k, we must update start.
373: if (curr[k] > start)
374: start = curr[k];
375:
376: // If k == 0, it means we have made a full round of alignment, so the next check is now valid.
377: oneRoundDone |= (k == 0);
378: // If oneRoundDone, all current normalised positions (curr[ x ] - gap[ x ]) are squeezed between start and curr[ l ].
379: if (oneRoundDone && curr[l] == start)
380: return Interval.valueOf(curr[0], curr[0]
381: + gap[n - 1]);
382: k = l;
383:
384: if ((l = l + 1) == n)
385: l = 0;
386: }
387: }
388:
389: public Interval nextInterval() {
390: if (next != null) {
391: final Interval result = next;
392: next = null;
393: return result;
394: }
395:
396: if (endOfProcess)
397: return null;
398:
399: if (++currPos[0] < count[0])
400: curr[0] = position[0][currPos[0]] - gap[0];
401: else {
402: endOfProcess = true;
403: return null;
404: }
405: return align();
406: }
407: }
408: }
|