001: package it.unimi.dsi.mg4j.search;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntSet;
025: import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
026: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
027: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
028: import it.unimi.dsi.fastutil.objects.ReferenceSet;
029: import it.unimi.dsi.mg4j.index.Index;
030: import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
031: import it.unimi.dsi.util.Interval;
032:
033: import java.io.IOException;
034: import java.util.Iterator;
035:
036: /** A document iterator that filters another document iterator, returning just intervals (and containing
037: * documents) whose length does not exceed a given threshold.
038: *
039: * @author Paolo Boldi
040: * @author Sebastiano Vigna
041: * @since 0.9
042: */
043:
044: public class LowPassDocumentIterator extends AbstractDocumentIterator {
045:
046: @SuppressWarnings("unused")
047: private final static boolean DEBUG = false;
048: @SuppressWarnings("unused")
049: private final static boolean ASSERTS = false;
050:
051: /** The underlying iterator. */
052: final private DocumentIterator documentIterator;
053: /** If not <code>null</code>, the sole index involved in this iterator. */
054: final private Index soleIndex;
055: /** The iterator threshold. */
056: final protected int threshold;
057: /** A map from indices to interval iterators. */
058: final private Reference2ReferenceArrayMap<Index, IntervalIterator> intervalIterators;
059: /** A map from indices to the iterators returned for the current document. The key set may
060: * not contain an index because the related iterator has never been requested. Moreover,
061: * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
062: * because it could be {@link IntervalIterators#TRUE} (in fact, in that case it may even
063: * happen that {@link #intervalIterators} does not contain the index). */
064: final private Reference2ReferenceArrayMap<Index, IntervalIterator> currentIterators;
065: /** An unmodifiable wrapper around {@link #currentIterators}. */
066: final private Reference2ReferenceMap<Index, IntervalIterator> unmodifiableCurrentIterators;
067:
068: /** Creates a new low-pass document iterator over a given iterator.
069: * @param documentIterator the iterator to be filtered.
070: * @param threshold the filter threshold.
071: */
072: protected LowPassDocumentIterator(
073: final DocumentIterator documentIterator, final int threshold) {
074: this .documentIterator = documentIterator;
075: this .threshold = threshold;
076: final int n = documentIterator.indices().size();
077: soleIndex = n == 1 ? indices().iterator().next() : null;
078: intervalIterators = new Reference2ReferenceArrayMap<Index, IntervalIterator>(
079: n);
080: currentIterators = new Reference2ReferenceArrayMap<Index, IntervalIterator>(
081: n);
082: unmodifiableCurrentIterators = Reference2ReferenceMaps
083: .unmodifiable(currentIterators);
084: }
085:
086: /** Returns a low-pass document iterator over a given iterator.
087: * @param it the iterator to be filtered.
088: * @param threshold the filter threshold.
089: */
090: public static LowPassDocumentIterator getInstance(
091: final DocumentIterator it, final int threshold) {
092: return new LowPassDocumentIterator(it, threshold);
093: }
094:
095: public ReferenceSet<Index> indices() {
096: return documentIterator.indices();
097: }
098:
099: public int nextDocument() throws IOException {
100: if (next >= 0) {
101: last = next;
102: next = -1;
103: return last;
104: }
105:
106: do
107: currentIterators.clear();
108: while ((last = documentIterator.nextDocument()) != -1
109: && !isValid());
110: return last;
111: }
112:
113: public int skipTo(final int n) throws IOException {
114: // The easy case.
115: if (last >= n)
116: return last;
117:
118: last = next = -1;
119: currentIterators.clear();
120: // We first try to get a candidate document.
121: final int res = documentIterator.skipTo(n);
122: // If this doesn't work, be bail out.
123: if (res == Integer.MAX_VALUE)
124: return Integer.MAX_VALUE;
125:
126: last = res;
127: // Otherwise, we must manually check that we are on a valid document
128: if (isValid())
129: return res;
130: // If not, we invalidate and check whether there is another possible document.
131: return nextDocument() != -1 ? last : Integer.MAX_VALUE;
132: }
133:
134: private boolean isValid() throws IOException {
135: /* The policy here is that a low-pass is valid is at least one of the underlying
136: * interval iterators, once filtered, would return at least one interval. Note
137: * that TRUE iterators are not actually filtered, so they always
138: * return true on a call to hasNext(). */
139:
140: if (soleIndex == null)
141: return intervalIterator(soleIndex).hasNext();
142:
143: for (Index index : indices())
144: if (intervalIterator(index).hasNext())
145: return true;
146: return false;
147: }
148:
149: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
150: throws IOException {
151: final Iterator<Index> i = indices().iterator();
152: while (i.hasNext())
153: intervalIterator(i.next());
154: return unmodifiableCurrentIterators;
155: }
156:
157: public IntervalIterator intervalIterator() throws IOException {
158: if (soleIndex == null)
159: throw new IllegalStateException();
160: return intervalIterator(soleIndex);
161: }
162:
163: public IntervalIterator intervalIterator(final Index index)
164: throws IOException {
165: if (last == -1)
166: throw new IllegalStateException();
167: if (DEBUG)
168: System.err.println(this + ".intervalIterator(" + index
169: + ")");
170: if (!documentIterator.indices().contains(index))
171: return IntervalIterators.TRUE;
172:
173: IntervalIterator intervalIterator;
174:
175: // If the iterator has been created and it's ready, we just return it.
176: if ((intervalIterator = currentIterators.get(index)) != null)
177: return intervalIterator;
178:
179: intervalIterator = documentIterator.intervalIterator(index);
180:
181: /* If the underlying iterator is TRUE or FALSE, then our constribution to the result is not relevant,
182: * and we just pass this information upwards. E.g., consider the query (A OR title:B)~2 with
183: * a document containing A but not B in its title. When evaluating the query for the title index,
184: * the subquery before the low-pass operator evalutes to TRUE, meaning that its truth is independent
185: * of the title field. This fact is not changed by the low-pass operator. */
186: if (intervalIterator != IntervalIterators.FALSE
187: && intervalIterator != IntervalIterators.TRUE) {
188: intervalIterator = intervalIterators.get(index);
189: if (intervalIterator == null)
190: intervalIterators.put(index,
191: intervalIterator = new LowPassIntervalIterator(
192: index));
193: intervalIterator.reset();
194: }
195:
196: currentIterators.put(index, intervalIterator);
197: return intervalIterator;
198: }
199:
200: public void dispose() throws IOException {
201: documentIterator.dispose();
202: }
203:
204: public boolean accept(final DocumentIteratorVisitor visitor)
205: throws IOException {
206: return visitor.visitPre(this )
207: && documentIterator.accept(visitor)
208: && visitor.visitPost(this );
209: }
210:
211: public boolean acceptOnTruePaths(
212: final DocumentIteratorVisitor visitor) throws IOException {
213: return visitor.visitPre(this )
214: && documentIterator.acceptOnTruePaths(visitor)
215: && visitor.visitPost(this );
216: }
217:
218: public String toString() {
219: return this .getClass().getSimpleName() + "(" + documentIterator
220: + ", " + threshold + ")";
221: }
222:
223: /** An interval iterator returning just the interval shorter than {@link #threshold}. */
224:
225: private class LowPassIntervalIterator extends
226: AbstractIntervalIterator implements IntervalIterator {
227: /** The index of this iterator. */
228: final Index index;
229: /** The underlying interal iterator. */
230: private IntervalIterator intervalIterator;
231:
232: public LowPassIntervalIterator(final Index index) {
233: this .index = index;
234: }
235:
236: public void reset() throws IOException {
237: next = null;
238: intervalIterator = documentIterator.intervalIterator(index);
239: }
240:
241: public void intervalTerms(final IntSet terms) {
242: // Just delegate to the filtered iterator
243: intervalIterator.intervalTerms(terms);
244: }
245:
246: public Interval nextInterval() throws IOException {
247: if (next != null) {
248: final Interval result = next;
249: next = null;
250: return result;
251: }
252:
253: Interval result;
254: while ((result = intervalIterator.nextInterval()) != null
255: && result.length() > threshold)
256: ;
257: return result;
258: }
259:
260: public int extent() {
261: return Math.min(intervalIterator.extent(), threshold);
262: }
263:
264: public String toString() {
265: return getClass().getSimpleName() + "(" + intervalIterator
266: + ", " + threshold + ")";
267: }
268: }
269: }
|