001: package it.unimi.dsi.mg4j.search;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.IndirectPriorityQueue;
025: import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
026: import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
027: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
028: import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
029: import it.unimi.dsi.mg4j.index.Index;
030: import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;
031:
032: import java.io.IOException;
033: import java.util.Iterator;
034:
035: /** A document iterator on documents, generating the union of the documents returned
036: * by a number of document iterators.
037: *
038: * <p>The pattern of this class is the same as that of {@link AbstractIntersectionDocumentIterator}.
039: * Additionally, this class provides a mechanism that makes accessible the set of component
040: * document iterators that are {@linkplain #computeFront() positioned on the current document}.
041: */
042:
043: public abstract class AbstractUnionDocumentIterator extends
044: AbstractCompositeDocumentIterator {
045: private final static boolean DEBUG = false;
046: //private final static boolean ASSERTS = false;
047:
048: /** A heap-based semi-indirect priority queue used to keep track of the currently scanned integers. */
049: final protected IntHeapSemiIndirectPriorityQueue queue;
050: /** The {@link IndirectPriorityQueue#front(int[])} of {@link #queue}, if {@link #frontSize} is not -1. */
051: final protected int[] front;
052: /** The reference array used for the queue. */
053: final protected int[] curr;
054: /** A map from indices to interval iterators. */
055: final private Reference2ReferenceArrayMap<Index, IntervalIterator> intervalIterators;
056: /** A map from indices to the iterators returned for the current document. The key set may
057: * not contain an index because the related iterator has never been requested. Moreover,
058: * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
059: * because it could be {@link IntervalIterators#TRUE} (in fact, in that case it may even
060: * happen that {@link #intervalIterators} does not contain the index). */
061: final private Reference2ReferenceArrayMap<Index, IntervalIterator> currentIterators;
062: /** An unmodifiable wrapper around {@link #currentIterators}. */
063: final private Reference2ReferenceMap<Index, IntervalIterator> unmodifiableCurrentIterators;
064:
065: /** The number of valid entries in {@link #front}, or -1 if the front has not been computed for the current document. */
066: protected int frontSize = -1;
067:
068: /** Creates a new document iterator that computes the OR of the given array of iterators.
069: * @param documentIterator the iterators to be joined.
070: * @throws IOException
071: */
072: protected AbstractUnionDocumentIterator(
073: final DocumentIterator... documentIterator)
074: throws IOException {
075: super (documentIterator);
076: this .curr = new int[n];
077:
078: queue = new IntHeapSemiIndirectPriorityQueue(curr);
079:
080: intervalIterators = new Reference2ReferenceArrayMap<Index, IntervalIterator>(
081: indices.size());
082: currentIterators = new Reference2ReferenceArrayMap<Index, IntervalIterator>(
083: indices.size());
084: unmodifiableCurrentIterators = Reference2ReferenceMaps
085: .unmodifiable(currentIterators);
086:
087: // Only add to the queue nonempty iterators...
088: for (int i = 0; i < n; i++)
089: if ((curr[i] = documentIterator[i].nextDocument()) != -1)
090: queue.enqueue(i);
091: // If queue is empty, the process is over
092:
093: if (!queue.isEmpty())
094: next = curr[queue.first()];
095: front = new int[queue.size()];
096: }
097:
098: public int skipTo(final int n) throws IOException {
099: if (last >= n)
100: return last;
101: if (queue.isEmpty())
102: return Integer.MAX_VALUE;
103: last = next = -1;
104: currentIterators.clear();
105: frontSize = -1; // Invalidate front
106:
107: int first, res;
108: while (curr[first = queue.first()] < n) {
109: res = documentIterator[first].skipTo(n);
110:
111: // Cannot advance the minimum
112: if (res == Integer.MAX_VALUE) {
113: // Remove it
114: queue.dequeue();
115: // If nothing else remains, we are done
116: if (queue.isEmpty())
117: return Integer.MAX_VALUE;
118: } else {
119: // Advance the top element, and signal this fact to the queue
120: curr[first] = res;
121: queue.changed();
122: }
123: }
124:
125: return last = curr[first];
126: }
127:
128: public int nextDocument() throws IOException {
129: if (next != -1) {
130: // We already know what to return
131: last = next;
132: next = -1;
133: return last;
134: }
135:
136: currentIterators.clear();
137: frontSize = -1; // Invalidate front
138: if (queue.isEmpty())
139: return last = -1;
140:
141: // The least element
142: int first, c = curr[queue.first()];
143:
144: // Advance all elements equal to the least one
145: while (curr[first = queue.first()] == c) {
146: if ((curr[first] = documentIterator[first].nextDocument()) != -1)
147: queue.changed();
148: else {
149: // Remove it
150: queue.dequeue();
151: // If nothing else remains, we are done
152: if (queue.isEmpty())
153: return last = -1;
154: }
155: }
156:
157: return last = curr[first];
158: }
159:
160: /** Forces computation of the current front, returning the number of indices it contains.
161: *
162: * <p>After a call to this method,
163: * the first elements of {@link #front} contain
164: * the indices of the {@linkplain AbstractCompositeDocumentIterator#documentIterator component document iterators}
165: * that are positioned on the current document. If the front has already been
166: * computed for the current document, this method has no side effects.
167: *
168: * @return the size of the current front (the number of valid entries in {@link #front}).
169: */
170:
171: protected int computeFront() {
172: if (frontSize == -1)
173: frontSize = queue.front(front);
174: return frontSize;
175: }
176:
177: public Reference2ReferenceMap<Index, IntervalIterator> intervalIterators()
178: throws IOException {
179: final Iterator<Index> i = indices.iterator();
180: while (i.hasNext())
181: intervalIterator(i.next());
182: return unmodifiableCurrentIterators;
183: }
184:
185: public IntervalIterator intervalIterator(final Index index)
186: throws IOException {
187: if (DEBUG)
188: System.err.println(this + ".intervalIterator(" + index
189: + ")");
190: if (last == -1)
191: throw new IllegalStateException();
192: if (!indices.contains(index))
193: return IntervalIterators.TRUE;
194:
195: IntervalIterator intervalIterator;
196:
197: // If the iterator has been created and it's ready, we just return it.
198: if ((intervalIterator = currentIterators.get(index)) != null)
199: return intervalIterator;
200:
201: boolean atLeastOneIsTrue = false;
202:
203: /* If all iterators are TRUE, we return TRUE. Otherwise,
204: * we skip all FALSE iterators and all iterators whose document
205: * is not last, and merge the remaining ones. If
206: * all iterators are FALSE, we return FALSE.
207: */
208:
209: // ALERT: this logic should be in OrDocumentIterator
210: int c = 0; // This counts non-TRUE, non-FALSE interval iterators
211: IntervalIterator oneNotTrue = null;
212: for (int i = computeFront(); i-- != 0;) {
213: intervalIterator = documentIterator[front[i]]
214: .intervalIterator(index);
215: if (intervalIterator == IntervalIterators.TRUE)
216: atLeastOneIsTrue = true;
217: else {
218: oneNotTrue = intervalIterator;
219: if (intervalIterator != IntervalIterators.FALSE)
220: c++;
221: }
222: }
223:
224: // If c == 0, all component iterators are either TRUE or FALSE. We return TRUE if at least one is TRUE.
225: if (c == 0)
226: intervalIterator = atLeastOneIsTrue ? IntervalIterators.TRUE
227: : IntervalIterators.FALSE;
228: else if (c == 1)
229: intervalIterator = oneNotTrue; // If we exclude TRUE and FALSE iterators, just oneNotTrue remains: we pass it upwards.
230: else {
231: intervalIterator = intervalIterators.get(index);
232: if (intervalIterator == null)
233: intervalIterators
234: .put(
235: index,
236: intervalIterator = getComposedIntervalIterator(index));
237: intervalIterator.reset();
238: }
239:
240: currentIterators.put(index, intervalIterator);
241: return intervalIterator;
242: }
243:
244: abstract protected IntervalIterator getComposedIntervalIterator(
245: Index index);
246:
247: /** Invokes {@link #acceptOnTruePaths(DocumentIteratorVisitor)} only on component
248: * iterators positioned on the current document.
249: *
250: * @param visitor a visitor.
251: * @return true if the visit should continue.
252: * @throws IOException
253: */
254:
255: @Override
256: public boolean acceptOnTruePaths(DocumentIteratorVisitor visitor)
257: throws IOException {
258: if (!visitor.visitPre(this ))
259: return false;
260: final int s = computeFront();
261: for (int i = 0; i < s; i++)
262: if (!documentIterator[front[i]].acceptOnTruePaths(visitor))
263: return false;
264: return visitor.visitPost(this);
265: }
266: }
|