001: package it.unimi.dsi.mg4j.search.visitor;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import java.io.IOException;
025:
026: import it.unimi.dsi.fastutil.Hash;
027: import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
028: import it.unimi.dsi.fastutil.objects.Object2IntMap;
029: import it.unimi.dsi.fastutil.objects.Reference2IntLinkedOpenHashMap;
030: import it.unimi.dsi.fastutil.objects.Reference2IntMap;
031: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
032: import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
033: import it.unimi.dsi.mg4j.index.Index;
034: import it.unimi.dsi.mg4j.index.IndexIterator;
035: import it.unimi.dsi.mg4j.search.DocumentIterator;
036:
037: import org.apache.log4j.Logger;
038:
039: /** A visitor collecting information about terms appearing
040: * in a {@link it.unimi.dsi.mg4j.search.DocumentIterator}.
041: *
042: * <P>The purpose of this visitor is that of exploring before iteration the structure
043: * of a {@link DocumentIterator} to count how many terms are actually used, and set up some
044: * preliminary access data. More precisely, we count the distinct pairs index/term
045: * appearing in all leaves of nonzero frequency (the latter
046: * condition is used to skip empty iterators). For this visitor to work, all leaves
047: * of nonzero frequency must return a non-<code>null</code> value on
048: * a call to {@link it.unimi.dsi.mg4j.index.IndexIterator#term()}.
049: *
050: * <p>During the visit, we keep track of which index/term pair have been already
051: * seen. Each pair is assigned an distinct <em>offset</em>—a number between
052: * zero and the overall number of distinct pairs—which is stored into
053: * each index iterator {@linkplain it.unimi.dsi.mg4j.index.IndexIterator#id() id}
054: * and is used afterwards to access quickly data about the pair. Note that duplicate index/term pairs
055: * get the same offset. The overall number of distinct pairs is returned
056: * by {@link #numberOfPairs()} after a visit.
057: *
058: * <p>During the visit, the indices actually appearing in some nonzero-frequency
059: * leaf are gathered; they are accessible as a vector returned
060: * by {@link #indices()}, and the map from positions in this vector to indices
061: * is inverted by {@link #indexMap()}.
062: *
063: * <p>The offset assigned to each pair index/term
064: * is returned by {@link #offset(Index, String)}. Should you need to know the terms
065: * associated to each index, they are returned by {@link #terms(Index)}.
066: *
067: * <p>The after a term collection, usually counters are set
068: * up by a visit of {@link it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor}.
069: */
070:
071: public class TermCollectionVisitor extends
072: AbstractDocumentIteratorVisitor {
073: private final static Logger LOGGER = Logger
074: .getLogger(TermCollectionVisitor.class);
075: private final static boolean DEBUG = false;
076:
077: /** The map from indices to maps from terms to offsets. The map themselves are linked,
078: * so terms are always returned in the same order (the visit order). */
079: private final Reference2ObjectMap<Index, Object2IntMap<String>> index2termMap;
080: /** The overall number of pairs index/term. */
081: private int numberOfPairs;
082: /** The array of indices involved in this query, returned by {@link #indices()}. */
083: private Index[] index;
084: /** A map from indices to positions in {@link #index}. */
085: private Reference2IntMap<Index> indexMap;
086:
087: /** Creates a new term-collection visitor. */
088:
089: public TermCollectionVisitor() {
090: index2termMap = new Reference2ObjectOpenHashMap<Index, Object2IntMap<String>>();
091: indexMap = new Reference2IntLinkedOpenHashMap<Index>(
092: Hash.DEFAULT_INITIAL_SIZE, .5f);
093: }
094:
095: public TermCollectionVisitor prepare() {
096: index = null;
097: index2termMap.clear();
098: indexMap.clear();
099: numberOfPairs = 0;
100: return this ;
101: }
102:
103: public boolean visit(final IndexIterator indexIterator)
104: throws IOException {
105: // TODO: the second condition should be checked elsewhere, maybe...
106: if (indexIterator.frequency() > 0
107: && indexIterator.index().hasCounts) { // We skip empty iterators and indices without counts
108: final Index index = indexIterator.index();
109: final String term = indexIterator.term();
110:
111: if (term == null)
112: throw new NullPointerException(
113: "This visitor needs a non-null term for each index iterator of nonzero frequency");
114:
115: if (DEBUG)
116: LOGGER.debug("Visiting leaf: index=" + index
117: + ", term=" + term);
118:
119: final Object2IntMap<String> termMap;
120:
121: if (!indexMap.containsKey(index)) {
122: // This index has never been seen before
123: indexMap.put(index, indexMap.size());
124: // Lazy instantiation of the term map
125: index2termMap
126: .put(
127: index,
128: termMap = new Object2IntLinkedOpenHashMap<String>(
129: Hash.DEFAULT_INITIAL_SIZE, .5f));
130: termMap.defaultReturnValue(-1);
131: } else
132: termMap = index2termMap.get(index);
133:
134: int offset = termMap.getInt(term);
135: if (offset == -1)
136: termMap.put(term, offset = numberOfPairs++); // Unknown index/term pair
137: indexIterator.id(offset);
138: if (DEBUG)
139: LOGGER.debug("Offset for index iterator "
140: + indexIterator + ": " + offset);
141: }
142: return true;
143: }
144:
145: /** Returns the number of distinct index/term pair corresponding to
146: * nonzero-frequency index iterators in the last visit.
147: *
148: * @return the number distinct index/term pair corresponding to
149: * nonzero-frequency index iterators.
150: */
151: public int numberOfPairs() {
152: return numberOfPairs;
153: }
154:
155: /** Returns the indices met during pair collection.
156: *
157: * <p>Note that the returned array does not include indices only associated
158: * to index iterators of zero frequency.
159: *
160: * @return the indices met during term collection.
161: */
162: public Index[] indices() {
163: if (index == null)
164: index = indexMap.keySet().toArray(
165: new Index[index2termMap.size()]);
166: return index;
167: }
168:
169: /** Returns a map from indices met during term collection to their position
170: * into {@link #indices()}.
171: *
172: * <p>Note that the returned array does not include indices only associated
173: * to index iterators of zero frequency.
174: *
175: * @return a map from indices met during term collection to their position
176: * into {@link #indices()}.
177: */
178: public Reference2IntMap<Index> indexMap() {
179: return indexMap;
180: }
181:
182: /** Returns the terms associated to the given index.
183: *
184: * @param index an index.
185: * @return the terms associated to <code>index</code>, in the same order in which
186: * they appeared during the visit, skipping duplicates, if some nonzero-frequency iterator
187: * based on <code>index</code> was found; <code>null</code> otherwise.
188: */
189: public String[] terms(final Index index) {
190: final Object2IntMap<String> termMap = index2termMap.get(index);
191: return termMap == null ? null : termMap.keySet().toArray(
192: new String[termMap.size()]);
193: }
194:
195: /** Returns the offset associated to a given pair index/term.
196: *
197: * @param index an index appearing in {@link #indices()}.
198: * @param term a term appearing in the array returned by {@link #terms(Index)} with argument <code>index</code>.
199: * @return the offset associated to the pair <code>index</code>/<code>term</code>.
200: */
201:
202: public int offset(final Index index, final String term) {
203: return index2termMap.get(index).getInt(term);
204: }
205:
206: public String toString() {
207: return "[Leaves: " + numberOfPairs + "; " + index2termMap + "]";
208: }
209: }
|