001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.util.ConcurrentModificationException;
022: import java.util.Vector;
023: import java.util.Iterator;
024:
025: import org.apache.lucene.document.Document;
026: import org.apache.lucene.index.CorruptIndexException;
027:
028: /** A ranked list of documents, used to hold search results.
029: * <p>
030: * <b>Caution:</b> Iterate only over the hits needed. Iterating over all
031: * hits is generally not desirable and may be the source of
032: * performance issues. If you need to iterate over many or all hits, consider
033: * using the search method that takes a {@link HitCollector}.
034: * </p>
035: * <p><b>Note:</b> Deleting matching documents concurrently with traversing
036: * the hits, might, when deleting hits that were not yet retrieved, decrease
037: * {@link #length()}. In such case,
038: * {@link java.util.ConcurrentModificationException ConcurrentModificationException}
039: * is thrown when accessing hit <code>n</code> ≥ current_{@link #length()}
040: * (but <code>n</code> < {@link #length()}_at_start).
041: */
042: public final class Hits {
043: private Weight weight;
044: private Searcher searcher;
045: private Filter filter = null;
046: private Sort sort = null;
047:
048: private int length; // the total number of hits
049: private Vector hitDocs = new Vector(); // cache of hits retrieved
050:
051: private HitDoc first; // head of LRU cache
052: private HitDoc last; // tail of LRU cache
053: private int numDocs = 0; // number cached
054: private int maxDocs = 200; // max to cache
055:
056: private int nDeletions; // # deleted docs in the index.
057: private int lengthAtStart; // this is the number apps usually count on (although deletions can bring it down).
058: private int nDeletedHits = 0; // # of already collected hits that were meanwhile deleted.
059:
060: boolean debugCheckedForDeletions = false; // for test purposes.
061:
062: Hits(Searcher s, Query q, Filter f) throws IOException {
063: weight = q.weight(s);
064: searcher = s;
065: filter = f;
066: nDeletions = countDeletions(s);
067: getMoreDocs(50); // retrieve 100 initially
068: lengthAtStart = length;
069: }
070:
071: Hits(Searcher s, Query q, Filter f, Sort o) throws IOException {
072: weight = q.weight(s);
073: searcher = s;
074: filter = f;
075: sort = o;
076: nDeletions = countDeletions(s);
077: getMoreDocs(50); // retrieve 100 initially
078: lengthAtStart = length;
079: }
080:
081: // count # deletions, return -1 if unknown.
082: private int countDeletions(Searcher s) throws IOException {
083: int cnt = -1;
084: if (s instanceof IndexSearcher) {
085: cnt = s.maxDoc()
086: - ((IndexSearcher) s).getIndexReader().numDocs();
087: }
088: return cnt;
089: }
090:
091: /**
092: * Tries to add new documents to hitDocs.
093: * Ensures that the hit numbered <code>min</code> has been retrieved.
094: */
095: private final void getMoreDocs(int min) throws IOException {
096: if (hitDocs.size() > min) {
097: min = hitDocs.size();
098: }
099:
100: int n = min * 2; // double # retrieved
101: TopDocs topDocs = (sort == null) ? searcher.search(weight,
102: filter, n) : searcher.search(weight, filter, n, sort);
103:
104: length = topDocs.totalHits;
105: ScoreDoc[] scoreDocs = topDocs.scoreDocs;
106:
107: float scoreNorm = 1.0f;
108:
109: if (length > 0 && topDocs.getMaxScore() > 1.0f) {
110: scoreNorm = 1.0f / topDocs.getMaxScore();
111: }
112:
113: int start = hitDocs.size() - nDeletedHits;
114:
115: // any new deletions?
116: int nDels2 = countDeletions(searcher);
117: debugCheckedForDeletions = false;
118: if (nDeletions < 0 || nDels2 > nDeletions) {
119: // either we cannot count deletions, or some "previously valid hits" might have been deleted, so find exact start point
120: nDeletedHits = 0;
121: debugCheckedForDeletions = true;
122: int i2 = 0;
123: for (int i1 = 0; i1 < hitDocs.size()
124: && i2 < scoreDocs.length; i1++) {
125: int id1 = ((HitDoc) hitDocs.get(i1)).id;
126: int id2 = scoreDocs[i2].doc;
127: if (id1 == id2) {
128: i2++;
129: } else {
130: nDeletedHits++;
131: }
132: }
133: start = i2;
134: }
135:
136: int end = scoreDocs.length < length ? scoreDocs.length : length;
137: length += nDeletedHits;
138: for (int i = start; i < end; i++) {
139: hitDocs.addElement(new HitDoc(scoreDocs[i].score
140: * scoreNorm, scoreDocs[i].doc));
141: }
142:
143: nDeletions = nDels2;
144: }
145:
146: /** Returns the total number of hits available in this set. */
147: public final int length() {
148: return length;
149: }
150:
151: /** Returns the stored fields of the n<sup>th</sup> document in this set.
152: * <p>Documents are cached, so that repeated requests for the same element may
153: * return the same Document object.
154: * @throws CorruptIndexException if the index is corrupt
155: * @throws IOException if there is a low-level IO error
156: */
157: public final Document doc(int n) throws CorruptIndexException,
158: IOException {
159: HitDoc hitDoc = hitDoc(n);
160:
161: // Update LRU cache of documents
162: remove(hitDoc); // remove from list, if there
163: addToFront(hitDoc); // add to front of list
164: if (numDocs > maxDocs) { // if cache is full
165: HitDoc oldLast = last;
166: remove(last); // flush last
167: oldLast.doc = null; // let doc get gc'd
168: }
169:
170: if (hitDoc.doc == null) {
171: hitDoc.doc = searcher.doc(hitDoc.id); // cache miss: read document
172: }
173:
174: return hitDoc.doc;
175: }
176:
177: /** Returns the score for the n<sup>th</sup> document in this set. */
178: public final float score(int n) throws IOException {
179: return hitDoc(n).score;
180: }
181:
182: /** Returns the id for the n<sup>th</sup> document in this set.
183: * Note that ids may change when the index changes, so you cannot
184: * rely on the id to be stable.
185: */
186: public final int id(int n) throws IOException {
187: return hitDoc(n).id;
188: }
189:
190: /**
191: * Returns a {@link HitIterator} to navigate the Hits. Each item returned
192: * from {@link Iterator#next()} is a {@link Hit}.
193: * <p>
194: * <b>Caution:</b> Iterate only over the hits needed. Iterating over all
195: * hits is generally not desirable and may be the source of
196: * performance issues. If you need to iterate over many or all hits, consider
197: * using a search method that takes a {@link HitCollector}.
198: * </p>
199: */
200: public Iterator iterator() {
201: return new HitIterator(this );
202: }
203:
204: private final HitDoc hitDoc(int n) throws IOException {
205: if (n >= lengthAtStart) {
206: throw new IndexOutOfBoundsException(
207: "Not a valid hit number: " + n);
208: }
209:
210: if (n >= hitDocs.size()) {
211: getMoreDocs(n);
212: }
213:
214: if (n >= length) {
215: throw new ConcurrentModificationException(
216: "Not a valid hit number: " + n);
217: }
218:
219: return (HitDoc) hitDocs.elementAt(n);
220: }
221:
222: private final void addToFront(HitDoc hitDoc) { // insert at front of cache
223: if (first == null) {
224: last = hitDoc;
225: } else {
226: first.prev = hitDoc;
227: }
228:
229: hitDoc.next = first;
230: first = hitDoc;
231: hitDoc.prev = null;
232:
233: numDocs++;
234: }
235:
236: private final void remove(HitDoc hitDoc) { // remove from cache
237: if (hitDoc.doc == null) { // it's not in the list
238: return; // abort
239: }
240:
241: if (hitDoc.next == null) {
242: last = hitDoc.prev;
243: } else {
244: hitDoc.next.prev = hitDoc.prev;
245: }
246:
247: if (hitDoc.prev == null) {
248: first = hitDoc.next;
249: } else {
250: hitDoc.prev.next = hitDoc.next;
251: }
252:
253: numDocs--;
254: }
255: }
256:
257: final class HitDoc {
258: float score;
259: int id;
260: Document doc = null;
261:
262: HitDoc next; // in doubly-linked cache
263: HitDoc prev; // in doubly-linked cache
264:
265: HitDoc(float s, int i) {
266: score = s;
267: id = i;
268: }
269: }
|