001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.util.LuceneTestCase;
021: import org.apache.lucene.analysis.KeywordAnalyzer;
022: import org.apache.lucene.analysis.standard.StandardAnalyzer;
023: import org.apache.lucene.document.Document;
024: import org.apache.lucene.document.Field;
025: import org.apache.lucene.document.SetBasedFieldSelector;
026: import org.apache.lucene.index.IndexReader;
027: import org.apache.lucene.index.IndexWriter;
028: import org.apache.lucene.index.Term;
029: import org.apache.lucene.queryParser.QueryParser;
030: import org.apache.lucene.store.Directory;
031: import org.apache.lucene.store.RAMDirectory;
032: import org.apache.lucene.store.MockRAMDirectory;
033:
034: import java.io.IOException;
035: import java.util.Collections;
036: import java.util.HashSet;
037: import java.util.Set;
038:
039: /**
040: * Tests {@link MultiSearcher} class.
041: *
042: * @version $Id: TestMultiSearcher.java 583534 2007-10-10 16:46:35Z mikemccand $
043: */
044: public class TestMultiSearcher extends LuceneTestCase {
045: public TestMultiSearcher(String name) {
046: super (name);
047: }
048:
049: /**
050: * ReturnS a new instance of the concrete MultiSearcher class
051: * used in this test.
052: */
053: protected MultiSearcher getMultiSearcherInstance(
054: Searcher[] searchers) throws IOException {
055: return new MultiSearcher(searchers);
056: }
057:
058: public void testEmptyIndex() throws Exception {
059: // creating two directories for indices
060: Directory indexStoreA = new MockRAMDirectory();
061: Directory indexStoreB = new MockRAMDirectory();
062:
063: // creating a document to store
064: Document lDoc = new Document();
065: lDoc.add(new Field("fulltext", "Once upon a time.....",
066: Field.Store.YES, Field.Index.TOKENIZED));
067: lDoc.add(new Field("id", "doc1", Field.Store.YES,
068: Field.Index.UN_TOKENIZED));
069: lDoc.add(new Field("handle", "1", Field.Store.YES,
070: Field.Index.UN_TOKENIZED));
071:
072: // creating a document to store
073: Document lDoc2 = new Document();
074: lDoc2.add(new Field("fulltext",
075: "in a galaxy far far away.....", Field.Store.YES,
076: Field.Index.TOKENIZED));
077: lDoc2.add(new Field("id", "doc2", Field.Store.YES,
078: Field.Index.UN_TOKENIZED));
079: lDoc2.add(new Field("handle", "1", Field.Store.YES,
080: Field.Index.UN_TOKENIZED));
081:
082: // creating a document to store
083: Document lDoc3 = new Document();
084: lDoc3.add(new Field("fulltext",
085: "a bizarre bug manifested itself....", Field.Store.YES,
086: Field.Index.TOKENIZED));
087: lDoc3.add(new Field("id", "doc3", Field.Store.YES,
088: Field.Index.UN_TOKENIZED));
089: lDoc3.add(new Field("handle", "1", Field.Store.YES,
090: Field.Index.UN_TOKENIZED));
091:
092: // creating an index writer for the first index
093: IndexWriter writerA = new IndexWriter(indexStoreA,
094: new StandardAnalyzer(), true);
095: // creating an index writer for the second index, but writing nothing
096: IndexWriter writerB = new IndexWriter(indexStoreB,
097: new StandardAnalyzer(), true);
098:
099: //--------------------------------------------------------------------
100: // scenario 1
101: //--------------------------------------------------------------------
102:
103: // writing the documents to the first index
104: writerA.addDocument(lDoc);
105: writerA.addDocument(lDoc2);
106: writerA.addDocument(lDoc3);
107: writerA.optimize();
108: writerA.close();
109:
110: // closing the second index
111: writerB.close();
112:
113: // creating the query
114: QueryParser parser = new QueryParser("fulltext",
115: new StandardAnalyzer());
116: Query query = parser.parse("handle:1");
117:
118: // building the searchables
119: Searcher[] searchers = new Searcher[2];
120: // VITAL STEP:adding the searcher for the empty index first, before the searcher for the populated index
121: searchers[0] = new IndexSearcher(indexStoreB);
122: searchers[1] = new IndexSearcher(indexStoreA);
123: // creating the multiSearcher
124: Searcher mSearcher = getMultiSearcherInstance(searchers);
125: // performing the search
126: Hits hits = mSearcher.search(query);
127:
128: assertEquals(3, hits.length());
129:
130: // iterating over the hit documents
131: for (int i = 0; i < hits.length(); i++) {
132: Document d = hits.doc(i);
133: }
134: mSearcher.close();
135:
136: //--------------------------------------------------------------------
137: // scenario 2
138: //--------------------------------------------------------------------
139:
140: // adding one document to the empty index
141: writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(),
142: false);
143: writerB.addDocument(lDoc);
144: writerB.optimize();
145: writerB.close();
146:
147: // building the searchables
148: Searcher[] searchers2 = new Searcher[2];
149: // VITAL STEP:adding the searcher for the empty index first, before the searcher for the populated index
150: searchers2[0] = new IndexSearcher(indexStoreB);
151: searchers2[1] = new IndexSearcher(indexStoreA);
152: // creating the mulitSearcher
153: MultiSearcher mSearcher2 = getMultiSearcherInstance(searchers2);
154: // performing the same search
155: Hits hits2 = mSearcher2.search(query);
156:
157: assertEquals(4, hits2.length());
158:
159: // iterating over the hit documents
160: for (int i = 0; i < hits2.length(); i++) {
161: // no exception should happen at this point
162: Document d = hits2.doc(i);
163: }
164:
165: // test the subSearcher() method:
166: Query subSearcherQuery = parser.parse("id:doc1");
167: hits2 = mSearcher2.search(subSearcherQuery);
168: assertEquals(2, hits2.length());
169: assertEquals(0, mSearcher2.subSearcher(hits2.id(0))); // hit from searchers2[0]
170: assertEquals(1, mSearcher2.subSearcher(hits2.id(1))); // hit from searchers2[1]
171: subSearcherQuery = parser.parse("id:doc2");
172: hits2 = mSearcher2.search(subSearcherQuery);
173: assertEquals(1, hits2.length());
174: assertEquals(1, mSearcher2.subSearcher(hits2.id(0))); // hit from searchers2[1]
175: mSearcher2.close();
176:
177: //--------------------------------------------------------------------
178: // scenario 3
179: //--------------------------------------------------------------------
180:
181: // deleting the document just added, this will cause a different exception to take place
182: Term term = new Term("id", "doc1");
183: IndexReader readerB = IndexReader.open(indexStoreB);
184: readerB.deleteDocuments(term);
185: readerB.close();
186:
187: // optimizing the index with the writer
188: writerB = new IndexWriter(indexStoreB, new StandardAnalyzer(),
189: false);
190: writerB.optimize();
191: writerB.close();
192:
193: // building the searchables
194: Searcher[] searchers3 = new Searcher[2];
195:
196: searchers3[0] = new IndexSearcher(indexStoreB);
197: searchers3[1] = new IndexSearcher(indexStoreA);
198: // creating the mulitSearcher
199: Searcher mSearcher3 = getMultiSearcherInstance(searchers3);
200: // performing the same search
201: Hits hits3 = mSearcher3.search(query);
202:
203: assertEquals(3, hits3.length());
204:
205: // iterating over the hit documents
206: for (int i = 0; i < hits3.length(); i++) {
207: Document d = hits3.doc(i);
208: }
209: mSearcher3.close();
210: indexStoreA.close();
211: indexStoreB.close();
212: }
213:
214: private static Document createDocument(String contents1,
215: String contents2) {
216: Document document = new Document();
217:
218: document.add(new Field("contents", contents1, Field.Store.YES,
219: Field.Index.UN_TOKENIZED));
220: document.add(new Field("other", "other contents",
221: Field.Store.YES, Field.Index.UN_TOKENIZED));
222: if (contents2 != null) {
223: document.add(new Field("contents", contents2,
224: Field.Store.YES, Field.Index.UN_TOKENIZED));
225: }
226:
227: return document;
228: }
229:
230: private static void initIndex(Directory directory, int nDocs,
231: boolean create, String contents2) throws IOException {
232: IndexWriter indexWriter = null;
233:
234: try {
235: indexWriter = new IndexWriter(directory,
236: new KeywordAnalyzer(), create);
237:
238: for (int i = 0; i < nDocs; i++) {
239: indexWriter.addDocument(createDocument("doc" + i,
240: contents2));
241: }
242: } finally {
243: if (indexWriter != null) {
244: indexWriter.close();
245: }
246: }
247: }
248:
249: public void testFieldSelector() throws Exception {
250: RAMDirectory ramDirectory1, ramDirectory2;
251: IndexSearcher indexSearcher1, indexSearcher2;
252:
253: ramDirectory1 = new RAMDirectory();
254: ramDirectory2 = new RAMDirectory();
255: Query query = new TermQuery(new Term("contents", "doc0"));
256:
257: // Now put the documents in a different index
258: initIndex(ramDirectory1, 10, true, null); // documents with a single token "doc0", "doc1", etc...
259: initIndex(ramDirectory2, 10, true, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc...
260:
261: indexSearcher1 = new IndexSearcher(ramDirectory1);
262: indexSearcher2 = new IndexSearcher(ramDirectory2);
263:
264: MultiSearcher searcher = getMultiSearcherInstance(new Searcher[] {
265: indexSearcher1, indexSearcher2 });
266: assertTrue("searcher is null and it shouldn't be",
267: searcher != null);
268: Hits hits = searcher.search(query);
269: assertTrue("hits is null and it shouldn't be", hits != null);
270: assertTrue(hits.length() + " does not equal: " + 2, hits
271: .length() == 2);
272: Document document = searcher.doc(hits.id(0));
273: assertTrue("document is null and it shouldn't be",
274: document != null);
275: assertTrue("document.getFields() Size: "
276: + document.getFields().size() + " is not: " + 2,
277: document.getFields().size() == 2);
278: //Should be one document from each directory
279: //they both have two fields, contents and other
280: Set ftl = new HashSet();
281: ftl.add("other");
282: SetBasedFieldSelector fs = new SetBasedFieldSelector(ftl,
283: Collections.EMPTY_SET);
284: document = searcher.doc(hits.id(0), fs);
285: assertTrue("document is null and it shouldn't be",
286: document != null);
287: assertTrue("document.getFields() Size: "
288: + document.getFields().size() + " is not: " + 1,
289: document.getFields().size() == 1);
290: String value = document.get("contents");
291: assertTrue("value is not null and it should be", value == null);
292: value = document.get("other");
293: assertTrue("value is null and it shouldn't be", value != null);
294: ftl.clear();
295: ftl.add("contents");
296: fs = new SetBasedFieldSelector(ftl, Collections.EMPTY_SET);
297: document = searcher.doc(hits.id(1), fs);
298: value = document.get("contents");
299: assertTrue("value is null and it shouldn't be", value != null);
300: value = document.get("other");
301: assertTrue("value is not null and it should be", value == null);
302: }
303:
304: /* uncomment this when the highest score is always normalized to 1.0, even when it was < 1.0
305: public void testNormalization1() throws IOException {
306: testNormalization(1, "Using 1 document per index:");
307: }
308: */
309:
310: public void testNormalization10() throws IOException {
311: testNormalization(10, "Using 10 documents per index:");
312: }
313:
314: private void testNormalization(int nDocs, String message)
315: throws IOException {
316: Query query = new TermQuery(new Term("contents", "doc0"));
317:
318: RAMDirectory ramDirectory1;
319: IndexSearcher indexSearcher1;
320: Hits hits;
321:
322: ramDirectory1 = new MockRAMDirectory();
323:
324: // First put the documents in the same index
325: initIndex(ramDirectory1, nDocs, true, null); // documents with a single token "doc0", "doc1", etc...
326: initIndex(ramDirectory1, nDocs, false, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc...
327:
328: indexSearcher1 = new IndexSearcher(ramDirectory1);
329:
330: hits = indexSearcher1.search(query);
331:
332: assertEquals(message, 2, hits.length());
333:
334: assertEquals(message, 1, hits.score(0), 1e-6); // hits.score(0) is 0.594535 if only a single document is in first index
335:
336: // Store the scores for use later
337: float[] scores = { hits.score(0), hits.score(1) };
338:
339: assertTrue(message, scores[0] > scores[1]);
340:
341: indexSearcher1.close();
342: ramDirectory1.close();
343: hits = null;
344:
345: RAMDirectory ramDirectory2;
346: IndexSearcher indexSearcher2;
347:
348: ramDirectory1 = new MockRAMDirectory();
349: ramDirectory2 = new MockRAMDirectory();
350:
351: // Now put the documents in a different index
352: initIndex(ramDirectory1, nDocs, true, null); // documents with a single token "doc0", "doc1", etc...
353: initIndex(ramDirectory2, nDocs, true, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc...
354:
355: indexSearcher1 = new IndexSearcher(ramDirectory1);
356: indexSearcher2 = new IndexSearcher(ramDirectory2);
357:
358: Searcher searcher = getMultiSearcherInstance(new Searcher[] {
359: indexSearcher1, indexSearcher2 });
360:
361: hits = searcher.search(query);
362:
363: assertEquals(message, 2, hits.length());
364:
365: // The scores should be the same (within reason)
366: assertEquals(message, scores[0], hits.score(0), 1e-6); // This will a document from ramDirectory1
367: assertEquals(message, scores[1], hits.score(1), 1e-6); // This will a document from ramDirectory2
368:
369: // Adding a Sort.RELEVANCE object should not change anything
370: hits = searcher.search(query, Sort.RELEVANCE);
371:
372: assertEquals(message, 2, hits.length());
373:
374: assertEquals(message, scores[0], hits.score(0), 1e-6); // This will a document from ramDirectory1
375: assertEquals(message, scores[1], hits.score(1), 1e-6); // This will a document from ramDirectory2
376:
377: searcher.close();
378:
379: ramDirectory1.close();
380: ramDirectory2.close();
381: }
382:
383: /**
384: * test that custom similarity is in effect when using MultiSearcher (LUCENE-789).
385: * @throws IOException
386: */
387: public void testCustomSimilarity() throws IOException {
388: RAMDirectory dir = new RAMDirectory();
389: initIndex(dir, 10, true, "x"); // documents with two tokens "doc0" and "x", "doc1" and x, etc...
390: IndexSearcher srchr = new IndexSearcher(dir);
391: MultiSearcher msrchr = getMultiSearcherInstance(new Searcher[] { srchr });
392:
393: Similarity customSimilarity = new DefaultSimilarity() {
394: // overide all
395: public float idf(int docFreq, int numDocs) {
396: return 100.0f;
397: }
398:
399: public float coord(int overlap, int maxOverlap) {
400: return 1.0f;
401: }
402:
403: public float lengthNorm(String fieldName, int numTokens) {
404: return 1.0f;
405: }
406:
407: public float queryNorm(float sumOfSquaredWeights) {
408: return 1.0f;
409: }
410:
411: public float sloppyFreq(int distance) {
412: return 1.0f;
413: }
414:
415: public float tf(float freq) {
416: return 1.0f;
417: }
418: };
419:
420: srchr.setSimilarity(customSimilarity);
421: msrchr.setSimilarity(customSimilarity);
422:
423: Query query = new TermQuery(new Term("contents", "doc0"));
424:
425: // Get a score from IndexSearcher
426: TopDocs topDocs = srchr.search(query, null, 1);
427: float score1 = topDocs.getMaxScore();
428:
429: // Get the score from MultiSearcher
430: topDocs = msrchr.search(query, null, 1);
431: float scoreN = topDocs.getMaxScore();
432:
433: // The scores from the IndexSearcher and Multisearcher should be the same
434: // if the same similarity is used.
435: assertEquals(
436: "MultiSearcher score must be equal to single esrcher score!",
437: score1, scoreN, 1e-6);
438: }
439: }
|