001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.util.LuceneTestCase;
021: import org.apache.lucene.analysis.*;
022: import org.apache.lucene.document.*;
023: import org.apache.lucene.index.IndexWriter;
024: import org.apache.lucene.index.Term;
025: import org.apache.lucene.store.Directory;
026: import org.apache.lucene.store.RAMDirectory;
027:
028: import java.io.IOException;
029: import java.io.Reader;
030:
031: /**
032: * Tests {@link PhraseQuery}.
033: *
034: * @see TestPositionIncrement
035: * @author Erik Hatcher
036: */
037: public class TestPhraseQuery extends LuceneTestCase {
038:
/**
 * threshold for comparing floats; passed as the delta when two scores
 * are asserted to be equal
 */
public static final float SCORE_COMP_THRESH = 1e-6f;

/** Searcher over the index built in setUp(); closed in tearDown(). */
private IndexSearcher searcher;
/** Empty phrase query created by setUp(); tests add terms to it or replace it. */
private PhraseQuery query;
/** In-memory directory holding the index; created in setUp(), closed in tearDown(). */
private RAMDirectory directory;
045:
046: public void setUp() throws Exception {
047: super .setUp();
048: directory = new RAMDirectory();
049: Analyzer analyzer = new Analyzer() {
050: public TokenStream tokenStream(String fieldName,
051: Reader reader) {
052: return new WhitespaceTokenizer(reader);
053: }
054:
055: public int getPositionIncrementGap(String fieldName) {
056: return 100;
057: }
058: };
059: IndexWriter writer = new IndexWriter(directory, analyzer, true);
060:
061: Document doc = new Document();
062: doc.add(new Field("field", "one two three four five",
063: Field.Store.YES, Field.Index.TOKENIZED));
064: doc.add(new Field("repeated",
065: "this is a repeated field - first part",
066: Field.Store.YES, Field.Index.TOKENIZED));
067: Fieldable repeatedField = new Field("repeated",
068: "second part of a repeated field", Field.Store.YES,
069: Field.Index.TOKENIZED);
070: doc.add(repeatedField);
071: doc.add(new Field("palindrome", "one two three two one",
072: Field.Store.YES, Field.Index.TOKENIZED));
073: writer.addDocument(doc);
074:
075: doc = new Document();
076: doc.add(new Field("nonexist",
077: "phrase exist notexist exist found", Field.Store.YES,
078: Field.Index.TOKENIZED));
079: writer.addDocument(doc);
080:
081: doc = new Document();
082: doc.add(new Field("nonexist",
083: "phrase exist notexist exist found", Field.Store.YES,
084: Field.Index.TOKENIZED));
085: writer.addDocument(doc);
086:
087: writer.optimize();
088: writer.close();
089:
090: searcher = new IndexSearcher(directory);
091: query = new PhraseQuery();
092: }
093:
094: public void tearDown() throws Exception {
095: super .tearDown();
096: searcher.close();
097: directory.close();
098: }
099:
100: public void testNotCloseEnough() throws Exception {
101: query.setSlop(2);
102: query.add(new Term("field", "one"));
103: query.add(new Term("field", "five"));
104: Hits hits = searcher.search(query);
105: assertEquals(0, hits.length());
106: QueryUtils.check(query, searcher);
107: }
108:
109: public void testBarelyCloseEnough() throws Exception {
110: query.setSlop(3);
111: query.add(new Term("field", "one"));
112: query.add(new Term("field", "five"));
113: Hits hits = searcher.search(query);
114: assertEquals(1, hits.length());
115: QueryUtils.check(query, searcher);
116: }
117:
118: /**
119: * Ensures slop of 0 works for exact matches, but not reversed
120: */
121: public void testExact() throws Exception {
122: // slop is zero by default
123: query.add(new Term("field", "four"));
124: query.add(new Term("field", "five"));
125: Hits hits = searcher.search(query);
126: assertEquals("exact match", 1, hits.length());
127: QueryUtils.check(query, searcher);
128:
129: query = new PhraseQuery();
130: query.add(new Term("field", "two"));
131: query.add(new Term("field", "one"));
132: hits = searcher.search(query);
133: assertEquals("reverse not exact", 0, hits.length());
134: QueryUtils.check(query, searcher);
135: }
136:
137: public void testSlop1() throws Exception {
138: // Ensures slop of 1 works with terms in order.
139: query.setSlop(1);
140: query.add(new Term("field", "one"));
141: query.add(new Term("field", "two"));
142: Hits hits = searcher.search(query);
143: assertEquals("in order", 1, hits.length());
144: QueryUtils.check(query, searcher);
145:
146: // Ensures slop of 1 does not work for phrases out of order;
147: // must be at least 2.
148: query = new PhraseQuery();
149: query.setSlop(1);
150: query.add(new Term("field", "two"));
151: query.add(new Term("field", "one"));
152: hits = searcher.search(query);
153: assertEquals("reversed, slop not 2 or more", 0, hits.length());
154: QueryUtils.check(query, searcher);
155: }
156:
157: /**
158: * As long as slop is at least 2, terms can be reversed
159: */
160: public void testOrderDoesntMatter() throws Exception {
161: query.setSlop(2); // must be at least two for reverse order match
162: query.add(new Term("field", "two"));
163: query.add(new Term("field", "one"));
164: Hits hits = searcher.search(query);
165: assertEquals("just sloppy enough", 1, hits.length());
166: QueryUtils.check(query, searcher);
167:
168: query = new PhraseQuery();
169: query.setSlop(2);
170: query.add(new Term("field", "three"));
171: query.add(new Term("field", "one"));
172: hits = searcher.search(query);
173: assertEquals("not sloppy enough", 0, hits.length());
174: QueryUtils.check(query, searcher);
175:
176: }
177:
178: /**
179: * slop is the total number of positional moves allowed
180: * to line up a phrase
181: */
182: public void testMulipleTerms() throws Exception {
183: query.setSlop(2);
184: query.add(new Term("field", "one"));
185: query.add(new Term("field", "three"));
186: query.add(new Term("field", "five"));
187: Hits hits = searcher.search(query);
188: assertEquals("two total moves", 1, hits.length());
189: QueryUtils.check(query, searcher);
190:
191: query = new PhraseQuery();
192: query.setSlop(5); // it takes six moves to match this phrase
193: query.add(new Term("field", "five"));
194: query.add(new Term("field", "three"));
195: query.add(new Term("field", "one"));
196: hits = searcher.search(query);
197: assertEquals("slop of 5 not close enough", 0, hits.length());
198: QueryUtils.check(query, searcher);
199:
200: query.setSlop(6);
201: hits = searcher.search(query);
202: assertEquals("slop of 6 just right", 1, hits.length());
203: QueryUtils.check(query, searcher);
204:
205: }
206:
207: public void testPhraseQueryWithStopAnalyzer() throws Exception {
208: RAMDirectory directory = new RAMDirectory();
209: StopAnalyzer stopAnalyzer = new StopAnalyzer();
210: IndexWriter writer = new IndexWriter(directory, stopAnalyzer,
211: true);
212: Document doc = new Document();
213: doc.add(new Field("field", "the stop words are here",
214: Field.Store.YES, Field.Index.TOKENIZED));
215: writer.addDocument(doc);
216: writer.close();
217:
218: IndexSearcher searcher = new IndexSearcher(directory);
219:
220: // valid exact phrase query
221: PhraseQuery query = new PhraseQuery();
222: query.add(new Term("field", "stop"));
223: query.add(new Term("field", "words"));
224: Hits hits = searcher.search(query);
225: assertEquals(1, hits.length());
226: QueryUtils.check(query, searcher);
227:
228: // currently StopAnalyzer does not leave "holes", so this matches.
229: query = new PhraseQuery();
230: query.add(new Term("field", "words"));
231: query.add(new Term("field", "here"));
232: hits = searcher.search(query);
233: assertEquals(1, hits.length());
234: QueryUtils.check(query, searcher);
235:
236: searcher.close();
237: }
238:
239: public void testPhraseQueryInConjunctionScorer() throws Exception {
240: RAMDirectory directory = new RAMDirectory();
241: IndexWriter writer = new IndexWriter(directory,
242: new WhitespaceAnalyzer(), true);
243:
244: Document doc = new Document();
245: doc.add(new Field("source", "marketing info", Field.Store.YES,
246: Field.Index.TOKENIZED));
247: writer.addDocument(doc);
248:
249: doc = new Document();
250: doc.add(new Field("contents", "foobar", Field.Store.YES,
251: Field.Index.TOKENIZED));
252: doc.add(new Field("source", "marketing info", Field.Store.YES,
253: Field.Index.TOKENIZED));
254: writer.addDocument(doc);
255:
256: writer.optimize();
257: writer.close();
258:
259: IndexSearcher searcher = new IndexSearcher(directory);
260:
261: PhraseQuery phraseQuery = new PhraseQuery();
262: phraseQuery.add(new Term("source", "marketing"));
263: phraseQuery.add(new Term("source", "info"));
264: Hits hits = searcher.search(phraseQuery);
265: assertEquals(2, hits.length());
266: QueryUtils.check(phraseQuery, searcher);
267:
268: TermQuery termQuery = new TermQuery(new Term("contents",
269: "foobar"));
270: BooleanQuery booleanQuery = new BooleanQuery();
271: booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
272: booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST);
273: hits = searcher.search(booleanQuery);
274: assertEquals(1, hits.length());
275: QueryUtils.check(termQuery, searcher);
276:
277: searcher.close();
278:
279: writer = new IndexWriter(directory, new WhitespaceAnalyzer(),
280: true);
281: doc = new Document();
282: doc.add(new Field("contents", "map entry woo", Field.Store.YES,
283: Field.Index.TOKENIZED));
284: writer.addDocument(doc);
285:
286: doc = new Document();
287: doc.add(new Field("contents", "woo map entry", Field.Store.YES,
288: Field.Index.TOKENIZED));
289: writer.addDocument(doc);
290:
291: doc = new Document();
292: doc.add(new Field("contents", "map foobarword entry woo",
293: Field.Store.YES, Field.Index.TOKENIZED));
294: writer.addDocument(doc);
295:
296: writer.optimize();
297: writer.close();
298:
299: searcher = new IndexSearcher(directory);
300:
301: termQuery = new TermQuery(new Term("contents", "woo"));
302: phraseQuery = new PhraseQuery();
303: phraseQuery.add(new Term("contents", "map"));
304: phraseQuery.add(new Term("contents", "entry"));
305:
306: hits = searcher.search(termQuery);
307: assertEquals(3, hits.length());
308: hits = searcher.search(phraseQuery);
309: assertEquals(2, hits.length());
310:
311: booleanQuery = new BooleanQuery();
312: booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
313: booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST);
314: hits = searcher.search(booleanQuery);
315: assertEquals(2, hits.length());
316:
317: booleanQuery = new BooleanQuery();
318: booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST);
319: booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
320: hits = searcher.search(booleanQuery);
321: assertEquals(2, hits.length());
322: QueryUtils.check(booleanQuery, searcher);
323:
324: searcher.close();
325: directory.close();
326: }
327:
328: public void testSlopScoring() throws IOException {
329: Directory directory = new RAMDirectory();
330: IndexWriter writer = new IndexWriter(directory,
331: new WhitespaceAnalyzer(), true);
332:
333: Document doc = new Document();
334: doc.add(new Field("field", "foo firstname lastname foo",
335: Field.Store.YES, Field.Index.TOKENIZED));
336: writer.addDocument(doc);
337:
338: Document doc2 = new Document();
339: doc2.add(new Field("field", "foo firstname xxx lastname foo",
340: Field.Store.YES, Field.Index.TOKENIZED));
341: writer.addDocument(doc2);
342:
343: Document doc3 = new Document();
344: doc3.add(new Field("field",
345: "foo firstname xxx yyy lastname foo", Field.Store.YES,
346: Field.Index.TOKENIZED));
347: writer.addDocument(doc3);
348:
349: writer.optimize();
350: writer.close();
351:
352: Searcher searcher = new IndexSearcher(directory);
353: PhraseQuery query = new PhraseQuery();
354: query.add(new Term("field", "firstname"));
355: query.add(new Term("field", "lastname"));
356: query.setSlop(Integer.MAX_VALUE);
357: Hits hits = searcher.search(query);
358: assertEquals(3, hits.length());
359: // Make sure that those matches where the terms appear closer to
360: // each other get a higher score:
361: assertEquals(0.71, hits.score(0), 0.01);
362: assertEquals(0, hits.id(0));
363: assertEquals(0.44, hits.score(1), 0.01);
364: assertEquals(1, hits.id(1));
365: assertEquals(0.31, hits.score(2), 0.01);
366: assertEquals(2, hits.id(2));
367: QueryUtils.check(query, searcher);
368: }
369:
370: public void testWrappedPhrase() throws IOException {
371: query.add(new Term("repeated", "first"));
372: query.add(new Term("repeated", "part"));
373: query.add(new Term("repeated", "second"));
374: query.add(new Term("repeated", "part"));
375: query.setSlop(100);
376:
377: Hits hits = searcher.search(query);
378: assertEquals("slop of 100 just right", 1, hits.length());
379: QueryUtils.check(query, searcher);
380:
381: query.setSlop(99);
382:
383: hits = searcher.search(query);
384: assertEquals("slop of 99 not enough", 0, hits.length());
385: QueryUtils.check(query, searcher);
386: }
387:
388: // work on two docs like this: "phrase exist notexist exist found"
389: public void testNonExistingPhrase() throws IOException {
390: // phrase without repetitions that exists in 2 docs
391: query.add(new Term("nonexist", "phrase"));
392: query.add(new Term("nonexist", "notexist"));
393: query.add(new Term("nonexist", "found"));
394: query.setSlop(2); // would be found this way
395:
396: Hits hits = searcher.search(query);
397: assertEquals("phrase without repetitions exists in 2 docs", 2,
398: hits.length());
399: QueryUtils.check(query, searcher);
400:
401: // phrase with repetitions that exists in 2 docs
402: query = new PhraseQuery();
403: query.add(new Term("nonexist", "phrase"));
404: query.add(new Term("nonexist", "exist"));
405: query.add(new Term("nonexist", "exist"));
406: query.setSlop(1); // would be found
407:
408: hits = searcher.search(query);
409: assertEquals("phrase with repetitions exists in two docs", 2,
410: hits.length());
411: QueryUtils.check(query, searcher);
412:
413: // phrase I with repetitions that does not exist in any doc
414: query = new PhraseQuery();
415: query.add(new Term("nonexist", "phrase"));
416: query.add(new Term("nonexist", "notexist"));
417: query.add(new Term("nonexist", "phrase"));
418: query.setSlop(1000); // would not be found no matter how high the slop is
419:
420: hits = searcher.search(query);
421: assertEquals(
422: "nonexisting phrase with repetitions does not exist in any doc",
423: 0, hits.length());
424: QueryUtils.check(query, searcher);
425:
426: // phrase II with repetitions that does not exist in any doc
427: query = new PhraseQuery();
428: query.add(new Term("nonexist", "phrase"));
429: query.add(new Term("nonexist", "exist"));
430: query.add(new Term("nonexist", "exist"));
431: query.add(new Term("nonexist", "exist"));
432: query.setSlop(1000); // would not be found no matter how high the slop is
433:
434: hits = searcher.search(query);
435: assertEquals(
436: "nonexisting phrase with repetitions does not exist in any doc",
437: 0, hits.length());
438: QueryUtils.check(query, searcher);
439:
440: }
441:
442: /**
443: * Working on a 2 fields like this:
444: * Field("field", "one two three four five")
445: * Field("palindrome", "one two three two one")
446: * Phrase of size 2 occuriong twice, once in order and once in reverse,
447: * because doc is a palyndrome, is counted twice.
448: * Also, in this case order in query does not matter.
449: * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
450: */
451: public void testPalyndrome2() throws Exception {
452:
453: // search on non palyndrome, find phrase with no slop, using exact phrase scorer
454: query.setSlop(0); // to use exact phrase scorer
455: query.add(new Term("field", "two"));
456: query.add(new Term("field", "three"));
457: Hits hits = searcher.search(query);
458: assertEquals("phrase found with exact phrase scorer", 1, hits
459: .length());
460: float score0 = hits.score(0);
461: //System.out.println("(exact) field: two three: "+score0);
462: QueryUtils.check(query, searcher);
463:
464: // search on non palyndrome, find phrase with slop 2, though no slop required here.
465: query.setSlop(2); // to use sloppy scorer
466: hits = searcher.search(query);
467: assertEquals("just sloppy enough", 1, hits.length());
468: float score1 = hits.score(0);
469: //System.out.println("(sloppy) field: two three: "+score1);
470: assertEquals(
471: "exact scorer and sloppy scorer score the same when slop does not matter",
472: score0, score1, SCORE_COMP_THRESH);
473: QueryUtils.check(query, searcher);
474:
475: // search ordered in palyndrome, find it twice
476: query = new PhraseQuery();
477: query.setSlop(2); // must be at least two for both ordered and reversed to match
478: query.add(new Term("palindrome", "two"));
479: query.add(new Term("palindrome", "three"));
480: hits = searcher.search(query);
481: assertEquals("just sloppy enough", 1, hits.length());
482: float score2 = hits.score(0);
483: //System.out.println("palindrome: two three: "+score2);
484: QueryUtils.check(query, searcher);
485:
486: //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
487: //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
488:
489: // search reveresed in palyndrome, find it twice
490: query = new PhraseQuery();
491: query.setSlop(2); // must be at least two for both ordered and reversed to match
492: query.add(new Term("palindrome", "three"));
493: query.add(new Term("palindrome", "two"));
494: hits = searcher.search(query);
495: assertEquals("just sloppy enough", 1, hits.length());
496: float score3 = hits.score(0);
497: //System.out.println("palindrome: three two: "+score3);
498: QueryUtils.check(query, searcher);
499:
500: //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
501: //assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
502: //assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
503: }
504:
505: /**
506: * Working on a 2 fields like this:
507: * Field("field", "one two three four five")
508: * Field("palindrome", "one two three two one")
509: * Phrase of size 3 occuriong twice, once in order and once in reverse,
510: * because doc is a palyndrome, is counted twice.
511: * Also, in this case order in query does not matter.
512: * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same.
513: */
514: public void testPalyndrome3() throws Exception {
515:
516: // search on non palyndrome, find phrase with no slop, using exact phrase scorer
517: query.setSlop(0); // to use exact phrase scorer
518: query.add(new Term("field", "one"));
519: query.add(new Term("field", "two"));
520: query.add(new Term("field", "three"));
521: Hits hits = searcher.search(query);
522: assertEquals("phrase found with exact phrase scorer", 1, hits
523: .length());
524: float score0 = hits.score(0);
525: //System.out.println("(exact) field: one two three: "+score0);
526: QueryUtils.check(query, searcher);
527:
528: // search on non palyndrome, find phrase with slop 3, though no slop required here.
529: query.setSlop(4); // to use sloppy scorer
530: hits = searcher.search(query);
531: assertEquals("just sloppy enough", 1, hits.length());
532: float score1 = hits.score(0);
533: //System.out.println("(sloppy) field: one two three: "+score1);
534: assertEquals(
535: "exact scorer and sloppy scorer score the same when slop does not matter",
536: score0, score1, SCORE_COMP_THRESH);
537: QueryUtils.check(query, searcher);
538:
539: // search ordered in palyndrome, find it twice
540: query = new PhraseQuery();
541: query.setSlop(4); // must be at least four for both ordered and reversed to match
542: query.add(new Term("palindrome", "one"));
543: query.add(new Term("palindrome", "two"));
544: query.add(new Term("palindrome", "three"));
545: hits = searcher.search(query);
546: assertEquals("just sloppy enough", 1, hits.length());
547: float score2 = hits.score(0);
548: //System.out.println("palindrome: one two three: "+score2);
549: QueryUtils.check(query, searcher);
550:
551: //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
552: //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
553:
554: // search reveresed in palyndrome, find it twice
555: query = new PhraseQuery();
556: query.setSlop(4); // must be at least four for both ordered and reversed to match
557: query.add(new Term("palindrome", "three"));
558: query.add(new Term("palindrome", "two"));
559: query.add(new Term("palindrome", "one"));
560: hits = searcher.search(query);
561: assertEquals("just sloppy enough", 1, hits.length());
562: float score3 = hits.score(0);
563: //System.out.println("palindrome: three two one: "+score3);
564: QueryUtils.check(query, searcher);
565:
566: //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
567: //assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
568: //assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
569: }
570:
571: }
|