001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021:
022: import org.apache.lucene.util.LuceneTestCase;
023: import org.apache.lucene.analysis.WhitespaceAnalyzer;
024: import org.apache.lucene.document.Document;
025: import org.apache.lucene.document.Field;
026: import org.apache.lucene.index.IndexWriter;
027: import org.apache.lucene.index.Term;
028: import org.apache.lucene.store.RAMDirectory;
029:
030: /**
031: * Tests {@link FuzzyQuery}.
032: *
033: * @author Daniel Naber
034: */
035: public class TestFuzzyQuery extends LuceneTestCase {
036:
037: public void testFuzziness() throws Exception {
038: RAMDirectory directory = new RAMDirectory();
039: IndexWriter writer = new IndexWriter(directory,
040: new WhitespaceAnalyzer(), true);
041: addDoc("aaaaa", writer);
042: addDoc("aaaab", writer);
043: addDoc("aaabb", writer);
044: addDoc("aabbb", writer);
045: addDoc("abbbb", writer);
046: addDoc("bbbbb", writer);
047: addDoc("ddddd", writer);
048: writer.optimize();
049: writer.close();
050: IndexSearcher searcher = new IndexSearcher(directory);
051:
052: FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"),
053: FuzzyQuery.defaultMinSimilarity, 0);
054: Hits hits = searcher.search(query);
055: assertEquals(3, hits.length());
056:
057: // same with prefix
058: query = new FuzzyQuery(new Term("field", "aaaaa"),
059: FuzzyQuery.defaultMinSimilarity, 1);
060: hits = searcher.search(query);
061: assertEquals(3, hits.length());
062: query = new FuzzyQuery(new Term("field", "aaaaa"),
063: FuzzyQuery.defaultMinSimilarity, 2);
064: hits = searcher.search(query);
065: assertEquals(3, hits.length());
066: query = new FuzzyQuery(new Term("field", "aaaaa"),
067: FuzzyQuery.defaultMinSimilarity, 3);
068: hits = searcher.search(query);
069: assertEquals(3, hits.length());
070: query = new FuzzyQuery(new Term("field", "aaaaa"),
071: FuzzyQuery.defaultMinSimilarity, 4);
072: hits = searcher.search(query);
073: assertEquals(2, hits.length());
074: query = new FuzzyQuery(new Term("field", "aaaaa"),
075: FuzzyQuery.defaultMinSimilarity, 5);
076: hits = searcher.search(query);
077: assertEquals(1, hits.length());
078: query = new FuzzyQuery(new Term("field", "aaaaa"),
079: FuzzyQuery.defaultMinSimilarity, 6);
080: hits = searcher.search(query);
081: assertEquals(1, hits.length());
082:
083: // not similar enough:
084: query = new FuzzyQuery(new Term("field", "xxxxx"),
085: FuzzyQuery.defaultMinSimilarity, 0);
086: hits = searcher.search(query);
087: assertEquals(0, hits.length());
088: query = new FuzzyQuery(new Term("field", "aaccc"),
089: FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
090: hits = searcher.search(query);
091: assertEquals(0, hits.length());
092:
093: // query identical to a word in the index:
094: query = new FuzzyQuery(new Term("field", "aaaaa"),
095: FuzzyQuery.defaultMinSimilarity, 0);
096: hits = searcher.search(query);
097: assertEquals(3, hits.length());
098: assertEquals(hits.doc(0).get("field"), ("aaaaa"));
099: // default allows for up to two edits:
100: assertEquals(hits.doc(1).get("field"), ("aaaab"));
101: assertEquals(hits.doc(2).get("field"), ("aaabb"));
102:
103: // query similar to a word in the index:
104: query = new FuzzyQuery(new Term("field", "aaaac"),
105: FuzzyQuery.defaultMinSimilarity, 0);
106: hits = searcher.search(query);
107: assertEquals(3, hits.length());
108: assertEquals(hits.doc(0).get("field"), ("aaaaa"));
109: assertEquals(hits.doc(1).get("field"), ("aaaab"));
110: assertEquals(hits.doc(2).get("field"), ("aaabb"));
111:
112: // now with prefix
113: query = new FuzzyQuery(new Term("field", "aaaac"),
114: FuzzyQuery.defaultMinSimilarity, 1);
115: hits = searcher.search(query);
116: assertEquals(3, hits.length());
117: assertEquals(hits.doc(0).get("field"), ("aaaaa"));
118: assertEquals(hits.doc(1).get("field"), ("aaaab"));
119: assertEquals(hits.doc(2).get("field"), ("aaabb"));
120: query = new FuzzyQuery(new Term("field", "aaaac"),
121: FuzzyQuery.defaultMinSimilarity, 2);
122: hits = searcher.search(query);
123: assertEquals(3, hits.length());
124: assertEquals(hits.doc(0).get("field"), ("aaaaa"));
125: assertEquals(hits.doc(1).get("field"), ("aaaab"));
126: assertEquals(hits.doc(2).get("field"), ("aaabb"));
127: query = new FuzzyQuery(new Term("field", "aaaac"),
128: FuzzyQuery.defaultMinSimilarity, 3);
129: hits = searcher.search(query);
130: assertEquals(3, hits.length());
131: assertEquals(hits.doc(0).get("field"), ("aaaaa"));
132: assertEquals(hits.doc(1).get("field"), ("aaaab"));
133: assertEquals(hits.doc(2).get("field"), ("aaabb"));
134: query = new FuzzyQuery(new Term("field", "aaaac"),
135: FuzzyQuery.defaultMinSimilarity, 4);
136: hits = searcher.search(query);
137: assertEquals(2, hits.length());
138: assertEquals(hits.doc(0).get("field"), ("aaaaa"));
139: assertEquals(hits.doc(1).get("field"), ("aaaab"));
140: query = new FuzzyQuery(new Term("field", "aaaac"),
141: FuzzyQuery.defaultMinSimilarity, 5);
142: hits = searcher.search(query);
143: assertEquals(0, hits.length());
144:
145: query = new FuzzyQuery(new Term("field", "ddddX"),
146: FuzzyQuery.defaultMinSimilarity, 0);
147: hits = searcher.search(query);
148: assertEquals(1, hits.length());
149: assertEquals(hits.doc(0).get("field"), ("ddddd"));
150:
151: // now with prefix
152: query = new FuzzyQuery(new Term("field", "ddddX"),
153: FuzzyQuery.defaultMinSimilarity, 1);
154: hits = searcher.search(query);
155: assertEquals(1, hits.length());
156: assertEquals(hits.doc(0).get("field"), ("ddddd"));
157: query = new FuzzyQuery(new Term("field", "ddddX"),
158: FuzzyQuery.defaultMinSimilarity, 2);
159: hits = searcher.search(query);
160: assertEquals(1, hits.length());
161: assertEquals(hits.doc(0).get("field"), ("ddddd"));
162: query = new FuzzyQuery(new Term("field", "ddddX"),
163: FuzzyQuery.defaultMinSimilarity, 3);
164: hits = searcher.search(query);
165: assertEquals(1, hits.length());
166: assertEquals(hits.doc(0).get("field"), ("ddddd"));
167: query = new FuzzyQuery(new Term("field", "ddddX"),
168: FuzzyQuery.defaultMinSimilarity, 4);
169: hits = searcher.search(query);
170: assertEquals(1, hits.length());
171: assertEquals(hits.doc(0).get("field"), ("ddddd"));
172: query = new FuzzyQuery(new Term("field", "ddddX"),
173: FuzzyQuery.defaultMinSimilarity, 5);
174: hits = searcher.search(query);
175: assertEquals(0, hits.length());
176:
177: // different field = no match:
178: query = new FuzzyQuery(new Term("anotherfield", "ddddX"),
179: FuzzyQuery.defaultMinSimilarity, 0);
180: hits = searcher.search(query);
181: assertEquals(0, hits.length());
182:
183: searcher.close();
184: directory.close();
185: }
186:
187: public void testFuzzinessLong() throws Exception {
188: RAMDirectory directory = new RAMDirectory();
189: IndexWriter writer = new IndexWriter(directory,
190: new WhitespaceAnalyzer(), true);
191: addDoc("aaaaaaa", writer);
192: addDoc("segment", writer);
193: writer.optimize();
194: writer.close();
195: IndexSearcher searcher = new IndexSearcher(directory);
196:
197: FuzzyQuery query;
198: // not similar enough:
199: query = new FuzzyQuery(new Term("field", "xxxxx"),
200: FuzzyQuery.defaultMinSimilarity, 0);
201: Hits hits = searcher.search(query);
202: assertEquals(0, hits.length());
203: // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
204: // in testDefaultFuzziness so a bigger difference is allowed:
205: query = new FuzzyQuery(new Term("field", "aaaaccc"),
206: FuzzyQuery.defaultMinSimilarity, 0);
207: hits = searcher.search(query);
208: assertEquals(1, hits.length());
209: assertEquals(hits.doc(0).get("field"), ("aaaaaaa"));
210:
211: // now with prefix
212: query = new FuzzyQuery(new Term("field", "aaaaccc"),
213: FuzzyQuery.defaultMinSimilarity, 1);
214: hits = searcher.search(query);
215: assertEquals(1, hits.length());
216: assertEquals(hits.doc(0).get("field"), ("aaaaaaa"));
217: query = new FuzzyQuery(new Term("field", "aaaaccc"),
218: FuzzyQuery.defaultMinSimilarity, 4);
219: hits = searcher.search(query);
220: assertEquals(1, hits.length());
221: assertEquals(hits.doc(0).get("field"), ("aaaaaaa"));
222: query = new FuzzyQuery(new Term("field", "aaaaccc"),
223: FuzzyQuery.defaultMinSimilarity, 5);
224: hits = searcher.search(query);
225: assertEquals(0, hits.length());
226:
227: // no match, more than half of the characters is wrong:
228: query = new FuzzyQuery(new Term("field", "aaacccc"),
229: FuzzyQuery.defaultMinSimilarity, 0);
230: hits = searcher.search(query);
231: assertEquals(0, hits.length());
232:
233: // now with prefix
234: query = new FuzzyQuery(new Term("field", "aaacccc"),
235: FuzzyQuery.defaultMinSimilarity, 2);
236: hits = searcher.search(query);
237: assertEquals(0, hits.length());
238:
239: // "student" and "stellent" are indeed similar to "segment" by default:
240: query = new FuzzyQuery(new Term("field", "student"),
241: FuzzyQuery.defaultMinSimilarity, 0);
242: hits = searcher.search(query);
243: assertEquals(1, hits.length());
244: query = new FuzzyQuery(new Term("field", "stellent"),
245: FuzzyQuery.defaultMinSimilarity, 0);
246: hits = searcher.search(query);
247: assertEquals(1, hits.length());
248:
249: // now with prefix
250: query = new FuzzyQuery(new Term("field", "student"),
251: FuzzyQuery.defaultMinSimilarity, 1);
252: hits = searcher.search(query);
253: assertEquals(1, hits.length());
254: query = new FuzzyQuery(new Term("field", "stellent"),
255: FuzzyQuery.defaultMinSimilarity, 1);
256: hits = searcher.search(query);
257: assertEquals(1, hits.length());
258: query = new FuzzyQuery(new Term("field", "student"),
259: FuzzyQuery.defaultMinSimilarity, 2);
260: hits = searcher.search(query);
261: assertEquals(0, hits.length());
262: query = new FuzzyQuery(new Term("field", "stellent"),
263: FuzzyQuery.defaultMinSimilarity, 2);
264: hits = searcher.search(query);
265: assertEquals(0, hits.length());
266:
267: // "student" doesn't match anymore thanks to increased minimum similarity:
268: query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
269: hits = searcher.search(query);
270: assertEquals(0, hits.length());
271:
272: try {
273: query = new FuzzyQuery(new Term("field", "student"), 1.1f);
274: fail("Expected IllegalArgumentException");
275: } catch (IllegalArgumentException e) {
276: // expecting exception
277: }
278: try {
279: query = new FuzzyQuery(new Term("field", "student"), -0.1f);
280: fail("Expected IllegalArgumentException");
281: } catch (IllegalArgumentException e) {
282: // expecting exception
283: }
284:
285: searcher.close();
286: directory.close();
287: }
288:
289: private void addDoc(String text, IndexWriter writer)
290: throws IOException {
291: Document doc = new Document();
292: doc.add(new Field("field", text, Field.Store.YES,
293: Field.Index.TOKENIZED));
294: writer.addDocument(doc);
295: }
296:
297: }
|