# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
from unittest import TestCase,main
from lucene import *
class FuzzyQueryTestCase(TestCase):
    """
    Unit tests ported from Java Lucene

    Each test indexes a handful of single-term documents into an
    in-memory directory under the field "field", runs FuzzyQuery
    searches with the default settings against that index, and checks
    the number of hits and the order in which they are returned.
    """

    def _addDoc(self, text, writer):
        """Add one document whose stored, analyzed "field" holds text."""
        doc = Document()
        doc.add(Field("field", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    def _indexTerms(self, directory, texts):
        """
        Create a fresh index in directory with one document per entry
        of texts, then optimize it.

        The writer is closed even when indexing fails, so it is not
        left open on the directory if a test errors out early.
        """
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        try:
            for text in texts:
                self._addDoc(text, writer)
            writer.optimize()
        finally:
            writer.close()

    def _fieldValues(self, searcher, scoreDocs):
        """Return the stored "field" value of every hit, in hit order."""
        return [searcher.doc(scoreDoc.doc).get("field")
                for scoreDoc in scoreDocs]

    def testDefaultFuzziness(self):
        """Default FuzzyQuery behaviour on five-character terms."""
        directory = RAMDirectory()
        try:
            self._indexTerms(directory, ("aaaaa", "aaaab", "aaabb", "aabbb",
                                         "abbbb", "bbbbb", "ddddd"))
            searcher = IndexSearcher(directory, True)
            # try/finally: release the searcher even if an assertion fails
            try:
                query = FuzzyQuery(Term("field", "aaaaa"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(3, topDocs.totalHits)

                # not similar enough:
                query = FuzzyQuery(Term("field", "xxxxx"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(0, topDocs.totalHits)

                # edit distance to "aaaaa" = 3
                query = FuzzyQuery(Term("field", "aaccc"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(0, topDocs.totalHits)

                # query identical to a word in the index;
                # default allows for up to two edits:
                query = FuzzyQuery(Term("field", "aaaaa"))
                scoreDocs = searcher.search(query, 50).scoreDocs
                self.assertEqual(3, len(scoreDocs))
                self.assertEqual(["aaaaa", "aaaab", "aaabb"],
                                 self._fieldValues(searcher, scoreDocs))

                # query similar to a word in the index:
                query = FuzzyQuery(Term("field", "aaaac"))
                scoreDocs = searcher.search(query, 50).scoreDocs
                self.assertEqual(3, len(scoreDocs))
                self.assertEqual(["aaaaa", "aaaab", "aaabb"],
                                 self._fieldValues(searcher, scoreDocs))

                query = FuzzyQuery(Term("field", "ddddX"))
                scoreDocs = searcher.search(query, 50).scoreDocs
                self.assertEqual(1, len(scoreDocs))
                self.assertEqual(["ddddd"],
                                 self._fieldValues(searcher, scoreDocs))

                # different field = no match:
                query = FuzzyQuery(Term("anotherfield", "ddddX"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(0, topDocs.totalHits)
            finally:
                searcher.close()
        finally:
            directory.close()

    def testDefaultFuzzinessLong(self):
        """Default FuzzyQuery behaviour on seven-character terms."""
        directory = RAMDirectory()
        try:
            self._indexTerms(directory, ("aaaaaaa", "segment"))
            searcher = IndexSearcher(directory, True)
            # try/finally: release the searcher even if an assertion fails
            try:
                # not similar enough:
                query = FuzzyQuery(Term("field", "xxxxx"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(0, topDocs.totalHits)

                # edit distance to "aaaaaaa" = 3, this matches because
                # the string is longer than
                # in testDefaultFuzziness so a bigger difference is allowed:
                query = FuzzyQuery(Term("field", "aaaaccc"))
                scoreDocs = searcher.search(query, 50).scoreDocs
                self.assertEqual(1, len(scoreDocs))
                self.assertEqual(["aaaaaaa"],
                                 self._fieldValues(searcher, scoreDocs))

                # no match, more than half of the characters are wrong:
                query = FuzzyQuery(Term("field", "aaacccc"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(0, topDocs.totalHits)

                # "student" and "stellent" are indeed similar to "segment"
                # by default:
                query = FuzzyQuery(Term("field", "student"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(1, topDocs.totalHits)

                query = FuzzyQuery(Term("field", "stellent"))
                topDocs = searcher.search(query, 50)
                self.assertEqual(1, topDocs.totalHits)
            finally:
                searcher.close()
        finally:
            directory.close()
if __name__ == "__main__":
    # Script entry point: start the JVM that PyLucene needs, then hand
    # control to unittest's main().
    import sys
    import lucene

    lucene.initVM()

    if '-loop' in sys.argv:
        # '-loop' is a private switch: strip it so unittest's own argument
        # parsing never sees it, then re-run the suite indefinitely.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except KeyboardInterrupt:
                # Let Ctrl-C stop the endless loop instead of being
                # swallowed by the catch-all below.
                raise
            except:
                # main() exits through SystemExit once the run finishes;
                # swallowing it (and any other error) keeps the loop going.
                pass
    else:
        main()
|