# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
import math
from itertools import izip
from random import randint
from unittest import TestCase,main
from lucene import *
NUM_STRINGS = 6000
class SortTestCase(TestCase):
"""
Unit tests for sorting code, ported from Java Lucene
"""
def __init__(self, *args, **kwds):
    """
    Initialise the test case and build the table of fixture rows.

    Every row of self.data describes one document.  The twelve columns
    are, in order: tracer, contents, int, float, string, custom, i18n,
    long, double, short, byte and the custom parser encoding.  A None
    cell means the document gets no field for that column (see
    _getIndex).
    """
    super(SortTestCase, self).__init__(*args, **kwds)

    def padded(cells):
        # complete a short row with None cells up to the full 12 columns
        return cells + [None] * (12 - len(cells))

    rows = [
        [ "A", "x a", "5", "4f", "c", "A-3", u"p\u00EAche", "10", "-4.0", "3", "126", "J" ],
        [ "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0", "24", "1", "I" ],
        [ "C", "x a b c", "2147483647", "1.0", "j", "A-2", u"p\u00E9ch\u00E9", "99999999", "40.00002343", "125", "15", "H" ],
        [ "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", str(Long.MAX_VALUE), str(Double.MIN_VALUE), str(Short.MIN_VALUE), str(Byte.MIN_VALUE), "G" ],
        [ "E", "x a b c d", "5", "2f", "h", "B-8", "peach", str(Long.MIN_VALUE), str(Double.MAX_VALUE), str(Short.MAX_VALUE), str(Byte.MAX_VALUE), "F" ],
        [ "F", "y a b c d", "2", "3.14159f", "g", "B-1", u"H\u00C5T", "-44", "343.034435444", "-3", "0", "E" ],
        [ "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544", "5", "100", "D" ],
        [ "H", "y a b c d", "0", "1.4E-45", "e", "C-88", u"H\u00D8T", "1023423423005", "4.043545", "10", "-50", "C" ],
        [ "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", u"s\u00EDn", "332422459999", "4.043546", "-340", "51", "B" ],
        [ "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343", "300", "2", "A" ],
    ]
    # sparse rows: most of the sort columns are deliberately left empty
    rows.append(padded([ "W", "g", "1" ]))
    rows.append(padded([ "X", "g", "1", "0.1" ]))
    rows.append(padded([ "Y", "g", "1", "0.2" ]))
    rows.append(padded([ "Z", "f g" ]))
    self.data = rows
def _getIndex(self, even, odd):
    """
    Index a subset of self.data into a fresh RAMDirectory and return an
    IndexSearcher opened on it.

    even -- when true, rows at even positions of self.data are indexed
    odd  -- when true, rows at odd positions of self.data are indexed
    """
    directory = RAMDirectory()
    writer = IndexWriter(directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxBufferedDocs(2)
    writer.setMergeFactor(1000)

    # optional, un-analyzed sort fields; the n-th name is fed from
    # column n + 2 of a row
    optionalFields = ("int", "float", "string", "custom", "i18n",
                      "long", "double", "short", "byte", "parser")

    for position, row in enumerate(self.data):
        if position % 2 == 0:
            selected = even
        else:
            selected = odd
        if not selected:
            continue

        doc = Document()
        doc.add(Field("tracer", row[0], Field.Store.YES,
                      Field.Index.NO))
        doc.add(Field("contents", row[1], Field.Store.NO,
                      Field.Index.ANALYZED))
        for offset, fieldName in enumerate(optionalFields):
            cell = row[offset + 2]
            if cell is not None:
                doc.add(Field(fieldName, cell, Field.Store.NO,
                              Field.Index.NOT_ANALYZED))
        doc.setBoost(2.0)  # produce some scores above 1.0
        writer.addDocument(doc)

    # writer.optimize()
    writer.close()

    searcher = IndexSearcher(directory, True)
    searcher.setDefaultFieldSortScoring(True, True)
    return searcher
def _getFullIndex(self):
    """Return a searcher over an index holding every row of self.data."""
    return self._getIndex(even=True, odd=True)
def getFullStrings(self):
    """
    Build an index of NUM_STRINGS documents carrying random strings and
    return an IndexSearcher on it.

    Each document stores its two random values in "tracer"/"tracer2"
    and indexes the same values, un-analyzed, in "string"/"string2".
    """
    directory = RAMDirectory()
    writer = IndexWriter(directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxBufferedDocs(4)
    writer.setMergeFactor(97)

    for _ in xrange(NUM_STRINGS):
        primaryLength = self.getRandomNumber(2, 8)
        primary = self.getRandomCharString(primaryLength, 48, 52)
        doc = Document()
        doc.add(Field("tracer", primary, Field.Store.YES, Field.Index.NO))
        # doc.add(Field("contents", str(i), Field.Store.NO,
        #               Field.Index.ANALYZED))
        doc.add(Field("string", primary, Field.Store.NO,
                      Field.Index.NOT_ANALYZED))

        secondaryLength = self.getRandomNumber(1, 4)
        secondary = self.getRandomCharString(secondaryLength, 48, 50)
        doc.add(Field("string2", secondary, Field.Store.NO,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("tracer2", secondary, Field.Store.YES,
                      Field.Index.NO))
        doc.setBoost(2.0)  # produce some scores above 1.0

        # vary the buffer size from document to document
        writer.setMaxBufferedDocs(self.getRandomNumber(2, 12))
        writer.addDocument(doc)

    # writer.optimize()
    # print writer.getSegmentCount()
    writer.close()

    return IndexSearcher(directory, True)
def getRandomNumberString(self, num, low, high):
return ''.join([self.getRandomNumber(low, high) for i in xrange(num)])
def getRandomCharString(self, num):
return self.getRandomCharString(num, 48, 122)
def getRandomCharString(self, num, start, end):
return ''.join([chr(self.getRandomNumber(start, end))
for i in xrange(num)])
def getRandomNumber(self, low, high):
    """Return a random integer drawn with randint(low, high)."""
    drawn = randint(low, high)
    return drawn
def _getXIndex(self):
    """Return a searcher over the even-positioned rows of self.data."""
    return self._getIndex(even=True, odd=False)
def _getYIndex(self):
    """Return a searcher over the odd-positioned rows of self.data."""
    return self._getIndex(even=False, odd=True)
def _getEmptyIndex(self):
    """Return a searcher over an index to which no row was added."""
    return self._getIndex(even=False, odd=False)
def setUp(self):
    """
    Create the three searchers and the single-term queries shared by
    the tests.
    """
    self.full = self._getFullIndex()
    self.searchX = self._getXIndex()
    self.searchY = self._getYIndex()

    # one TermQuery on "contents" per probe term, stored as
    # self.queryX, self.queryY, self.queryA, self.queryE, self.queryF
    # and self.queryG
    for term in ("x", "y", "a", "e", "f", "g"):
        setattr(self, "query" + term.upper(),
                TermQuery(Term("contents", term)))
def testBuiltInSorts(self):
    """
    test the sorts by score and document number
    """
    expectations = ((self.queryX, "ACEGI"), (self.queryY, "BDFHJ"))

    # default-constructed Sort first
    sort = Sort()
    for query, tracers in expectations:
        self._assertMatches(self.full, query, sort, tracers)

    # then explicitly by document number
    sort.setSort(SortField.FIELD_DOC)
    for query, tracers in expectations:
        self._assertMatches(self.full, query, sort, tracers)
def testTypedSort(self):
    """
    test sorts where the type of field is specified
    """
    # (field name, sort type, expected for queryX, expected for queryY)
    cases = (
        ("int", SortField.INT, "IGAEC", "DHFJB"),
        ("float", SortField.FLOAT, "GCIEA", "DHJFB"),
        ("long", SortField.LONG, "EACGI", "FBJHD"),
        ("double", SortField.DOUBLE, "AGICE", "DJHBF"),
        ("byte", SortField.BYTE, "CIGAE", "DHFBJ"),
        ("short", SortField.SHORT, "IAGCE", "DFHBJ"),
        ("string", SortField.STRING, "AIGEC", "DJHFB"),
    )

    sort = Sort()
    for fieldName, fieldType, expectedX, expectedY in cases:
        # document order is the secondary sort key
        sort.setSort([SortField(fieldName, fieldType),
                      SortField.FIELD_DOC])
        self._assertMatches(self.full, self.queryX, sort, expectedX)
        self._assertMatches(self.full, self.queryY, sort, expectedY)
def testStringSort(self):
    """
    Test String sorting: small queue to many matches, multi field sort,
    reverse sort

    The top 500 hits over the random-string index must be ordered by
    "string" ascending, then "string2" descending, then document id
    ascending.  Every violation is collected and reported in the
    assertion message instead of being printed.
    """
    sort = Sort()
    searcher = self.getFullStrings()

    sort.setSort([SortField("string", SortField.STRING),
                  SortField("string2", SortField.STRING, True),
                  SortField.FIELD_DOC])

    result = searcher.search(MatchAllDocsQuery(), None, 500, sort).scoreDocs

    buff = []       # rendering of every hit, shown only on failure
    problems = []   # one message per ordering violation
    last = None
    lastSub = None
    lastDocId = 0

    for scoreDoc in result:
        doc2 = searcher.doc(scoreDoc.doc)
        v = doc2.getValues("tracer")
        v2 = doc2.getValues("tracer2")
        for _v, _v2 in zip(v, v2):
            if last is not None:
                if _v < last:
                    # first field must be in ascending order
                    problems.append("fail: %s < %s" % (_v, last))
                elif _v == last:
                    if _v2 > lastSub:
                        # second field must be in reverse order
                        problems.append("rev field fail: %s > %s"
                                        % (_v2, lastSub))
                    elif _v2 == lastSub and scoreDoc.doc < lastDocId:
                        # full tie: document ids must be ascending
                        problems.append("doc fail: %s < %s"
                                        % (scoreDoc.doc, lastDocId))

            last = _v
            lastSub = _v2
            lastDocId = scoreDoc.doc
            buff.append(_v + "(" + _v2 + ")(" + str(scoreDoc.doc) + ") ")

    message = "Found sort results out of order"
    if problems:
        message = "\n".join([message] + problems +
                            ["topn field1(field2)(docID): " + ''.join(buff)])
    self.assertTrue(not problems, message)
def testCustomFieldParserSort(self):
"""
test sorts where the type of field is specified and a custom field
parser is used, that uses a simple char encoding. The sorted string
contains a character beginning from 'A' that is mapped to a numeric
value using some "funny" algorithm to be different for each data
type.
"""
# since tests explicitly use different parsers on the same field name
# we explicitly check/purge the FieldCache between each assertMatch
fc = FieldCache.DEFAULT
class intParser(PythonIntParser):
def parseInt(_self, val):
return (ord(val[0]) - ord('A')) * 123456
class floatParser(PythonFloatParser):
def parseFloat(_self, val):
return math.sqrt(ord(val[0]))
class longParser(PythonLongParser):
def parseLong(_self, val):
return (ord(val[0]) - ord('A')) * 1234567890L
class doubleParser(PythonDoubleParser):
def parseDouble(_self, val):
return math.pow(ord(val[0]), ord(val[0]) - ord('A'))
class byteParser(PythonByteParser):
def parseByte(_self, val):
return chr(ord(val[0]) - ord('A'))
class shortParser(PythonShortParser):
def parseShort(_self, val):
return ord(val[0]) - ord('A')
sort = Sort()
sort.setSort([SortField("parser", intParser()),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
self._assertSaneFieldCaches(self.getName() + " IntParser")
fc.purgeAllCaches()
sort.setSort([SortField("parser", floatParser()),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
self._assertSaneFieldCaches(self.getName() + " FloatParser")
fc.purgeAllCaches()
sort.setSort([SortField("parser", longParser()),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
self._assertSaneFieldCaches(self.getName() + " LongParser")
fc.purgeAllCaches()
sort.setSort([SortField("parser", doubleParser()),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
self._assertSaneFieldCaches(self.getName() + " DoubleParser")
fc.purgeAllCaches()
sort.setSort([SortField("parser", byteParser()),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
self._assertSaneFieldCaches(self.getName() + " ByteParser")
fc.purgeAllCaches()
sort.setSort([SortField("parser", shortParser()),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
self._assertSaneFieldCaches(self.getName() + " ShortParser")
fc.purgeAllCaches()
def testEmptyIndex(self):
    """
    test sorts when there's nothing in the index
    """
    empty = self._getEmptyIndex()
    sort = Sort()

    # the default-constructed Sort is checked before any setSort() call
    self._assertMatches(empty, self.queryX, sort, "")

    sortArguments = (
        SortField.FIELD_DOC,
        [SortField("int", SortField.INT), SortField.FIELD_DOC],
        [SortField("string", SortField.STRING, True),
         SortField.FIELD_DOC],
        [SortField("float", SortField.FLOAT),
         SortField("string", SortField.STRING)],
    )
    for sortArgument in sortArguments:
        sort.setSort(sortArgument)
        self._assertMatches(empty, self.queryX, sort, "")
def testNewCustomFieldParserSort(self):
    """
    Test sorting w/ custom FieldComparator
    """
    comparatorSource = MyFieldComparatorSource()
    customField = SortField("parser", comparatorSource)

    sort = Sort()
    sort.setSort([customField])
    self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
def testReverseSort(self):
    """
    test sorts in reverse
    """
    # (argument for setSort, expected for queryX, expected for queryY)
    cases = (
        ([SortField(None, SortField.SCORE, True), SortField.FIELD_DOC],
         "IEGCA", "JFHDB"),
        (SortField(None, SortField.DOC, True), "IGECA", "JHFDB"),
        (SortField("int", SortField.INT, True), "CAEGI", "BJFHD"),
        (SortField("float", SortField.FLOAT, True), "AECIG", "BFJHD"),
        (SortField("string", SortField.STRING, True), "CEGIA", "BFHJD"),
    )

    sort = Sort()
    for sortArgument, expectedX, expectedY in cases:
        sort.setSort(sortArgument)
        self._assertMatches(self.full, self.queryX, sort, expectedX)
        self._assertMatches(self.full, self.queryY, sort, expectedY)
def testEmptyFieldSort(self):
    """
    test sorting when the sort field is empty(undefined) for some of the
    documents
    """
    sort = Sort()

    # (argument for setSort, expected tracers for queryF)
    cases = (
        (SortField("string", SortField.STRING), "ZJI"),
        (SortField("string", SortField.STRING, True), "IJZ"),
        (SortField("i18n", Locale.ENGLISH), "ZJI"),
        (SortField("i18n", Locale.ENGLISH, True), "IJZ"),
        (SortField("int", SortField.INT), "IZJ"),
        (SortField("int", SortField.INT, True), "JZI"),
        (SortField("float", SortField.FLOAT), "ZJI"),
        # using a nonexisting field as first sort key shouldn't make a
        # difference:
        ([SortField("nosuchfield", SortField.STRING),
          SortField("float", SortField.FLOAT)], "ZJI"),
        (SortField("float", SortField.FLOAT, True), "IJZ"),
    )
    for sortArgument, expected in cases:
        sort.setSort(sortArgument)
        self._assertMatches(self.full, self.queryF, sort, expected)

    def byIntStringFloat(reverseFloat):
        # int, then string, then float (optionally reversed)
        if reverseFloat:
            floatKey = SortField("float", SortField.FLOAT, True)
        else:
            floatKey = SortField("float", SortField.FLOAT)
        return [SortField("int", SortField.INT),
                SortField("string", SortField.STRING),
                floatKey]

    # When a field is None for both documents, the next SortField should
    # be used.  The same pair of checks runs on the plain searcher, on a
    # MultiSearcher and on a ParallelMultiSearcher wrapping it.
    for wrapper in (None, MultiSearcher, ParallelMultiSearcher):
        if wrapper is None:
            searcher = self.full
        else:
            # Don't close the wrapper: it would close the full searcher
            # too!
            searcher = wrapper([self.full])

        sort.setSort(byIntStringFloat(False))
        self._assertMatches(searcher, self.queryG, sort, "ZWXY")

        # Reverse the last criterion to make sure the test didn't pass
        # by chance
        sort.setSort(byIntStringFloat(True))
        self._assertMatches(searcher, self.queryG, sort, "ZYXW")
def testSortCombos(self):
    """
    test sorts using a series of fields
    """
    # (list of sort keys, expected tracers for queryX)
    combos = (
        ([SortField("int", SortField.INT),
          SortField("float", SortField.FLOAT)], "IGEAC"),
        ([SortField("int", SortField.INT, True),
          SortField(None, SortField.DOC, True)], "CEAGI"),
        ([SortField("float", SortField.FLOAT),
          SortField("string", SortField.STRING)], "GICEA"),
    )

    sort = Sort()
    for sortKeys, expected in combos:
        sort.setSort(sortKeys)
        self._assertMatches(self.full, self.queryX, sort, expected)
def testLocaleSort(self):
    """
    test using a Locale for sorting strings
    """
    # (list of sort keys, expected for queryX, expected for queryY)
    cases = (
        ([SortField("string", Locale.US)], "AIGEC", "DJHFB"),
        ([SortField("string", Locale.US, True)], "CEGIA", "BFHJD"),
    )

    sort = Sort()
    for sortKeys, expectedX, expectedY in cases:
        sort.setSort(sortKeys)
        self._assertMatches(self.full, self.queryX, sort, expectedX)
        self._assertMatches(self.full, self.queryY, sort, expectedY)
def testInternationalSort(self):
    """
    test using various international locales with accented characters
    (which sort differently depending on locale)
    """
    # (locale for the "i18n" field, query, expected tracers)
    cases = (
        (Locale.US, self.queryY, "BFJDH"),
        (Locale("sv", "se"), self.queryY, "BJDFH"),
        (Locale("da", "dk"), self.queryY, "BJDHF"),
        (Locale.US, self.queryX, "ECAGI"),
        (Locale.FRANCE, self.queryX, "EACGI"),
    )

    sort = Sort()
    for locale, query, expected in cases:
        sort.setSort(SortField("i18n", locale))
        self._assertMatches(self.full, query, sort, expected)
def testInternationalMultiSearcherSort(self):
    """
    Test the MultiSearcher's ability to preserve locale-sensitive ordering
    by wrapping it around a single searcher
    """
    sort = Sort()
    multiSearcher = MultiSearcher([self.full])

    # (locale for the "i18n" field, expected tracers for queryY)
    cases = (
        (Locale("sv", "se"), "BJDFH"),
        (Locale.US, "BFJDH"),
        (Locale("da", "dk"), "BJDHF"),
    )
    for locale, expected in cases:
        sort.setSort(SortField("i18n", locale))
        self._assertMatches(multiSearcher, self.queryY, sort, expected)
def testMultiSort(self):
    """
    test a variety of sorts using more than one searcher
    """
    halves = [self.searchX, self.searchY]
    self.runMultiSorts(MultiSearcher(halves), False)
def testParallelMultiSort(self):
    """
    test a variety of sorts using a parallel multisearcher
    """
    halves = [self.searchX, self.searchY]
    self.runMultiSorts(ParallelMultiSearcher(halves), False)
def testNormalizedScores(self):
    """
    test that the relevancy scores are the same even if
    hits are sorted
    """
    queries = (self.queryX, self.queryY, self.queryA)

    # capture relevancy scores of the unsorted searches
    baselines = []
    for query in queries:
        unsorted = self.full.search(query, None, 1000).scoreDocs
        baselines.append(self.getScores(unsorted, self.full))

    # the sorted searches run against the full index and against a
    # MultiSearcher over its two halves
    multi = MultiSearcher([self.searchX, self.searchY])

    # None stands for "leave the freshly constructed Sort untouched";
    # every other entry is handed to sort.setSort() as is
    sortArguments = (
        None,
        SortField.FIELD_DOC,
        SortField("int", SortField.INT),
        SortField("float", SortField.FLOAT),
        SortField("string", SortField.STRING),
        [SortField("int", SortField.INT),
         SortField("float", SortField.FLOAT)],
        [SortField("int", SortField.INT, True),
         SortField(None, SortField.DOC, True)],
        [SortField("float", SortField.FLOAT),
         SortField("string", SortField.STRING)],
    )

    # change sorting and make sure relevancy stays the same
    sort = Sort()
    for sortArgument in sortArguments:
        if sortArgument is not None:
            sort.setSort(sortArgument)
        for query, baseline in zip(queries, baselines):
            for searcher in (self.full, multi):
                found = searcher.search(query, None, 1000, sort).scoreDocs
                self._assertSameValues(baseline,
                                       self.getScores(found, searcher))
def testTopDocsScores(self):
    """
    There was previously a bug in FieldSortedHitQueue.maxscore when only
    a single doc was added. That is what the following tests for.

    The top score of a filtered search is compared, within 1e-6, to the
    top score of the unfiltered search.
    """
    sort = Sort()
    nDocs = 10

    # try to pick a query that will result in an unnormalized
    # score greater than 1 to test for correct normalization
    docs1 = self.full.search(self.queryE, None, nDocs, sort)

    # a filter meant to only allow through the first hit
    # NOTE(review): set(0, maxDoc) is called before the first hit's bit
    # is set, which looks like it lets every document through -- confirm
    # against the Java test this was ported from.
    class firstHitFilter(PythonFilter):
        def getDocIdSet(_self, reader):
            bs = BitSet(reader.maxDoc())
            bs.set(0, reader.maxDoc())
            bs.set(docs1.scoreDocs[0].doc)
            return DocIdBitSet(bs)

    docs2 = self.full.search(self.queryE, firstHitFilter(), nDocs, sort)

    expected = docs1.scoreDocs[0].score
    actual = docs2.scoreDocs[0].score
    # assertEqual() has no tolerance parameter (its third argument is
    # the failure message), so the delta is checked explicitly
    self.assertTrue(abs(expected - actual) <= 1e-6,
                    "top scores differ: %r != %r" % (expected, actual))
def testSortWithoutFillFields(self):
    """
    There was previously a bug in TopFieldCollector when fillFields was
    set to False - the same doc and score was set in ScoreDoc[]
    array. This test asserts that if fillFields is False, the documents
    are set properly. It does not use Searcher's default search
    methods(with Sort) since all set fillFields to True.
    """
    for sort in [Sort(SortField.FIELD_DOC), Sort()]:
        query = MatchAllDocsQuery()
        collector = TopFieldCollector.create(sort, 10, False,
                                             False, False, True)
        self.full.search(query, collector)

        scoreDocs = collector.topDocs().scoreDocs
        # neighbouring entries must not refer to the same document
        position = 1
        while position < len(scoreDocs):
            self.assert_(scoreDocs[position].doc !=
                         scoreDocs[position - 1].doc)
            position += 1
def testSortWithoutScoreTracking(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.

    Every hit's score and the overall max score are expected to be NaN.
    """
    for sort in [Sort(SortField.FIELD_DOC), Sort()]:
        query = MatchAllDocsQuery()
        collector = TopFieldCollector.create(sort, 10, True, False,
                                             False, True)
        self.full.search(query, collector)

        topDocs = collector.topDocs()
        for scoreDoc in topDocs.scoreDocs:
            self.assert_(Float.isNaN_(scoreDoc.score))
        self.assert_(Float.isNaN_(topDocs.getMaxScore()))
def testSortWithScoreNoMaxScoreTracking(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.

    Every hit is expected to carry a real score while the overall max
    score stays NaN.
    """
    for sort in [Sort(SortField.FIELD_DOC), Sort()]:
        query = MatchAllDocsQuery()
        collector = TopFieldCollector.create(sort, 10, True, True,
                                             False, True)
        self.full.search(query, collector)

        topDocs = collector.topDocs()
        for scoreDoc in topDocs.scoreDocs:
            self.assert_(not Float.isNaN_(scoreDoc.score))
        self.assert_(Float.isNaN_(topDocs.getMaxScore()))
def testSortWithScoreAndMaxScoreTracking(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.

    Both the per-hit scores and the overall max score are expected to
    be real numbers.
    """
    for sort in [Sort(SortField.FIELD_DOC), Sort()]:
        query = MatchAllDocsQuery()
        collector = TopFieldCollector.create(sort, 10, True, True,
                                             True, True)
        self.full.search(query, collector)

        topDocs = collector.topDocs()
        for scoreDoc in topDocs.scoreDocs:
            self.assert_(not Float.isNaN_(scoreDoc.score))
        self.assert_(not Float.isNaN_(topDocs.getMaxScore()))
def testOutOfOrderDocsScoringSort(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.

    For every combination of the three boolean flags passed to
    TopFieldCollector.create (with the final argument False) the
    collector's class name is checked and ten hits are expected.
    """
    sorts = [Sort(SortField.FIELD_DOC), Sort()]

    # (first flag, second flag, third flag, expected collector class)
    variants = (
        (False, False, False,
         "OutOfOrderOneComparatorNonScoringCollector"),
        (False, False, True,
         "OutOfOrderOneComparatorScoringMaxScoreCollector"),
        (False, True, False,
         "OutOfOrderOneComparatorScoringNoMaxScoreCollector"),
        (False, True, True,
         "OutOfOrderOneComparatorScoringMaxScoreCollector"),
        (True, False, False,
         "OutOfOrderOneComparatorNonScoringCollector"),
        (True, False, True,
         "OutOfOrderOneComparatorScoringMaxScoreCollector"),
        (True, True, False,
         "OutOfOrderOneComparatorScoringNoMaxScoreCollector"),
        (True, True, True,
         "OutOfOrderOneComparatorScoringMaxScoreCollector"),
    )

    bq = BooleanQuery()
    # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
    # which delegates to BS if there are no mandatory clauses.
    bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
    # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
    # return the clause instead of BQ.
    bq.setMinimumNumberShouldMatch(1)

    for sort in sorts:
        for flagA, flagB, flagC, collectorName in variants:
            collector = TopFieldCollector.create(sort, 10, flagA,
                                                 flagB, flagC,
                                                 False)
            className = collector.getClass().getName()
            self.assert_(className.endswith("$" + collectorName))

            self.full.search(bq, collector)
            scoreDocs = collector.topDocs().scoreDocs
            self.assertEqual(10, len(scoreDocs))
def testSortWithScoreAndMaxScoreTrackingNoResults(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.

    The collector is never handed to a search, so it must report zero
    hits and a NaN max score.
    """
    for sort in [Sort(SortField.FIELD_DOC), Sort()]:
        collector = TopFieldCollector.create(sort, 10, True, True, True,
                                             True)
        topDocs = collector.topDocs()
        self.assertEqual(0, topDocs.totalHits)
        self.assert_(Float.isNaN_(topDocs.getMaxScore()))
def runMultiSorts(self, multi, isFull):
    """
    runs a variety of sorts useful for multisearchers

    multi  -- the searcher under test
    isFull -- selects which expected orderings apply to the document
              order and "int" sorts
    """
    if isFull:
        docOrder = "ABCDEFGHIJ"
        intAscending = "IDHFGJABEC"
        intDescending = "CABEJGFHDI"
    else:
        docOrder = "ACEGIBDFHJ"
        intAscending = "IDHFGJAEBC"
        intDescending = "CAEBJGFHDI"

    queryA = self.queryA
    queryF = self.queryF

    # (argument for setSort, query, expected tracers)
    cases = (
        (SortField.FIELD_DOC, queryA, docOrder),
        (SortField("int", SortField.INT), queryA, intAscending),
        ([SortField("int", SortField.INT), SortField.FIELD_DOC],
         queryA, intAscending),
        (SortField("int", SortField.INT), queryA, intAscending),
        ([SortField("float", SortField.FLOAT), SortField.FIELD_DOC],
         queryA, "GDHJCIEFAB"),
        (SortField("float", SortField.FLOAT), queryA, "GDHJCIEFAB"),
        (SortField("string", SortField.STRING), queryA, "DJAIHGFEBC"),
        (SortField("int", SortField.INT, True), queryA, intDescending),
        (SortField("float", SortField.FLOAT, True), queryA,
         "BAFECIJHDG"),
        (SortField("string", SortField.STRING, True), queryA,
         "CBEFGHIAJD"),
        ([SortField("int", SortField.INT),
          SortField("float", SortField.FLOAT)], queryA, "IDHFGJEABC"),
        ([SortField("float", SortField.FLOAT),
          SortField("string", SortField.STRING)], queryA, "GDHJICEFAB"),
        (SortField("int", SortField.INT), queryF, "IZJ"),
        (SortField("int", SortField.INT, True), queryF, "JZI"),
        (SortField("float", SortField.FLOAT), queryF, "ZJI"),
        (SortField("string", SortField.STRING), queryF, "ZJI"),
        (SortField("string", SortField.STRING, True), queryF, "IJZ"),
    )

    sort = Sort()
    for sortArgument, query, expected in cases:
        sort.setSort(sortArgument)
        self._assertMatches(multi, query, sort, expected)

    # up to this point, all of the searches should have "sane"
    # FieldCache behavior, and should have reused the cache in several
    # cases
    self._assertSaneFieldCaches(self.getName() + " various")

    # next we'll check Locale based(String[]) for 'string', so purge first
    FieldCache.DEFAULT.purgeAllCaches()

    localeCases = (
        ([SortField("string", Locale.US)], "DJAIHGFEBC"),
        ([SortField("string", Locale.US, True)], "CBEFGHIAJD"),
        ([SortField("string", Locale.UK)], "DJAIHGFEBC"),
    )
    for sortKeys, expected in localeCases:
        sort.setSort(sortKeys)
        self._assertMatches(multi, queryA, sort, expected)

    self._assertSaneFieldCaches(self.getName() + " Locale.US + Locale.UK")
    FieldCache.DEFAULT.purgeAllCaches()
def _assertMatches(self, searcher, query, sort, expectedResult):
    """
    make sure the documents returned by the search match the expected
    list

    searcher       -- searcher to run the query on
    query          -- query to execute
    sort           -- Sort applied to the search
    expectedResult -- string made of the "tracer" values of the
                      expected hits, in the expected order
    """
    # ScoreDoc[] result = searcher.search(query, None, 1000, sort).scoreDocs

    # Ask for at least one hit even when nothing is expected: a hit
    # count of zero is presumably rejected by some Lucene releases
    # (TODO confirm), and totalHits does not depend on it.
    wanted = max(1, len(expectedResult))
    hits = searcher.search(query, None, wanted, sort)
    self.assertEqual(hits.totalHits, len(expectedResult))

    tracers = []
    for sd in hits.scoreDocs:
        tracers.extend(searcher.doc(sd.doc).getValues("tracer"))

    self.assertEqual(expectedResult, ''.join(tracers))
def getScores(self, hits, searcher):
    """
    Return a dict mapping the "tracer" value of every hit to its score.

    hits     -- iterable of hits exposing .doc and .score
    searcher -- searcher used to load each hit's document

    Asserts that every document carries exactly one "tracer" value.
    """
    scoresByTracer = {}
    for hit in hits:
        tracers = searcher.doc(hit.doc).getValues("tracer")
        self.assertEqual(len(tracers), 1)
        scoresByTracer[tracers[0]] = hit.score
    return scoresByTracer
def _assertSameValues(self, m1, m2):
"""
make sure all the values in the maps match
"""
self.assertEquals(len(m1), len(m2))
for key in m1.iterkeys():
self.assertEquals(m1[key], m2[key], 1e-6)
def getName(self):
    """Return the name of this object's class."""
    cls = type(self)
    return cls.__name__
def _assertSaneFieldCaches(self, msg):
    """
    Fail if FieldCacheSanityChecker reports any problem with the
    current entries of the default FieldCache.

    msg -- prefix of the failure message
    """
    cacheEntries = FieldCache.DEFAULT.getCacheEntries()
    problems = FieldCacheSanityChecker.checkSanity(cacheEntries)
    failureText = msg + ": Insane FieldCache usage(s) found"
    self.assertEqual(0, len(problems), failureText)
class MyFieldComparator(PythonFieldComparator):
    """
    Comparator ordering documents by the integer that a custom parser
    derives from the first character of their "parser" field.
    """

    def __init__(self, numHits):
        super(MyFieldComparator, self).__init__()
        # one remembered value per queue slot, all starting at zero
        self.slotValues = [0 for _ in xrange(numHits)]

    def copy(self, slot, doc):
        # remember the value of doc in the given slot
        value = self.docValues[doc]
        self.slotValues[slot] = value

    def compare(self, slot1, slot2):
        # sign of the difference orders the two slots
        first = self.slotValues[slot1]
        second = self.slotValues[slot2]
        return first - second

    def compareBottom(self, doc):
        candidate = self.docValues[doc]
        return self.bottomValue - candidate

    def setBottom(self, bottom):
        self.bottomValue = self.slotValues[bottom]

    def setNextReader(self, reader, docBase):
        # load the per-document values for this reader through the
        # letter-based parser
        class intParser(PythonIntParser):
            def parseInt(_self, val):
                letterIndex = ord(val[0]) - ord('A')
                return letterIndex * 123456

        cache = FieldCache.DEFAULT
        self.docValues = cache.getInts(reader, "parser", intParser())

    def value(self, slot):
        stored = self.slotValues[slot]
        return Integer(stored)
class MyFieldComparatorSource(PythonFieldComparatorSource):
    """Comparator source handing out MyFieldComparator instances."""

    def newComparator(self, fieldname, numHits, sortPos, reversed):
        # only the queue size matters; the other arguments are unused
        comparator = MyFieldComparator(numHits)
        return comparator
# Script entry point: start the Java VM, then run the tests once, or
# over and over when '-loop' is given on the command line.
if __name__ == "__main__":
    import sys
    import lucene
    env = lucene.initVM()
    if '-loop' in sys.argv:
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except (Exception, SystemExit):
                # unittest's main() ends by raising SystemExit; swallow
                # it, and ordinary errors, so the loop keeps running.
                # KeyboardInterrupt is deliberately not caught, so
                # Ctrl-C still stops the loop.
                pass
#           refs = sorted(env._dumpRefs(classes=True).items(),
#                         key=lambda x: x[1], reverse=True)
#           print refs[0:4]
    else:
        main()
|