# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
from unittest import TestCase,main
from lucene import *
class BaseTokenStreamTestCase(TestCase):
    """
    Base class for all Lucene unit tests that use TokenStreams.
    """

    def _assertTokenStreamContents(self, ts, output,
                                   startOffsets=None, endOffsets=None,
                                   types=None, posIncrements=None):
        """
        Consume the token stream and verify every produced token against
        the expected terms; offsets, types and position increments are
        checked only when the corresponding expectation list is supplied.
        The stream is exhausted, ended and closed before returning.
        """

        self.assert_(output is not None)

        # TermAttribute is mandatory; the remaining attributes are only
        # required when the caller asked for them to be verified.
        self.assert_(ts.hasAttribute(TermAttribute.class_),
                     "has TermAttribute")
        termAtt = ts.getAttribute(TermAttribute.class_)

        offsetAtt = None
        if not (startOffsets is None and endOffsets is None):
            self.assert_(ts.hasAttribute(OffsetAttribute.class_),
                         "has OffsetAttribute")
            offsetAtt = ts.getAttribute(OffsetAttribute.class_)

        typeAtt = None
        if types is not None:
            self.assert_(ts.hasAttribute(TypeAttribute.class_),
                         "has TypeAttribute")
            typeAtt = ts.getAttribute(TypeAttribute.class_)

        posIncrAtt = None
        if posIncrements is not None:
            self.assert_(ts.hasAttribute(PositionIncrementAttribute.class_),
                         "has PositionIncrementAttribute")
            posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class_)

        ts.reset()

        for i, expectedTerm in enumerate(output):
            # Poison every attribute with bogus values before advancing,
            # so a stream that fails to overwrite its state is caught.
            ts.clearAttributes()
            termAtt.setTermBuffer("bogusTerm")
            if offsetAtt is not None:
                offsetAtt.setOffset(14584724, 24683243)
            if typeAtt is not None:
                typeAtt.setType("bogusType")
            if posIncrAtt is not None:
                posIncrAtt.setPositionIncrement(45987657)

            self.assert_(ts.incrementToken(), "token %d exists" % i)
            self.assertEqual(expectedTerm, termAtt.term(), "term %d" % i)
            if startOffsets is not None:
                self.assertEqual(startOffsets[i], offsetAtt.startOffset(),
                                 "startOffset %d" % i)
            if endOffsets is not None:
                self.assertEqual(endOffsets[i], offsetAtt.endOffset(),
                                 "endOffset %d" % i)
            if types is not None:
                self.assertEqual(types[i], typeAtt.type(), "type %d" % i)
            if posIncrements is not None:
                self.assertEqual(posIncrements[i],
                                 posIncrAtt.getPositionIncrement(),
                                 "posIncrement %d" % i)

        # The stream must report exhaustion exactly once past the last token.
        self.assert_(not ts.incrementToken(), "end of stream")
        ts.end()
        ts.close()

    def _assertAnalyzesTo(self, a, input, output,
                          startOffsets=None, endOffsets=None,
                          types=None, posIncrements=None):
        """
        Run analyzer `a` over `input` via a fresh token stream and verify
        the produced tokens.
        """

        stream = a.tokenStream("dummy", StringReader(input))
        self._assertTokenStreamContents(stream, output,
                                        startOffsets, endOffsets,
                                        types, posIncrements)

    def _assertAnalyzesToReuse(self, a, input, output,
                               startOffsets=None, endOffsets=None,
                               types=None, posIncrements=None):
        """
        Same as _assertAnalyzesTo but exercises the analyzer's reusable
        token stream path.
        """

        stream = a.reusableTokenStream("dummy", StringReader(input))
        self._assertTokenStreamContents(stream, output,
                                        startOffsets, endOffsets,
                                        types, posIncrements)

    # simple utility method for testing stemmers

    def _checkOneTerm(self, a, input, expected):
        """Assert that analyzing `input` yields the single term `expected`."""

        self._assertAnalyzesTo(a, input, JArray('string')(expected))

    def _checkOneTermReuse(self, a, input, expected):
        """Reusable-stream variant of _checkOneTerm."""

        self._assertAnalyzesToReuse(a, input, JArray('string')(expected))
|