# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
import os
from unittest import TestCase
from time import time
from datetime import timedelta
from lucene import \
IndexWriter, SimpleAnalyzer, SimpleFSDirectory, Document, Field, \
System, File
class CompoundVersusMultiFileIndexTest(TestCase):
def __init__(self, *args):
super(CompoundVersusMultiFileIndexTest, self).__init__(*args)
self.docs = self.loadDocuments(5000, 10)
def setUp(self):
indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
"index-dir")
cIndexDir = "%s-compound" %(indexDir)
mIndexDir = "%s-multi" %(indexDir)
self.rmdir(cIndexDir)
self.rmdir(mIndexDir)
self.cDir = SimpleFSDirectory(File(cIndexDir))
self.mDir = SimpleFSDirectory(File(mIndexDir))
def rmdir(self, dir):
for dir, dirnames, filenames in os.walk(dir):
for filename in filenames:
os.remove(os.path.join(dir, filename))
for dirname in dirnames:
os.rmdir(os.path.join(dir, dirname))
def testTiming(self):
cTiming = self.timeIndexWriter(self.cDir, True)
mTiming = self.timeIndexWriter(self.mDir, False)
print "Compound Time :", cTiming
print "Multi-file Time:", mTiming
self.assert_(cTiming > mTiming)
def timeIndexWriter(self, dir, isCompound):
start = time()
self.addDocuments(dir, isCompound)
return timedelta(seconds=time() - start)
def addDocuments(self, dir, isCompound):
writer = IndexWriter(dir, SimpleAnalyzer(), True,
IndexWriter.MaxFieldLength.LIMITED)
writer.setUseCompoundFile(isCompound)
# change to adjust performance of indexing with FSDirectory
# writer.mergeFactor = writer.mergeFactor
# writer.maxMergeDocs = writer.maxMergeDocs
# writer.minMergeDocs = writer.minMergeDocs
for word in self.docs:
doc = Document()
doc.add(Field("keyword", word,
Field.Store.YES, Field.Index.NOT_ANALYZED))
doc.add(Field("unindexed", word,
Field.Store.YES, Field.Index.NO))
doc.add(Field("unstored", word,
Field.Store.NO, Field.Index.ANALYZED))
doc.add(Field("text", word,
Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.optimize()
writer.close()
def loadDocuments(self, numDocs, wordsPerDoc):
return ["Bibamus " * wordsPerDoc] * numDocs
|