# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
import os
from lucene import \
Document, Field, IndexWriter, StandardAnalyzer, NumericField, \
SimpleDateFormat, Version, SimpleFSDirectory, File, DateTools, DateField
# Fixed modification date written into the "modified" field of every indexed
# sample document; the date was culled from the LuceneInAction.zip archive
# on the Manning site.
samplesModified = SimpleDateFormat('yyyy-MM-dd').parse('2004-12-02')
class TestDataDocumentHandler(object):
    """Builds the sample book index from ``*.properties`` data files.

    The class is used purely as a namespace: both entry points are
    classmethods and no instance state is involved.
    """

    def createIndex(cls, dataDir, indexDir, useCompound):
        """Index every ``.properties`` file found below dataDir.

        dataDir     -- root directory, walked recursively with os.walk
        indexDir    -- filesystem path of the index directory
        useCompound -- passed straight to writer.setUseCompoundFile()

        The writer is built with its third constructor argument set to
        True (the ``create`` flag of the Lucene 3.x IndexWriter), optimized
        after the walk and always closed, even when indexing a file fails,
        so the index lock is not left behind.
        """
        directory = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(directory,
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        try:
            writer.setUseCompoundFile(useCompound)
            for dirpath, dirnames, filenames in os.walk(dataDir):
                for filename in filenames:
                    if filename.endswith('.properties'):
                        cls.indexFile(writer,
                                      os.path.join(dirpath, filename),
                                      dataDir)
            writer.optimize()
        finally:
            writer.close()
    createIndex = classmethod(createIndex)

    def indexFile(cls, writer, path, baseDir):
        """Turn one ``.properties`` file into a Document and add it to writer.

        writer  -- open IndexWriter receiving the document
        path    -- path of the properties file to index
        baseDir -- root data directory; the file's directory relative to it
                   becomes the document's category

        Raises KeyError when one of the required properties (isbn, title,
        author, url, subject, pubmonth) is missing, and ValueError when a
        non-blank, non-comment line contains no '='.
        """
        props = cls._loadProperties(path)
        category = cls._categoryFor(path, baseDir)

        isbn = props['isbn']
        title = props['title']
        author = props['author']
        url = props['url']
        subject = props['subject']
        pubmonth = props['pubmonth']

        # progress output; the single-argument parenthesised form prints
        # exactly what the bare print statement would
        print(title.encode('utf-8'))
        print(author.encode('utf-8'))
        print(subject.encode('utf-8'))
        print(category.encode('utf-8'))
        print("---------")

        doc = Document()
        doc.add(Field("isbn", isbn,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("category", category,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", title,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        # lower-cased, un-analyzed copy of the title
        doc.add(Field("title2", title.lower(),
                      Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

        # split multiple authors into unique field instances
        for name in author.split(','):
            doc.add(Field("author", name,
                          Field.Store.YES, Field.Index.NOT_ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

        doc.add(Field("url", url,
                      Field.Store.YES,
                      Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(Field("subject", subject,
                      Field.Store.NO, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

        # pubmonth is indexed twice: as the raw integer, and as a day count
        # (DateTools-parsed time in milliseconds divided by ms-per-day)
        doc.add(NumericField("pubmonth",
                             Field.Store.YES,
                             True).setIntValue(int(pubmonth)))
        published = DateTools.stringToDate(pubmonth)
        days = int(published.getTime() / (1000 * 3600 * 24.0))
        doc.add(NumericField("pubmonthAsDay").setIntValue(days))

        # catch-all field combining the main text properties
        doc.add(Field("contents",
                      ' '.join([title, subject, author, category]),
                      Field.Store.NO, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field("path", path,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("modified", DateField.dateToString(samplesModified),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))

        writer.addDocument(doc)
    indexFile = classmethod(indexFile)

    def _loadProperties(path):
        """Parse the ``name=value`` lines of the file at path into a dict.

        Values are decoded with the 'unicode-escape' codec. Blank lines and
        lines starting with '#' or '!' are skipped instead of ending the
        parse, so properties that follow them are not lost. The file is
        closed even when a malformed line raises ValueError.
        """
        props = {}
        source = open(path)
        try:
            for rawLine in source:
                line = rawLine.strip()
                if not line or line[0] in '#!':
                    continue
                name, value = line.split('=', 1)
                props[name] = value.decode('unicode-escape')
        finally:
            source.close()
        return props
    _loadProperties = staticmethod(_loadProperties)

    def _categoryFor(path, baseDir):
        """Return the '/'-separated category of path relative to baseDir.

        The category is the directory of path below baseDir with a single
        leading '/', or '' when the file sits directly in baseDir. The
        result is the same whether or not baseDir ends with a separator.
        """
        relative = os.path.dirname(path)[len(baseDir):]
        if os.path.sep != '/':
            relative = relative.replace(os.path.sep, '/')
        # a trailing separator on baseDir swallows the leading '/', so
        # normalise to exactly one leading separator
        relative = relative.strip('/')
        if relative:
            return '/' + relative
        return ''
    _categoryFor = staticmethod(_categoryFor)
|