# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
import os
from lucene import \
Document, IndexReader, Term, BooleanQuery, IndexSearcher, TermQuery, \
SimpleFSDirectory, File, System, BooleanClause
class BooksLikeThis(object):
    """Find books similar to a given book in a Lucene index.

    Similarity is expressed as a BooleanQuery built from the book's
    "author" field values (boosted) and the terms in its "subject" term
    vector; the book itself is excluded by its "isbn".
    """

    @classmethod
    def main(cls, argv):
        """Print, for every live document in the index, the books like it.

        The index location is read from the Java system property
        "index.dir".  `argv` is accepted for the sample-runner calling
        convention but is not used.
        """
        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))
        try:
            # Second argument True opens the reader read-only.
            reader = IndexReader.open(directory, True)
            try:
                blt = BooksLikeThis(reader)

                for id in xrange(reader.maxDoc()):
                    # maxDoc() counts deleted slots too; skip them.
                    if reader.isDeleted(id):
                        continue

                    doc = reader.document(id)
                    print('')
                    print(doc.get("title").encode('utf-8'))

                    docs = blt.docsLike(id, doc, 10)
                    if not docs:
                        print(" None like this")
                    else:
                        for similar in docs:
                            print(" -> %s"
                                  % similar.get("title").encode('utf-8'))
            finally:
                # Release index file handles even if a lookup fails.
                reader.close()
        finally:
            directory.close()

    def __init__(self, reader):
        """Keep `reader` and create an IndexSearcher over it.

        The reader is not owned by this object; the caller closes it.
        """
        self.reader = reader
        self.searcher = IndexSearcher(reader)

    def docsLike(self, id, doc, max):
        """Return up to `max` documents similar to `doc`.

        id  -- document number of `doc` in self.reader, used to fetch
               its "subject" term vector.
        doc -- the stored Document; its "author" values and "isbn" are
               read.
        max -- maximum number of documents to return; values <= 0 yield
               an empty list.

        The generated query is printed before searching.
        """
        # Books sharing an author are weighted higher than books that
        # merely share subject terms.
        authorQuery = BooleanQuery()
        for author in doc.getValues("author"):
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        authorQuery.setBoost(2.0)

        subjectQuery = BooleanQuery()
        vector = self.reader.getTermFreqVector(id, "subject")
        # getTermFreqVector returns null (None) when the document has no
        # stored term vector for the field; treat that as "no subjects"
        # instead of failing on vector.getTerms().
        if vector is not None:
            for term in vector.getTerms():
                subjectQuery.add(TermQuery(Term("subject", term)),
                                 BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print(" Query: %s" % likeThisQuery.toString("contents"))

        if max <= 0:
            return []

        # Ask for exactly `max` hits: the top-`max` hits are a prefix of
        # any larger result list, so this matches the former
        # "search 50, keep the first max" for max <= 50 and no longer
        # silently caps larger requests.
        scoreDocs = self.searcher.search(likeThisQuery, max).scoreDocs
        return [self.searcher.doc(scoreDoc.doc) for scoreDoc in scoreDocs]
|