001: /**
002: * Copyright 2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */package org.apache.lucene.search.similar;
016:
017: import java.io.IOException;
018: import java.io.StringReader;
019: import java.util.HashSet;
020: import java.util.Set;
021:
022: import org.apache.lucene.analysis.Analyzer;
023: import org.apache.lucene.analysis.TokenStream;
024: import org.apache.lucene.index.Term;
025: import org.apache.lucene.search.BooleanClause;
026: import org.apache.lucene.search.BooleanQuery;
027: import org.apache.lucene.search.IndexSearcher;
028: import org.apache.lucene.search.Query;
029: import org.apache.lucene.search.TermQuery;
030:
031: /**
032: * Simple similarity measures.
033: *
034: * @see MoreLikeThis
035: */
036: public final class SimilarityQueries {
037: /**
038: *
039: */
040: private SimilarityQueries() {
041: }
042:
043: /**
044: * Simple similarity query generators.
045: * Takes every unique word and forms a boolean query where all words are optional.
046: * After you get this you'll use to to query your {@link IndexSearcher} for similar docs.
047: * The only caveat is the first hit returned <b>should be</b> your source document - you'll
048: * need to then ignore that.
049: *
050: * <p>
051: * So, if you have a code fragment like this:
052: * <br>
053: * <code>
054: * Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
055: * </code>
056: *
057: * <p>
058: * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
059: *
060: * <p>
061: * The philosophy behind this method is "two documents are similar if they share lots of words".
062: * Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words.
063: *
064: * <P>
065: * This method is fail-safe in that if a long 'body' is passed in and
066: * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
067: * throws
068: * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
069: * query as it is will be returned.
070: *
071: * @param body the body of the document you want to find similar documents to
072: * @param a the analyzer to use to parse the body
073: * @param field the field you want to search on, probably something like "contents" or "body"
074: * @param stop optional set of stop words to ignore
075: * @return a query with all unique words in 'body'
076: * @throws IOException this can't happen...
077: */
078: public static Query formSimilarQuery(String body, Analyzer a,
079: String field, Set stop) throws IOException {
080: TokenStream ts = a.tokenStream(field, new StringReader(body));
081: org.apache.lucene.analysis.Token t;
082: BooleanQuery tmp = new BooleanQuery();
083: Set already = new HashSet(); // ignore dups
084: while ((t = ts.next()) != null) {
085: String word = t.termText();
086: // ignore opt stop words
087: if (stop != null && stop.contains(word))
088: continue;
089: // ignore dups
090: if (!already.add(word))
091: continue;
092: // add to query
093: TermQuery tq = new TermQuery(new Term(field, word));
094: try {
095: tmp.add(tq, BooleanClause.Occur.SHOULD);
096: } catch (BooleanQuery.TooManyClauses too) {
097: // fail-safe, just return what we have, not the end of the world
098: break;
099: }
100: }
101: return tmp;
102: }
103: }
|