001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.io.StringReader;
022: import java.util.ArrayList;
023: import java.util.HashMap;
024: import java.util.HashSet;
025: import java.util.Iterator;
026:
027: import org.apache.lucene.analysis.Analyzer;
028: import org.apache.lucene.analysis.Token;
029: import org.apache.lucene.analysis.TokenStream;
030: import org.apache.lucene.index.IndexReader;
031: import org.apache.lucene.index.Term;
032: import org.apache.lucene.index.TermEnum;
033: import org.apache.lucene.util.PriorityQueue;
034:
035: /**
036: * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
037: * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
038: * of fuzzy scoring factors.
039: * This generally produces good results for queries where users may provide details in a number of
040: * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
041: * a fast query.
042: *
043: * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
044: * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
045: * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
046: * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
047: * term) and this is factored into the variant's boost. If the source query term does not exist in the
048: * index the average IDF of the variants is used.
049: * @author maharwood
050: */
051: public class FuzzyLikeThisQuery extends Query {
052: static Similarity sim = new DefaultSimilarity();
053: Query rewrittenQuery = null;
054: ArrayList fieldVals = new ArrayList();
055: Analyzer analyzer;
056:
057: ScoreTermQueue q;
058: int MAX_VARIANTS_PER_TERM = 50;
059: boolean ignoreTF = false;
060:
061: /**
062: *
063: * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
064: * @param analyzer
065: */
066: public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) {
067: q = new ScoreTermQueue(maxNumTerms);
068: this .analyzer = analyzer;
069: }
070:
071: class FieldVals {
072: String queryString;
073: String fieldName;
074: float minSimilarity;
075: int prefixLength;
076:
077: public FieldVals(String name, float similarity, int length,
078: String queryString) {
079: fieldName = name;
080: minSimilarity = similarity;
081: prefixLength = length;
082: this .queryString = queryString;
083: }
084:
085: }
086:
087: /**
088: * Adds user input for "fuzzification"
089: * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
090: * @param fieldName
091: * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
092: * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
093: */
094: public void addTerms(String queryString, String fieldName,
095: float minSimilarity, int prefixLength) {
096: fieldVals.add(new FieldVals(fieldName, minSimilarity,
097: prefixLength, queryString));
098: }
099:
100: private void addTerms(IndexReader reader, FieldVals f)
101: throws IOException {
102: if (f.queryString == null)
103: return;
104: TokenStream ts = analyzer.tokenStream(f.fieldName,
105: new StringReader(f.queryString));
106: Token token = ts.next();
107: int corpusNumDocs = reader.numDocs();
108: Term internSavingTemplateTerm = new Term(f.fieldName, ""); //optimization to avoid constructing new Term() objects
109: HashSet processedTerms = new HashSet();
110: while (token != null) {
111: if (!processedTerms.contains(token.termText())) {
112: processedTerms.add(token.termText());
113: ScoreTermQueue variantsQ = new ScoreTermQueue(
114: MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
115: float minScore = 0;
116: Term startTerm = internSavingTemplateTerm
117: .createTerm(token.termText());
118: FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm,
119: f.minSimilarity, f.prefixLength);
120: TermEnum origEnum = reader.terms(startTerm);
121: int df = 0;
122: if (startTerm.equals(origEnum.term())) {
123: df = origEnum.docFreq(); //store the df so all variants use same idf
124: }
125: int numVariants = 0;
126: int totalVariantDocFreqs = 0;
127: do {
128: Term possibleMatch = fe.term();
129: if (possibleMatch != null) {
130: numVariants++;
131: totalVariantDocFreqs += fe.docFreq();
132: float score = fe.difference();
133: if (variantsQ.size() < MAX_VARIANTS_PER_TERM
134: || score > minScore) {
135: ScoreTerm st = new ScoreTerm(possibleMatch,
136: score, startTerm);
137: variantsQ.insert(st);
138: minScore = ((ScoreTerm) variantsQ.top()).score; // maintain minScore
139: }
140: }
141: } while (fe.next());
142: if (numVariants == 0) {
143: //no variants to rank here
144: break;
145: }
146: int avgDf = totalVariantDocFreqs / numVariants;
147: if (df == 0)//no direct match we can use as df for all variants
148: {
149: df = avgDf; //use avg df of all variants
150: }
151:
152: // take the top variants (scored by edit distance) and reset the score
153: // to include an IDF factor then add to the global queue for ranking overall top query terms
154: int size = variantsQ.size();
155: for (int i = 0; i < size; i++) {
156: ScoreTerm st = (ScoreTerm) variantsQ.pop();
157: st.score = (st.score * st.score)
158: * sim.idf(df, corpusNumDocs);
159: q.insert(st);
160: }
161: }
162: token = ts.next();
163: }
164: }
165:
166: public Query rewrite(IndexReader reader) throws IOException {
167: if (rewrittenQuery != null) {
168: return rewrittenQuery;
169: }
170: //load up the list of possible terms
171: for (Iterator iter = fieldVals.iterator(); iter.hasNext();) {
172: FieldVals f = (FieldVals) iter.next();
173: addTerms(reader, f);
174: }
175: //clear the list of fields
176: fieldVals.clear();
177:
178: BooleanQuery bq = new BooleanQuery();
179:
180: //create BooleanQueries to hold the variants for each token/field pair and ensure it
181: // has no coord factor
182: //Step 1: sort the termqueries by term/field
183: HashMap variantQueries = new HashMap();
184: int size = q.size();
185: for (int i = 0; i < size; i++) {
186: ScoreTerm st = (ScoreTerm) q.pop();
187: ArrayList l = (ArrayList) variantQueries
188: .get(st.fuzziedSourceTerm);
189: if (l == null) {
190: l = new ArrayList();
191: variantQueries.put(st.fuzziedSourceTerm, l);
192: }
193: l.add(st);
194: }
195: //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
196: for (Iterator iter = variantQueries.values().iterator(); iter
197: .hasNext();) {
198: ArrayList variants = (ArrayList) iter.next();
199: if (variants.size() == 1) {
200: //optimize where only one selected variant
201: ScoreTerm st = (ScoreTerm) variants.get(0);
202: TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF);
203: tq.setBoost(st.score); // set the boost to a mix of IDF and score
204: bq.add(tq, BooleanClause.Occur.SHOULD);
205: } else {
206: BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
207: for (Iterator iterator2 = variants.iterator(); iterator2
208: .hasNext();) {
209: ScoreTerm st = (ScoreTerm) iterator2.next();
210: TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF); // found a match
211: tq.setBoost(st.score); // set the boost using the ScoreTerm's score
212: termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
213: }
214: bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
215: }
216: }
217: //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
218: // booleans with a minimum-should-match of NumFields-1?
219: bq.setBoost(getBoost());
220: this .rewrittenQuery = bq;
221: return bq;
222: }
223:
224: //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
225: // term variants) then is reset with IDF for use in ranking against all other
226: // terms/fields
227: private static class ScoreTerm {
228: public Term term;
229: public float score;
230: Term fuzziedSourceTerm;
231:
232: public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) {
233: this .term = term;
234: this .score = score;
235: this .fuzziedSourceTerm = fuzziedSourceTerm;
236: }
237: }
238:
239: private static class ScoreTermQueue extends PriorityQueue {
240: public ScoreTermQueue(int size) {
241: initialize(size);
242: }
243:
244: /* (non-Javadoc)
245: * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
246: */
247: protected boolean lessThan(Object a, Object b) {
248: ScoreTerm termA = (ScoreTerm) a;
249: ScoreTerm termB = (ScoreTerm) b;
250: if (termA.score == termB.score)
251: return termA.term.compareTo(termB.term) > 0;
252: else
253: return termA.score < termB.score;
254: }
255:
256: }
257:
258: //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
259: private static class FuzzyTermQuery extends TermQuery {
260: boolean ignoreTF;
261:
262: public FuzzyTermQuery(Term t, boolean ignoreTF) {
263: super (t);
264: this .ignoreTF = ignoreTF;
265: }
266:
267: public Similarity getSimilarity(Searcher searcher) {
268: Similarity result = super .getSimilarity(searcher);
269: result = new SimilarityDelegator(result) {
270:
271: public float tf(float freq) {
272: if (ignoreTF) {
273: return 1; //ignore tf
274: }
275: return super .tf(freq);
276: }
277:
278: public float idf(int docFreq, int numDocs) {
279: //IDF is already factored into individual term boosts
280: return 1;
281: }
282: };
283: return result;
284: }
285: }
286:
287: /* (non-Javadoc)
288: * @see org.apache.lucene.search.Query#toString(java.lang.String)
289: */
290: public String toString(String field) {
291: return null;
292: }
293:
294: public boolean isIgnoreTF() {
295: return ignoreTF;
296: }
297:
298: public void setIgnoreTF(boolean ignoreTF) {
299: this.ignoreTF = ignoreTF;
300: }
301:
302: }
|