001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.index.IndexReader;
021: import org.apache.lucene.index.Term;
022: import org.apache.lucene.util.PriorityQueue;
023: import org.apache.lucene.util.ToStringUtils;
024:
025: import java.io.IOException;
026:
027: /** Implements the fuzzy search query. The similiarity measurement
028: * is based on the Levenshtein (edit distance) algorithm.
029: */
030: public class FuzzyQuery extends MultiTermQuery {
031:
032: public final static float defaultMinSimilarity = 0.5f;
033: public final static int defaultPrefixLength = 0;
034:
035: private float minimumSimilarity;
036: private int prefixLength;
037:
038: /**
039: * Create a new FuzzyQuery that will match terms with a similarity
040: * of at least <code>minimumSimilarity</code> to <code>term</code>.
041: * If a <code>prefixLength</code> > 0 is specified, a common prefix
042: * of that length is also required.
043: *
044: * @param term the term to search for
045: * @param minimumSimilarity a value between 0 and 1 to set the required similarity
046: * between the query term and the matching terms. For example, for a
047: * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
048: * as the query term is considered similar to the query term if the edit distance
049: * between both terms is less than <code>length(term)*0.5</code>
050: * @param prefixLength length of common (non-fuzzy) prefix
051: * @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0
052: * or if prefixLength < 0
053: */
054: public FuzzyQuery(Term term, float minimumSimilarity,
055: int prefixLength) throws IllegalArgumentException {
056: super (term);
057:
058: if (minimumSimilarity >= 1.0f)
059: throw new IllegalArgumentException("minimumSimilarity >= 1");
060: else if (minimumSimilarity < 0.0f)
061: throw new IllegalArgumentException("minimumSimilarity < 0");
062: if (prefixLength < 0)
063: throw new IllegalArgumentException("prefixLength < 0");
064:
065: this .minimumSimilarity = minimumSimilarity;
066: this .prefixLength = prefixLength;
067: }
068:
069: /**
070: * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
071: */
072: public FuzzyQuery(Term term, float minimumSimilarity)
073: throws IllegalArgumentException {
074: this (term, minimumSimilarity, defaultPrefixLength);
075: }
076:
077: /**
078: * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
079: */
080: public FuzzyQuery(Term term) {
081: this (term, defaultMinSimilarity, defaultPrefixLength);
082: }
083:
084: /**
085: * Returns the minimum similarity that is required for this query to match.
086: * @return float value between 0.0 and 1.0
087: */
088: public float getMinSimilarity() {
089: return minimumSimilarity;
090: }
091:
092: /**
093: * Returns the non-fuzzy prefix length. This is the number of characters at the start
094: * of a term that must be identical (not fuzzy) to the query term if the query
095: * is to match that term.
096: */
097: public int getPrefixLength() {
098: return prefixLength;
099: }
100:
101: protected FilteredTermEnum getEnum(IndexReader reader)
102: throws IOException {
103: return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity,
104: prefixLength);
105: }
106:
107: public Query rewrite(IndexReader reader) throws IOException {
108: FilteredTermEnum enumerator = getEnum(reader);
109: int maxClauseCount = BooleanQuery.getMaxClauseCount();
110: ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
111: ScoreTerm reusableST = null;
112:
113: try {
114: do {
115: float score = 0.0f;
116: Term t = enumerator.term();
117: if (t != null) {
118: score = enumerator.difference();
119: if (reusableST == null) {
120: reusableST = new ScoreTerm(t, score);
121: } else if (score >= reusableST.score) {
122: // reusableST holds the last "rejected" entry, so, if
123: // this new score is not better than that, there's no
124: // need to try inserting it
125: reusableST.score = score;
126: reusableST.term = t;
127: } else {
128: continue;
129: }
130:
131: reusableST = (ScoreTerm) stQueue
132: .insertWithOverflow(reusableST);
133: }
134: } while (enumerator.next());
135: } finally {
136: enumerator.close();
137: }
138:
139: BooleanQuery query = new BooleanQuery(true);
140: int size = stQueue.size();
141: for (int i = 0; i < size; i++) {
142: ScoreTerm st = (ScoreTerm) stQueue.pop();
143: TermQuery tq = new TermQuery(st.term); // found a match
144: tq.setBoost(getBoost() * st.score); // set the boost
145: query.add(tq, BooleanClause.Occur.SHOULD); // add to query
146: }
147:
148: return query;
149: }
150:
151: public String toString(String field) {
152: StringBuffer buffer = new StringBuffer();
153: Term term = getTerm();
154: if (!term.field().equals(field)) {
155: buffer.append(term.field());
156: buffer.append(":");
157: }
158: buffer.append(term.text());
159: buffer.append('~');
160: buffer.append(Float.toString(minimumSimilarity));
161: buffer.append(ToStringUtils.boost(getBoost()));
162: return buffer.toString();
163: }
164:
165: protected static class ScoreTerm {
166: public Term term;
167: public float score;
168:
169: public ScoreTerm(Term term, float score) {
170: this .term = term;
171: this .score = score;
172: }
173: }
174:
175: protected static class ScoreTermQueue extends PriorityQueue {
176:
177: public ScoreTermQueue(int size) {
178: initialize(size);
179: }
180:
181: /* (non-Javadoc)
182: * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
183: */
184: protected boolean lessThan(Object a, Object b) {
185: ScoreTerm termA = (ScoreTerm) a;
186: ScoreTerm termB = (ScoreTerm) b;
187: if (termA.score == termB.score)
188: return termA.term.compareTo(termB.term) > 0;
189: else
190: return termA.score < termB.score;
191: }
192:
193: }
194:
195: public boolean equals(Object o) {
196: if (this == o)
197: return true;
198: if (!(o instanceof FuzzyQuery))
199: return false;
200: if (!super .equals(o))
201: return false;
202:
203: final FuzzyQuery fuzzyQuery = (FuzzyQuery) o;
204:
205: if (minimumSimilarity != fuzzyQuery.minimumSimilarity)
206: return false;
207: if (prefixLength != fuzzyQuery.prefixLength)
208: return false;
209:
210: return true;
211: }
212:
213: public int hashCode() {
214: int result = super .hashCode();
215: result = 29 * result + minimumSimilarity != +0.0f ? Float
216: .floatToIntBits(minimumSimilarity) : 0;
217: result = 29 * result + prefixLength;
218: return result;
219: }
220: }
|