001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.index.IndexReader;
021: import org.apache.lucene.index.Term;
022:
023: import java.io.IOException;
024:
025: /** Subclass of FilteredTermEnum for enumerating all terms that are similiar
026: * to the specified filter term.
027: *
028: * <p>Term enumerations are always ordered by Term.compareTo(). Each term in
029: * the enumeration is greater than all that precede it.
030: */
031: public final class FuzzyTermEnum extends FilteredTermEnum {
032:
033: /* This should be somewhere around the average long word.
034: * If it is longer, we waste time and space. If it is shorter, we waste a
035: * little bit of time growing the array as we encounter longer words.
036: */
037: private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19;
038:
039: /* Allows us save time required to create a new array
040: * everytime similarity is called.
041: */
042: private int[][] d;
043:
044: private float similarity;
045: private boolean endEnum = false;
046:
047: private Term searchTerm = null;
048: private final String field;
049: private final String text;
050: private final String prefix;
051:
052: private final float minimumSimilarity;
053: private final float scale_factor;
054: private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
055:
056: /**
057: * Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
058: * <p>
059: * After calling the constructor the enumeration is already pointing to the first
060: * valid term if such a term exists.
061: *
062: * @param reader
063: * @param term
064: * @throws IOException
065: * @see #FuzzyTermEnum(IndexReader, Term, float, int)
066: */
067: public FuzzyTermEnum(IndexReader reader, Term term)
068: throws IOException {
069: this (reader, term, FuzzyQuery.defaultMinSimilarity,
070: FuzzyQuery.defaultPrefixLength);
071: }
072:
073: /**
074: * Creates a FuzzyTermEnum with an empty prefix.
075: * <p>
076: * After calling the constructor the enumeration is already pointing to the first
077: * valid term if such a term exists.
078: *
079: * @param reader
080: * @param term
081: * @param minSimilarity
082: * @throws IOException
083: * @see #FuzzyTermEnum(IndexReader, Term, float, int)
084: */
085: public FuzzyTermEnum(IndexReader reader, Term term,
086: float minSimilarity) throws IOException {
087: this (reader, term, minSimilarity,
088: FuzzyQuery.defaultPrefixLength);
089: }
090:
091: /**
092: * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
093: * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
094: * <code>minSimilarity</code>.
095: * <p>
096: * After calling the constructor the enumeration is already pointing to the first
097: * valid term if such a term exists.
098: *
099: * @param reader Delivers terms.
100: * @param term Pattern term.
101: * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
102: * @param prefixLength Length of required common prefix. Default value is 0.
103: * @throws IOException
104: */
105: public FuzzyTermEnum(IndexReader reader, Term term,
106: final float minSimilarity, final int prefixLength)
107: throws IOException {
108: super ();
109:
110: if (minSimilarity >= 1.0f)
111: throw new IllegalArgumentException(
112: "minimumSimilarity cannot be greater than or equal to 1");
113: else if (minSimilarity < 0.0f)
114: throw new IllegalArgumentException(
115: "minimumSimilarity cannot be less than 0");
116: if (prefixLength < 0)
117: throw new IllegalArgumentException(
118: "prefixLength cannot be less than 0");
119:
120: this .minimumSimilarity = minSimilarity;
121: this .scale_factor = 1.0f / (1.0f - minimumSimilarity);
122: this .searchTerm = term;
123: this .field = searchTerm.field();
124:
125: //The prefix could be longer than the word.
126: //It's kind of silly though. It means we must match the entire word.
127: final int fullSearchTermLength = searchTerm.text().length();
128: final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength
129: : prefixLength;
130:
131: this .text = searchTerm.text().substring(realPrefixLength);
132: this .prefix = searchTerm.text().substring(0, realPrefixLength);
133:
134: initializeMaxDistances();
135: this .d = initDistanceArray();
136:
137: setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
138: }
139:
140: /**
141: * The termCompare method in FuzzyTermEnum uses Levenshtein distance to
142: * calculate the distance between the given term and the comparing term.
143: */
144: protected final boolean termCompare(Term term) {
145: if (field == term.field() && term.text().startsWith(prefix)) {
146: final String target = term.text()
147: .substring(prefix.length());
148: this .similarity = similarity(target);
149: return (similarity > minimumSimilarity);
150: }
151: endEnum = true;
152: return false;
153: }
154:
155: public final float difference() {
156: return (float) ((similarity - minimumSimilarity) * scale_factor);
157: }
158:
159: public final boolean endEnum() {
160: return endEnum;
161: }
162:
163: /******************************
164: * Compute Levenshtein distance
165: ******************************/
166:
167: /**
168: * Finds and returns the smallest of three integers
169: */
170: private static final int min(int a, int b, int c) {
171: final int t = (a < b) ? a : b;
172: return (t < c) ? t : c;
173: }
174:
175: private final int[][] initDistanceArray() {
176: return new int[this .text.length() + 1][TYPICAL_LONGEST_WORD_IN_INDEX];
177: }
178:
179: /**
180: * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
181: * based on how similar the Term is compared to a target term. It returns
182: * exactly 0.0f when
183: * <pre>
184: * editDistance < maximumEditDistance</pre>
185: * Otherwise it returns:
186: * <pre>
187: * 1 - (editDistance / length)</pre>
188: * where length is the length of the shortest term (text or target) including a
189: * prefix that are identical and editDistance is the Levenshtein distance for
190: * the two words.</p>
191: *
192: * <p>Embedded within this algorithm is a fail-fast Levenshtein distance
193: * algorithm. The fail-fast algorithm differs from the standard Levenshtein
194: * distance algorithm in that it is aborted if it is discovered that the
195: * mimimum distance between the words is greater than some threshold.
196: *
197: * <p>To calculate the maximum distance threshold we use the following formula:
198: * <pre>
199: * (1 - minimumSimilarity) * length</pre>
200: * where length is the shortest term including any prefix that is not part of the
201: * similarity comparision. This formula was derived by solving for what maximum value
202: * of distance returns false for the following statements:
203: * <pre>
204: * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
205: * return (similarity > minimumSimilarity);</pre>
206: * where distance is the Levenshtein distance for the two words.
207: * </p>
208: * <p>Levenshtein distance (also known as edit distance) is a measure of similiarity
209: * between two strings where the distance is measured as the number of character
210: * deletions, insertions or substitutions required to transform one string to
211: * the other string.
212: * @param target the target word or phrase
213: * @return the similarity, 0.0 or less indicates that it matches less than the required
214: * threshold and 1.0 indicates that the text and target are identical
215: */
216: private synchronized final float similarity(final String target) {
217: final int m = target.length();
218: final int n = text.length();
219: if (n == 0) {
220: //we don't have anything to compare. That means if we just add
221: //the letters for m we get the new word
222: return prefix.length() == 0 ? 0.0f
223: : 1.0f - ((float) m / prefix.length());
224: }
225: if (m == 0) {
226: return prefix.length() == 0 ? 0.0f
227: : 1.0f - ((float) n / prefix.length());
228: }
229:
230: final int maxDistance = getMaxDistance(m);
231:
232: if (maxDistance < Math.abs(m - n)) {
233: //just adding the characters of m to n or vice-versa results in
234: //too many edits
235: //for example "pre" length is 3 and "prefixes" length is 8. We can see that
236: //given this optimal circumstance, the edit distance cannot be less than 5.
237: //which is 8-3 or more precisesly Math.abs(3-8).
238: //if our maximum edit distance is 4, then we can discard this word
239: //without looking at it.
240: return 0.0f;
241: }
242:
243: //let's make sure we have enough room in our array to do the distance calculations.
244: if (d[0].length <= m) {
245: growDistanceArray(m);
246: }
247:
248: // init matrix d
249: for (int i = 0; i <= n; i++)
250: d[i][0] = i;
251: for (int j = 0; j <= m; j++)
252: d[0][j] = j;
253:
254: // start computing edit distance
255: for (int i = 1; i <= n; i++) {
256: int bestPossibleEditDistance = m;
257: final char s_i = text.charAt(i - 1);
258: for (int j = 1; j <= m; j++) {
259: if (s_i != target.charAt(j - 1)) {
260: d[i][j] = min(d[i - 1][j], d[i][j - 1],
261: d[i - 1][j - 1]) + 1;
262: } else {
263: d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1,
264: d[i - 1][j - 1]);
265: }
266: bestPossibleEditDistance = Math.min(
267: bestPossibleEditDistance, d[i][j]);
268: }
269:
270: //After calculating row i, the best possible edit distance
271: //can be found by found by finding the smallest value in a given column.
272: //If the bestPossibleEditDistance is greater than the max distance, abort.
273:
274: if (i > maxDistance
275: && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
276: //the closest the target can be to the text is just too far away.
277: //this target is leaving the party early.
278: return 0.0f;
279: }
280: }
281:
282: // this will return less than 0.0 when the edit distance is
283: // greater than the number of characters in the shorter word.
284: // but this was the formula that was previously used in FuzzyTermEnum,
285: // so it has not been changed (even though minimumSimilarity must be
286: // greater than 0.0)
287: return 1.0f - ((float) d[n][m] / (float) (prefix.length() + Math
288: .min(n, m)));
289: }
290:
291: /**
292: * Grow the second dimension of the array, so that we can calculate the
293: * Levenshtein difference.
294: */
295: private void growDistanceArray(int m) {
296: for (int i = 0; i < d.length; i++) {
297: d[i] = new int[m + 1];
298: }
299: }
300:
301: /**
302: * The max Distance is the maximum Levenshtein distance for the text
303: * compared to some other value that results in score that is
304: * better than the minimum similarity.
305: * @param m the length of the "other value"
306: * @return the maximum levenshtein distance that we care about
307: */
308: private final int getMaxDistance(int m) {
309: return (m < maxDistances.length) ? maxDistances[m]
310: : calculateMaxDistance(m);
311: }
312:
313: private void initializeMaxDistances() {
314: for (int i = 0; i < maxDistances.length; i++) {
315: maxDistances[i] = calculateMaxDistance(i);
316: }
317: }
318:
319: private int calculateMaxDistance(int m) {
320: return (int) ((1 - minimumSimilarity) * (Math.min(
321: text.length(), m) + prefix.length()));
322: }
323:
324: public void close() throws IOException {
325: super .close(); //call super.close() and let the garbage collector do its work.
326: }
327:
328: }
|