001: package org.apache.lucene.search.spell;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.util.Iterator;
022:
023: import org.apache.lucene.analysis.WhitespaceAnalyzer;
024: import org.apache.lucene.document.Document;
025: import org.apache.lucene.document.Field;
026: import org.apache.lucene.index.IndexReader;
027: import org.apache.lucene.index.IndexWriter;
028: import org.apache.lucene.index.Term;
029: import org.apache.lucene.search.BooleanClause;
030: import org.apache.lucene.search.BooleanQuery;
031: import org.apache.lucene.search.Hits;
032: import org.apache.lucene.search.IndexSearcher;
033: import org.apache.lucene.search.Query;
034: import org.apache.lucene.search.TermQuery;
035: import org.apache.lucene.store.Directory;
036:
037: /**
038: * <p>
039: * Spell Checker class (Main class) <br/>
040: * (initially inspired by the David Spencer code).
041: * </p>
042: *
043: * <p>Example Usage:
044: *
045: * <pre>
046: * SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
047: * // To index a field of a user index:
048: * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
049: * // To index a file containing words:
050: * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
051: * String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
052: * </pre>
053: *
054: *
055: * @version 1.0
056: */
057: public class SpellChecker {
058:
059: /**
060: * Field name for each word in the ngram index.
061: */
062: public static final String F_WORD = "word";
063:
064: /**
065: * the spell index
066: */
067: Directory spellIndex;
068:
069: /**
070: * Boost value for start and end grams
071: */
072: private float bStart = 2.0f;
073: private float bEnd = 1.0f;
074:
075: private IndexReader reader;
076: private IndexSearcher searcher;
077:
078: // minimum score for hits generated by the spell checker query
079: private float minScore = 0.5f;
080:
081: /**
082: * Use the given directory as a spell checker index. The directory
083: * is created if it doesn't exist yet.
084: *
085: * @param spellIndex
086: * @throws IOException
087: */
088: public SpellChecker(Directory spellIndex) throws IOException {
089: this .setSpellIndex(spellIndex);
090: }
091:
092: /**
093: * Use a different index as the spell checker index or re-open
094: * the existing index if <code>spellIndex</code> is the same value
095: * as given in the constructor.
096: *
097: * @param spellIndex
098: * @throws IOException
099: */
100: public void setSpellIndex(Directory spellIndex) throws IOException {
101: this .spellIndex = spellIndex;
102: if (!IndexReader.indexExists(spellIndex)) {
103: IndexWriter writer = new IndexWriter(spellIndex, null, true);
104: writer.close();
105: }
106: // close the old searcher, if there was one
107: if (searcher != null) {
108: searcher.close();
109: }
110: searcher = new IndexSearcher(this .spellIndex);
111: }
112:
113: /**
114: * Sets the accuracy 0 < minScore < 1; default 0.5
115: */
116: public void setAccuracy(float minScore) {
117: this .minScore = minScore;
118: }
119:
120: /**
121: * Suggest similar words.
122: *
123: * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
124: * is not the same as the edit distance strategy used to calculate the best
125: * matching spell-checked word from the hits that Lucene found, one usually has
126: * to retrieve a couple of numSug's in order to get the true best match.
127: *
128: * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
129: * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
130: *
131: * @param word the word you want a spell check done on
132: * @param numSug the number of suggested words
133: * @throws IOException
134: * @return String[]
135: */
136: public String[] suggestSimilar(String word, int numSug)
137: throws IOException {
138: return this .suggestSimilar(word, numSug, null, null, false);
139: }
140:
141: /**
142: * Suggest similar words (optionally restricted to a field of an index).
143: *
144: * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
145: * is not the same as the edit distance strategy used to calculate the best
146: * matching spell-checked word from the hits that Lucene found, one usually has
147: * to retrieve a couple of numSug's in order to get the true best match.
148: *
149: * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
150: * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
151: *
152: * @param word the word you want a spell check done on
153: * @param numSug the number of suggested words
154: * @param ir the indexReader of the user index (can be null see field param)
155: * @param field the field of the user index: if field is not null, the suggested
156: * words are restricted to the words present in this field.
157: * @param morePopular return only the suggest words that are more frequent than the searched word
158: * (only if restricted mode = (indexReader!=null and field!=null)
159: * @throws IOException
160: * @return String[] the sorted list of the suggest words with these 2 criteria:
161: * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
162: * of the suggest words in the field of the user index
163: */
164: public String[] suggestSimilar(String word, int numSug,
165: IndexReader ir, String field, boolean morePopular)
166: throws IOException {
167:
168: float min = this .minScore;
169: final TRStringDistance sd = new TRStringDistance(word);
170: final int lengthWord = word.length();
171:
172: final int goalFreq = (morePopular && ir != null) ? ir
173: .docFreq(new Term(field, word)) : 0;
174: // if the word exists in the real index and we don't care for word frequency, return the word itself
175: if (!morePopular && goalFreq > 0) {
176: return new String[] { word };
177: }
178:
179: BooleanQuery query = new BooleanQuery();
180: String[] grams;
181: String key;
182:
183: for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
184:
185: key = "gram" + ng; // form key
186:
187: grams = formGrams(word, ng); // form word into ngrams (allow dups too)
188:
189: if (grams.length == 0) {
190: continue; // hmm
191: }
192:
193: if (bStart > 0) { // should we boost prefixes?
194: add(query, "start" + ng, grams[0], bStart); // matches start of word
195:
196: }
197: if (bEnd > 0) { // should we boost suffixes
198: add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
199:
200: }
201: for (int i = 0; i < grams.length; i++) {
202: add(query, key, grams[i]);
203: }
204: }
205:
206: // System.out.println("Q: " + query);
207: Hits hits = searcher.search(query);
208: // System.out.println("HITS: " + hits.length());
209: SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
210:
211: // go thru more than 'maxr' matches in case the distance filter triggers
212: int stop = Math.min(hits.length(), 10 * numSug);
213: SuggestWord sugWord = new SuggestWord();
214: for (int i = 0; i < stop; i++) {
215:
216: sugWord.string = hits.doc(i).get(F_WORD); // get orig word
217:
218: // don't suggest a word for itself, that would be silly
219: if (sugWord.string.equals(word)) {
220: continue;
221: }
222:
223: // edit distance/normalize with the minScore word length
224: sugWord.score = 1.0f - ((float) sd
225: .getDistance(sugWord.string) / Math.min(
226: sugWord.string.length(), lengthWord));
227: if (sugWord.score < min) {
228: continue;
229: }
230:
231: if (ir != null) { // use the user index
232: sugWord.freq = ir.docFreq(new Term(field,
233: sugWord.string)); // freq in the index
234: // don't suggest a word that is not present in the field
235: if ((morePopular && goalFreq > sugWord.freq)
236: || sugWord.freq < 1) {
237: continue;
238: }
239: }
240: sugQueue.insert(sugWord);
241: if (sugQueue.size() == numSug) {
242: // if queue full, maintain the minScore score
243: min = ((SuggestWord) sugQueue.top()).score;
244: }
245: sugWord = new SuggestWord();
246: }
247:
248: // convert to array string
249: String[] list = new String[sugQueue.size()];
250: for (int i = sugQueue.size() - 1; i >= 0; i--) {
251: list[i] = ((SuggestWord) sugQueue.pop()).string;
252: }
253:
254: return list;
255: }
256:
257: /**
258: * Add a clause to a boolean query.
259: */
260: private static void add(BooleanQuery q, String name, String value,
261: float boost) {
262: Query tq = new TermQuery(new Term(name, value));
263: tq.setBoost(boost);
264: q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
265: }
266:
267: /**
268: * Add a clause to a boolean query.
269: */
270: private static void add(BooleanQuery q, String name, String value) {
271: q.add(new BooleanClause(new TermQuery(new Term(name, value)),
272: BooleanClause.Occur.SHOULD));
273: }
274:
275: /**
276: * Form all ngrams for a given word.
277: * @param text the word to parse
278: * @param ng the ngram length e.g. 3
279: * @return an array of all ngrams in the word and note that duplicates are not removed
280: */
281: private static String[] formGrams(String text, int ng) {
282: int len = text.length();
283: String[] res = new String[len - ng + 1];
284: for (int i = 0; i < len - ng + 1; i++) {
285: res[i] = text.substring(i, i + ng);
286: }
287: return res;
288: }
289:
290: /**
291: * Removes all terms from the spell check index.
292: * @throws IOException
293: */
294: public void clearIndex() throws IOException {
295: if (IndexReader.isLocked(spellIndex)) {
296: IndexReader.unlock(spellIndex);
297: }
298: IndexWriter writer = new IndexWriter(spellIndex, null, true);
299: writer.close();
300: }
301:
302: /**
303: * Check whether the word exists in the index.
304: * @param word
305: * @throws IOException
306: * @return true iff the word exists in the index
307: */
308: public boolean exist(String word) throws IOException {
309: if (reader == null) {
310: reader = IndexReader.open(spellIndex);
311: }
312: return reader.docFreq(new Term(F_WORD, word)) > 0;
313: }
314:
315: /**
316: * Index a Dictionary
317: * @param dict the dictionary to index
318: * @throws IOException
319: */
320: public void indexDictionary(Dictionary dict) throws IOException {
321: IndexReader.unlock(spellIndex);
322: IndexWriter writer = new IndexWriter(spellIndex,
323: new WhitespaceAnalyzer(), !IndexReader
324: .indexExists(spellIndex));
325: writer.setMergeFactor(300);
326: writer.setMaxBufferedDocs(150);
327:
328: Iterator iter = dict.getWordsIterator();
329: while (iter.hasNext()) {
330: String word = (String) iter.next();
331:
332: int len = word.length();
333: if (len < 3) {
334: continue; // too short we bail but "too long" is fine...
335: }
336:
337: if (this .exist(word)) { // if the word already exist in the gramindex
338: continue;
339: }
340:
341: // ok index the word
342: Document doc = createDocument(word, getMin(len),
343: getMax(len));
344: writer.addDocument(doc);
345: }
346: // close writer
347: writer.optimize();
348: writer.close();
349: // close reader so it will be re-opened (and see the new content) when exist()
350: // is called the next time:
351: if (reader != null) {
352: reader.close();
353: reader = null;
354: }
355: // also re-open the spell index to see our own changes when the next suggestion
356: // is fetched:
357: searcher.close();
358: searcher = new IndexSearcher(this .spellIndex);
359: }
360:
361: private int getMin(int l) {
362: if (l > 5) {
363: return 3;
364: }
365: if (l == 5) {
366: return 2;
367: }
368: return 1;
369: }
370:
371: private int getMax(int l) {
372: if (l > 5) {
373: return 4;
374: }
375: if (l == 5) {
376: return 3;
377: }
378: return 2;
379: }
380:
381: private static Document createDocument(String text, int ng1, int ng2) {
382: Document doc = new Document();
383: doc.add(new Field(F_WORD, text, Field.Store.YES,
384: Field.Index.UN_TOKENIZED)); // orig term
385: addGram(text, doc, ng1, ng2);
386: return doc;
387: }
388:
389: private static void addGram(String text, Document doc, int ng1,
390: int ng2) {
391: int len = text.length();
392: for (int ng = ng1; ng <= ng2; ng++) {
393: String key = "gram" + ng;
394: String end = null;
395: for (int i = 0; i < len - ng + 1; i++) {
396: String gram = text.substring(i, i + ng);
397: doc.add(new Field(key, gram, Field.Store.NO,
398: Field.Index.UN_TOKENIZED));
399: if (i == 0) {
400: doc.add(new Field("start" + ng, gram,
401: Field.Store.NO, Field.Index.UN_TOKENIZED));
402: }
403: end = gram;
404: }
405: if (end != null) { // may not be present if len==ng1
406: doc.add(new Field("end" + ng, end, Field.Store.NO,
407: Field.Index.UN_TOKENIZED));
408: }
409: }
410: }
411:
412: /**
413: * Closes the internal IndexReader.
414: */
415: protected void finalize() throws Throwable {
416: try {
417: if (reader != null) {
418: reader.close();
419: }
420: } finally {
421: super.finalize();
422: }
423: }
424: }
|