001: package org.apache.lucene.search.spell;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.util.Iterator;
022:
023: import org.apache.lucene.analysis.WhitespaceAnalyzer;
024: import org.apache.lucene.document.Document;
025: import org.apache.lucene.document.Field;
026: import org.apache.lucene.index.IndexReader;
027: import org.apache.lucene.index.IndexWriter;
028: import org.apache.lucene.index.Term;
029: import org.apache.lucene.search.BooleanClause;
030: import org.apache.lucene.search.BooleanQuery;
031: import org.apache.lucene.search.Hits;
032: import org.apache.lucene.search.IndexSearcher;
033: import org.apache.lucene.search.Query;
034: import org.apache.lucene.search.Searcher;
035: import org.apache.lucene.search.TermQuery;
036: import org.apache.lucene.store.Directory;
037:
038: /**
039: * <p>
040: * Spell Checker class (Main class) <br/>
041: * (initially inspired by the David Spencer code).
042: * </p>
043: *
044: * <p>Example Usage:
045: *
046: * <pre>
047: * SpellChecker spellcheck = new SpellChecker(spellIndexDirectory);
048: * // To index a field of a user index:
049: * spellcheck.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
050: * // To index a file containing words:
051: * spellcheck.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
052: * String[] suggestions = spellcheck.suggestSimilar("misspelt", 5);
053: * </pre>
054: *
055: *
056: * @version 1.0
057: */
058:
059: // Specialized SpellChecker for Compass.
// List of changes: (Mainly to separate between two use cases: one is for indexing and one for searching).
061: // 1. Added a constructor that accepts a searcher and reader ("searching" spell checker)
062: // 2. Changed searcher type from IndexSearcher to Searcher
063: // 3. Added close method
// 4. In indexDictionary, if the searcher is null, don't reopen it
065: // 5. Added a constructor that won't open an index searcher ("indexing" spell checker)
066: // 6. Added indexDictionary that accepts a dictionary and IndexWriter so we can configure it
067: public class CompassSpellChecker {
068:
069: /**
070: * Field name for each word in the ngram index.
071: */
072: public static final String F_WORD = "word";
073:
074: /**
075: * the spell index
076: */
077: Directory spellIndex;
078:
079: /**
080: * Boost value for start and end grams
081: */
082: private float bStart = 2.0f;
083: private float bEnd = 1.0f;
084:
085: private IndexReader reader;
086: private Searcher searcher;
087:
088: // minimum score for hits generated by the spell checker query
089: private float minScore = 0.5f;
090:
091: public CompassSpellChecker(Searcher searcher, IndexReader reader) {
092: this .searcher = searcher;
093: this .reader = reader;
094: }
095:
096: /**
097: * Use the given directory as a spell checker index. The directory
098: * is created if it doesn't exist yet.
099: *
100: * @param spellIndex
101: * @throws IOException
102: */
103: public CompassSpellChecker(Directory spellIndex) throws IOException {
104: this .setSpellIndex(spellIndex);
105: }
106:
107: /**
108: */
109: public CompassSpellChecker(Directory spellIndex, boolean indexing)
110: throws IOException {
111: if (indexing) {
112: this .spellIndex = spellIndex;
113: } else {
114: setSpellIndex(spellIndex);
115: }
116: }
117:
118: public void close() {
119: try {
120: searcher.close();
121: } catch (IOException e) {
122: // do nothing
123: }
124:
125: try {
126: reader.close();
127: } catch (IOException e) {
128: // do nothing
129: }
130: }
131:
132: /**
133: * Use a different index as the spell checker index or re-open
134: * the existing index if <code>spellIndex</code> is the same value
135: * as given in the constructor.
136: *
137: * @param spellIndex
138: * @throws IOException
139: */
140: public void setSpellIndex(Directory spellIndex) throws IOException {
141: this .spellIndex = spellIndex;
142: if (!IndexReader.indexExists(spellIndex)) {
143: IndexWriter writer = new IndexWriter(spellIndex, null, true);
144: writer.close();
145: }
146: // close the old searcher, if there was one
147: if (searcher != null) {
148: searcher.close();
149: }
150: searcher = new IndexSearcher(this .spellIndex);
151: }
152:
153: /**
154: * Sets the accuracy 0 < minScore < 1; default 0.5
155: */
156: public void setAccuracy(float minScore) {
157: this .minScore = minScore;
158: }
159:
160: /**
161: * Suggest similar words.
162: *
163: * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
164: * is not the same as the edit distance strategy used to calculate the best
165: * matching spell-checked word from the hits that Lucene found, one usually has
166: * to retrieve a couple of numSug's in order to get the true best match.
167: *
168: * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
169: * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
170: *
171: * @param word the word you want a spell check done on
172: * @param numSug the number of suggested words
173: * @throws IOException
174: * @return String[]
175: */
176: public String[] suggestSimilar(String word, int numSug)
177: throws IOException {
178: return this .suggestSimilar(word, numSug, null, null, false);
179: }
180:
181: /**
182: * Suggest similar words (optionally restricted to a field of an index).
183: *
184: * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
185: * is not the same as the edit distance strategy used to calculate the best
186: * matching spell-checked word from the hits that Lucene found, one usually has
187: * to retrieve a couple of numSug's in order to get the true best match.
188: *
189: * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
190: * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
191: *
192: * @param word the word you want a spell check done on
193: * @param numSug the number of suggested words
194: * @param ir the indexReader of the user index (can be null see field param)
195: * @param field the field of the user index: if field is not null, the suggested
196: * words are restricted to the words present in this field.
197: * @param morePopular return only the suggest words that are more frequent than the searched word
198: * (only if restricted mode = (indexReader!=null and field!=null)
199: * @throws IOException
200: * @return String[] the sorted list of the suggest words with these 2 criteria:
201: * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
202: * of the suggest words in the field of the user index
203: */
204: public String[] suggestSimilar(String word, int numSug,
205: IndexReader ir, String field, boolean morePopular)
206: throws IOException {
207:
208: float min = this .minScore;
209: final TRStringDistance sd = new TRStringDistance(word);
210: final int lengthWord = word.length();
211:
212: final int goalFreq = (morePopular && ir != null) ? ir
213: .docFreq(new Term(field, word)) : 0;
214: // if the word exists in the real index and we don't care for word frequency, return the word itself
215: if (!morePopular && goalFreq > 0) {
216: return new String[] { word };
217: }
218:
219: BooleanQuery query = new BooleanQuery();
220: String[] grams;
221: String key;
222:
223: for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
224:
225: key = "gram" + ng; // form key
226:
227: grams = formGrams(word, ng); // form word into ngrams (allow dups too)
228:
229: if (grams.length == 0) {
230: continue; // hmm
231: }
232:
233: if (bStart > 0) { // should we boost prefixes?
234: add(query, "start" + ng, grams[0], bStart); // matches start of word
235:
236: }
237: if (bEnd > 0) { // should we boost suffixes
238: add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
239:
240: }
241: for (int i = 0; i < grams.length; i++) {
242: add(query, key, grams[i]);
243: }
244: }
245:
246: // System.out.println("Q: " + query);
247: Hits hits = searcher.search(query);
248: // System.out.println("HITS: " + hits.length());
249: SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
250:
251: // go thru more than 'maxr' matches in case the distance filter triggers
252: int stop = Math.min(hits.length(), 10 * numSug);
253: SuggestWord sugWord = new SuggestWord();
254: for (int i = 0; i < stop; i++) {
255:
256: sugWord.string = hits.doc(i).get(F_WORD); // get orig word
257:
258: // don't suggest a word for itself, that would be silly
259: if (sugWord.string.equals(word)) {
260: continue;
261: }
262:
263: // edit distance/normalize with the minScore word length
264: sugWord.score = 1.0f - ((float) sd
265: .getDistance(sugWord.string) / Math.min(
266: sugWord.string.length(), lengthWord));
267: if (sugWord.score < min) {
268: continue;
269: }
270:
271: if (ir != null) { // use the user index
272: sugWord.freq = ir.docFreq(new Term(field,
273: sugWord.string)); // freq in the index
274: // don't suggest a word that is not present in the field
275: if ((morePopular && goalFreq > sugWord.freq)
276: || sugWord.freq < 1) {
277: continue;
278: }
279: }
280: sugQueue.insert(sugWord);
281: if (sugQueue.size() == numSug) {
282: // if queue full, maintain the minScore score
283: min = ((SuggestWord) sugQueue.top()).score;
284: }
285: sugWord = new SuggestWord();
286: }
287:
288: // convert to array string
289: String[] list = new String[sugQueue.size()];
290: for (int i = sugQueue.size() - 1; i >= 0; i--) {
291: list[i] = ((SuggestWord) sugQueue.pop()).string;
292: }
293:
294: return list;
295: }
296:
297: /**
298: * Add a clause to a boolean query.
299: */
300: private static void add(BooleanQuery q, String name, String value,
301: float boost) {
302: Query tq = new TermQuery(new Term(name, value));
303: tq.setBoost(boost);
304: q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
305: }
306:
307: /**
308: * Add a clause to a boolean query.
309: */
310: private static void add(BooleanQuery q, String name, String value) {
311: q.add(new BooleanClause(new TermQuery(new Term(name, value)),
312: BooleanClause.Occur.SHOULD));
313: }
314:
315: /**
316: * Form all ngrams for a given word.
317: * @param text the word to parse
318: * @param ng the ngram length e.g. 3
319: * @return an array of all ngrams in the word and note that duplicates are not removed
320: */
321: private static String[] formGrams(String text, int ng) {
322: int len = text.length();
323: String[] res = new String[len - ng + 1];
324: for (int i = 0; i < len - ng + 1; i++) {
325: res[i] = text.substring(i, i + ng);
326: }
327: return res;
328: }
329:
330: /**
331: * Removes all terms from the spell check index.
332: * @throws IOException
333: */
334: public void clearIndex() throws IOException {
335: if (IndexReader.isLocked(spellIndex)) {
336: IndexReader.unlock(spellIndex);
337: }
338: IndexWriter writer = new IndexWriter(spellIndex, null, true);
339: writer.close();
340: }
341:
342: /**
343: * Check whether the word exists in the index.
344: * @param word
345: * @throws IOException
346: * @return true iff the word exists in the index
347: */
348: public boolean exist(String word) throws IOException {
349: if (reader == null) {
350: reader = IndexReader.open(spellIndex);
351: }
352: return reader.docFreq(new Term(F_WORD, word)) > 0;
353: }
354:
355: public void indexDictionary(Dictionary dict) throws IOException {
356: if (IndexReader.isLocked(spellIndex)) {
357: IndexReader.unlock(spellIndex);
358: }
359: IndexWriter writer = new IndexWriter(spellIndex,
360: new WhitespaceAnalyzer(), !IndexReader
361: .indexExists(spellIndex));
362: writer.setMergeFactor(300);
363: writer.setMaxBufferedDocs(150);
364: indexDictionary(writer, dict);
365: }
366:
367: /**
368: * Index a Dictionary
369: * @param dict the dictionary to index
370: * @throws IOException
371: */
372: public void indexDictionary(IndexWriter writer, Dictionary dict)
373: throws IOException {
374:
375: Iterator iter = dict.getWordsIterator();
376: while (iter.hasNext()) {
377: String word = (String) iter.next();
378:
379: int len = word.length();
380: if (len < 3) {
381: continue; // too short we bail but "too long" is fine...
382: }
383:
384: if (this .exist(word)) { // if the word already exist in the gramindex
385: continue;
386: }
387:
388: // ok index the word
389: Document doc = createDocument(word, getMin(len),
390: getMax(len));
391: writer.addDocument(doc);
392: }
393: // close writer (REMOVED IN COMPASS), will do it on close
394: // writer.optimize();
395: // writer.close();
396: // close reader so it will be re-opened (and see the new content) when exist()
397: // is called the next time:
398: if (reader != null) {
399: reader.close();
400: reader = null;
401: }
402: // also re-open the spell index to see our own changes when the next suggestion
403: // is fetched:
404: if (searcher != null) {
405: searcher.close();
406: searcher = new IndexSearcher(this .spellIndex);
407: }
408: }
409:
410: private int getMin(int l) {
411: if (l > 5) {
412: return 3;
413: }
414: if (l == 5) {
415: return 2;
416: }
417: return 1;
418: }
419:
420: private int getMax(int l) {
421: if (l > 5) {
422: return 4;
423: }
424: if (l == 5) {
425: return 3;
426: }
427: return 2;
428: }
429:
430: private static Document createDocument(String text, int ng1, int ng2) {
431: Document doc = new Document();
432: doc.add(new Field(F_WORD, text, Field.Store.YES,
433: Field.Index.UN_TOKENIZED)); // orig term
434: addGram(text, doc, ng1, ng2);
435: return doc;
436: }
437:
438: private static void addGram(String text, Document doc, int ng1,
439: int ng2) {
440: int len = text.length();
441: for (int ng = ng1; ng <= ng2; ng++) {
442: String key = "gram" + ng;
443: String end = null;
444: for (int i = 0; i < len - ng + 1; i++) {
445: String gram = text.substring(i, i + ng);
446: doc.add(new Field(key, gram, Field.Store.NO,
447: Field.Index.UN_TOKENIZED));
448: if (i == 0) {
449: doc.add(new Field("start" + ng, gram,
450: Field.Store.NO, Field.Index.UN_TOKENIZED));
451: }
452: end = gram;
453: }
454: if (end != null) { // may not be present if len==ng1
455: doc.add(new Field("end" + ng, end, Field.Store.NO,
456: Field.Index.UN_TOKENIZED));
457: }
458: }
459: }
460:
461: /**
462: * Closes the internal IndexReader.
463: */
464: protected void finalize() throws Throwable {
465: try {
466: if (reader != null) {
467: reader.close();
468: }
469: } finally {
470: super.finalize();
471: }
472: }
473: }
|