package net.bagaluten.jca.lucene.connector.impl;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;

/**
 * A query suggester that spell checks each term of a query and builds
 * suggested alternative queries. Idea from Tom White
 * (http://today.java.net/pub/a/today/2005/08/09/didyoumean.html).
 *
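 * <p>
 * A minimal usage sketch; the field name, analyzer, directory path and the
 * number of suggestions below are placeholder choices, and the spell check
 * index is assumed to have been built beforehand with the Lucene
 * {@link SpellChecker}:
 * <pre>
 * Directory spellIndex = FSDirectory.getDirectory("/path/to/spellindex");
 * CompositeQuerySuggester suggester = new CompositeQuerySuggester(
 *         "contents", new StandardAnalyzer(), spellIndex, 3);
 * Set&lt;String&gt; suggestions = suggester.suggest("lucene serch");
 * </pre>
 *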
 * @author Achim Heiland
 *
 */
public class CompositeQuerySuggester extends QueryParser {

    /** number of similar words to suggest per term */
    private int numberOfSimilarWords = 1;

    /** index of the suggestion picked by getTerm(); set by suggest() */
    private int counter = 0;

    /** spell checker instance */
    private SpellChecker spellChecker;

    /**
     * @param field the suggested words are restricted to the words present in this field
     * @param analyzer the analyzer used to tokenize the query text
     * @param spellIndexDirectory the directory containing the spell check index
     * @param numberOfSimilarWords the number of similar words to suggest per term
     */
    public CompositeQuerySuggester(String field, Analyzer analyzer,
            Directory spellIndexDirectory, int numberOfSimilarWords) {
        super(field, analyzer);
        this.numberOfSimilarWords = numberOfSimilarWords;
        setDefaultOperator(QueryParser.AND_OPERATOR);
        spellChecker = new SpellChecker(spellIndexDirectory);
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.queryParser.QueryParser#getFieldQuery(java.lang.String, java.lang.String)
     */
    protected Query getFieldQuery(String field, String queryText)
            throws ParseException {
        // tokenize the query text with the configured analyzer
        TokenStream source = getAnalyzer().tokenStream(field,
                new StringReader(queryText));
        ArrayList<String> list = new ArrayList<String>();
        Token t;
        while (true) {
            try {
                t = source.next();
            } catch (IOException e) {
                // treat an I/O error like the end of the stream
                t = null;
            }
            if (t == null)
                break;
            list.add(t.termText());
        }
        try {
            source.close();
        } catch (IOException e) {
            // ignore
        }
        // each token is spell checked individually via getTerm();
        // a single token becomes a term query, several tokens a phrase query
        if (list.isEmpty())
            return null;
        else if (list.size() == 1)
            return new TermQuery(getTerm(field, list.get(0)));
        else {
            PhraseQuery q = new PhraseQuery();
            q.setSlop(getPhraseSlop());
            for (int i = 0; i < list.size(); i++) {
                q.add(getTerm(field, list.get(i)));
            }
            return q;
        }
    }

    /**
     * Returns a term for the given word, substituting a spell check
     * suggestion when the word is not present in the spell check index.
     *
     * @param field the field the term belongs to
     * @param queryText the word to check
     *
     * @return the original term, or the suggestion selected by the counter
     *
     * @throws ParseException if the spell check index cannot be read
     */
    private Term getTerm(String field, String queryText)
            throws ParseException {
        try {
            // the word exists in the spell check index, keep it as-is
            if (spellChecker.exist(queryText)) {
                return new Term(field, queryText);
            }
            String[] similarWords = spellChecker.suggestSimilar(
                    queryText, numberOfSimilarWords);
            // no suggestion available, keep the original word
            if (similarWords.length == 0) {
                return new Term(field, queryText);
            }
            // fewer suggestions than requested: fall back to the last one
            if (similarWords.length <= counter) {
                return new Term(field,
                        similarWords[similarWords.length - 1]);
            }
            return new Term(field, similarWords[counter]);
        } catch (IOException e) {
            throw new ParseException(e.getMessage());
        }
    }

    /**
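     * Spell checks the given query text and returns one suggested query
     * string per similarity rank, assembled from the suggested terms.
     * <p>
     * A sketch of a typical call, assuming a suggester configured as in the
     * class-level example:
     * <pre>
     * for (String suggestion : suggester.suggest("lucene serch")) {
     *     System.out.println(suggestion);
     * }
     * </pre>
     *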
     * @param words the word or words you want spell checked
     *
     * @return a set of suggested query strings, one per similarity rank
     *
     * @throws ParseException if the query cannot be parsed or the spell check index cannot be read
     */
    public Set<String> suggest(String words) throws ParseException {
        LinkedHashSet<String> set = new LinkedHashSet<String>();
        LinkedHashSet<Term> terms = new LinkedHashSet<Term>();
        StringBuilder sb = new StringBuilder();
        // parse the query once per similarity rank; the counter field steers
        // getTerm() towards the counter-th suggestion for each misspelled word
        for (counter = 0; counter < numberOfSimilarWords; counter++) {
            terms.clear();
            Query q = super.parse(words);
            q.extractTerms(terms);
            // rebuild a query string from the (possibly corrected) terms
            sb.setLength(0);
            for (Term term : terms) {
                if (sb.length() != 0) {
                    sb.append(" ");
                }
                sb.append(term.text());
            }
            set.add(sb.toString());
        }
        return set;
    }

}