package org.apache.lucene.queryParser.analyzing;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;

/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and
 * WildcardQuerys are also passed through the given analyzer, while wildcard
 * characters (like <code>*</code>) are preserved in the search terms.
 *
 * <p><b>Warning:</b> This class should only be used with analyzers that neither
 * remove tokens (e.g. stopword filters) nor add tokens. Several stemming analyzers
 * are also inappropriate: for example, GermanAnalyzer will turn <code>Häuser</code>
 * into <code>hau</code>, but <code>H?user</code> will become <code>h?user</code>
 * when using this parser, so no match would be found (i.e. using this parser will
 * be no improvement over QueryParser in such cases).
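 *
 * <p>Example usage (a minimal sketch; with an analyzer like
 * <code>StandardAnalyzer</code> the wildcard term below is lower-cased while the
 * wildcard character itself survives analysis):
 * <pre>
 *   QueryParser parser = new AnalyzingQueryParser("body", new StandardAnalyzer());
 *   Query query = parser.parse("H?user");  // WildcardQuery body:h?user
 * </pre>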
 *
 * @author Ronnie Kolehmainen (ronnie.kolehmainen at ub.uu.se)
 * @version $Revision$, $Date$
 */
public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {

  /**
   * Constructs a query parser.
   *
   * @param field    the default field for query terms
   * @param analyzer used to find terms in the query text
   */
  public AnalyzingQueryParser(String field, Analyzer analyzer) {
    super(field, analyzer);
  }

  /**
   * Called when the parser parses an input term token that contains one or more
   * wildcard characters (like <code>*</code>), but is not a prefix term token
   * (one that has just a single <code>*</code> character at the end).
   * <p>
   * Example: will be called for <code>H?user</code> or for <code>H*user</code>,
   * but not for <code>*user</code>.
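   * <p>
   * For example, assuming a lower-casing analyzer that splits the term at the
   * wildcard (a sketch; the exact tokens depend on the analyzer in use):
   * <pre>
   *   getWildcardQuery("body", "H?user")
   *   // the text runs ["H", "user"] analyze to ["h", "user"] and are
   *   // reassembled around the wildcard: WildcardQuery body:h?user
   * </pre>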
   * <p>
   * Depending on analyzer and settings, a wildcard term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param field   name of the field the query will use
   * @param termStr term token that contains one or more wildcard characters
   *                (<code>?</code> or <code>*</code>), but is not a simple prefix term
   * @return the resulting {@link Query} built for the term
   * @throws ParseException if the analyzer added or consumed tokens
   */
  protected Query getWildcardQuery(String field, String termStr)
      throws ParseException {
    List tlist = new ArrayList(); // runs of non-wildcard text
    List wlist = new ArrayList(); // runs of wildcard characters
    /* somewhat of a hack: find and store the wildcard characters
     * in order to put them back after analyzing */
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
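    /* e.g. "wi*ld*card" is split into
     *   tlist = ["wi", "ld", "card"]   (text runs, analyzed below)
     *   wlist = ["*", "*"]             (wildcard runs, re-inserted afterwards) */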
    StringBuffer tmpBuffer = new StringBuffer();
    char[] chars = termStr.toCharArray();
    for (int i = 0; i < termStr.length(); i++) {
      if (chars[i] == '?' || chars[i] == '*') {
        if (isWithinToken) {
          tlist.add(tmpBuffer.toString());
          tmpBuffer.setLength(0);
        }
        isWithinToken = false;
      } else {
        if (!isWithinToken) {
          wlist.add(tmpBuffer.toString());
          tmpBuffer.setLength(0);
        }
        isWithinToken = true;
      }
      tmpBuffer.append(chars[i]);
    }
    if (isWithinToken) {
      tlist.add(tmpBuffer.toString());
    } else {
      wlist.add(tmpBuffer.toString());
    }

    // get the analyzer from the superclass and run the term through it
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
    org.apache.lucene.analysis.Token t;

    int countTokens = 0;
    while (true) {
      try {
        t = source.next();
      } catch (IOException e) {
        t = null;
      }
      if (t == null) {
        break;
      }
      if (!"".equals(t.termText())) {
        try {
          // overwrite each stored text run with its analyzed form
          tlist.set(countTokens++, t.termText());
        } catch (IndexOutOfBoundsException ioobe) {
          // more tokens than text runs: mark the mismatch with a sentinel
          countTokens = -1;
        }
      }
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }

    if (countTokens != tlist.size()) {
      /* this means that the analyzer either added or consumed tokens
       * (the latter is common for a stemmer), and we can't build a WildcardQuery */
      throw new ParseException("Cannot build WildcardQuery with analyzer "
          + getAnalyzer().getClass() + " - tokens added or lost");
    }

    if (tlist.size() == 0) {
      return null;
    } else if (tlist.size() == 1) {
      if (wlist != null && wlist.size() == 1) {
        /* if wlist contains one wildcard, it must be at the end, because:
         * 1) wildcards are not allowed in 1st position of a term by QueryParser
         * 2) if the wildcard were *not* at the end, there would be *two* or more tokens */
        return super.getWildcardQuery(field,
            (String) tlist.get(0) + (String) wlist.get(0));
      } else {
        /* we should never get here! if so, this method was called
         * with a termStr containing no wildcard ... */
        throw new IllegalArgumentException("getWildcardQuery called without wildcard");
      }
    } else {
      /* the term was tokenized, let's rebuild it as one token
       * with the wildcards put back in position */
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < tlist.size(); i++) {
        sb.append((String) tlist.get(i));
        if (wlist != null && wlist.size() > i) {
          sb.append((String) wlist.get(i));
        }
      }
      return super.getWildcardQuery(field, sb.toString());
    }
  }

  /**
   * Called when the parser parses an input term token that uses prefix notation,
   * i.e. contains a single <code>*</code> wildcard character as its last character.
   * Since this is a special case of a generic wildcard term, and such a query can
   * be optimized easily, this usually results in a different query object.
   * <p>
   * Depending on analyzer and settings, a prefix term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
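   * <p>
   * For example (a sketch, assuming a lower-casing analyzer that keeps the term
   * in one piece):
   * <pre>
   *   getPrefixQuery("body", "Foo")  // parsed from the input "Foo*"
   *   // analyzes to the single token "foo": PrefixQuery body:foo*
   * </pre>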
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param field   name of the field the query will use
   * @param termStr term token to use for building the term for the query
   *                (<b>without</b> the trailing <code>*</code> character!)
   * @return the resulting {@link Query} built for the term
   * @throws ParseException if the analyzer added or consumed tokens
   */
  protected Query getPrefixQuery(String field, String termStr)
      throws ParseException {
    // get the analyzer from the superclass and tokenize the term
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
    List tlist = new ArrayList();
    org.apache.lucene.analysis.Token t;

    while (true) {
      try {
        t = source.next();
      } catch (IOException e) {
        t = null;
      }
      if (t == null) {
        break;
      }
      tlist.add(t.termText());
    }

    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }

    if (tlist.size() == 1) {
      return super.getPrefixQuery(field, (String) tlist.get(0));
    } else {
      /* this means that the analyzer either added or consumed tokens
       * (the latter is common for a stemmer), and we can't build a PrefixQuery */
      throw new ParseException("Cannot build PrefixQuery with analyzer "
          + getAnalyzer().getClass()
          + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
    }
  }

  /**
   * Called when the parser parses an input term token that has the fuzzy suffix
   * (<code>~</code>) appended.
   * <p>
   * Depending on analyzer and settings, a fuzzy term may (most probably will)
   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
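   * <p>
   * For example (a sketch; the resulting token depends on the analyzer, and a
   * stemming analyzer could shorten the term further):
   * <pre>
   *   getFuzzyQuery("body", "Roam", 0.5f)  // parsed from the input "Roam~0.5"
   *   // a lower-casing analyzer yields the token "roam": FuzzyQuery body:roam~0.5
   * </pre>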
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param field         name of the field the query will use
   * @param termStr       term token to use for building the term for the query
   * @param minSimilarity minimum similarity required for a match
   * @return the resulting {@link Query} built for the term
   * @throws ParseException if the analyzer added tokens
   */
  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
      throws ParseException {
    // get the analyzer from the superclass and tokenize the term
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
    org.apache.lucene.analysis.Token t;
    boolean multipleTokens = false;

    try {
      t = source.next();
      multipleTokens = source.next() != null;
    } catch (IOException e) {
      t = null;
    }

    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }

    if (multipleTokens) {
      throw new ParseException("Cannot build FuzzyQuery with analyzer "
          + getAnalyzer().getClass() + " - tokens were added");
    }

    return (t == null) ? null : super.getFuzzyQuery(field, t.termText(), minSimilarity);
  }

  /**
   * Overrides super class, by passing both range terms through the analyzer.
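   * <p>
   * For example (a sketch; with a lower-casing analyzer both endpoints are
   * normalized before the range query is built):
   * <pre>
   *   getRangeQuery("title", "Alpha", "Omega", true)  // parsed from "[Alpha TO Omega]"
   *   // both endpoints analyze to single tokens: RangeQuery title:[alpha TO omega]
   * </pre>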
   *
   * @param field     name of the field the query will use
   * @param part1     lower term of the range
   * @param part2     upper term of the range
   * @param inclusive whether the endpoints are included in the range
   * @return the resulting {@link Query} built for the range
   * @throws ParseException if the analyzer added tokens to either range term
   */
  protected Query getRangeQuery(String field, String part1, String part2,
      boolean inclusive) throws ParseException {
    // get the analyzer from the superclass and tokenize both range terms
    TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
    org.apache.lucene.analysis.Token t;
    boolean multipleTokens = false;

    // part1
    try {
      t = source.next();
      if (t != null) {
        part1 = t.termText();
      }
      multipleTokens = source.next() != null;
    } catch (IOException e) {
      t = null;
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }
    if (multipleTokens) {
      throw new ParseException("Cannot build RangeQuery with analyzer "
          + getAnalyzer().getClass() + " - tokens were added to part1");
    }

    // part2
    source = getAnalyzer().tokenStream(field, new StringReader(part2));
    try {
      t = source.next();
      if (t != null) {
        part2 = t.termText();
      }
      multipleTokens = source.next() != null;
    } catch (IOException e) {
      t = null;
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }
    if (multipleTokens) {
      throw new ParseException("Cannot build RangeQuery with analyzer "
          + getAnalyzer().getClass() + " - tokens were added to part2");
    }
    return super.getRangeQuery(field, part1, part2, inclusive);
  }

}
|