01: package org.apache.lucene.analysis;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import java.io.File;
21: import java.io.IOException;
22: import java.io.Reader;
23: import java.util.Set;
24:
25: /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
26:
27: public final class StopAnalyzer extends Analyzer {
28: private Set stopWords;
29:
30: /** An array containing some common English words that are not usually useful
31: for searching. */
32: public static final String[] ENGLISH_STOP_WORDS = { "a", "an",
33: "and", "are", "as", "at", "be", "but", "by", "for", "if",
34: "in", "into", "is", "it", "no", "not", "of", "on", "or",
35: "such", "that", "the", "their", "then", "there", "these",
36: "they", "this", "to", "was", "will", "with" };
37:
38: /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
39: public StopAnalyzer() {
40: stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
41: }
42:
43: /** Builds an analyzer with the stop words from the given set.
44: */
45: public StopAnalyzer(Set stopWords) {
46: this .stopWords = stopWords;
47: }
48:
49: /** Builds an analyzer which removes words in the provided array. */
50: public StopAnalyzer(String[] stopWords) {
51: this .stopWords = StopFilter.makeStopSet(stopWords);
52: }
53:
54: /** Builds an analyzer with the stop words from the given file.
55: * @see WordlistLoader#getWordSet(File)
56: */
57: public StopAnalyzer(File stopwordsFile) throws IOException {
58: stopWords = WordlistLoader.getWordSet(stopwordsFile);
59: }
60:
61: /** Builds an analyzer with the stop words from the given reader.
62: * @see WordlistLoader#getWordSet(Reader)
63: */
64: public StopAnalyzer(Reader stopwords) throws IOException {
65: stopWords = WordlistLoader.getWordSet(stopwords);
66: }
67:
68: /** Filters LowerCaseTokenizer with StopFilter. */
69: public TokenStream tokenStream(String fieldName, Reader reader) {
70: return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
71: }
72:
73: /** Filters LowerCaseTokenizer with StopFilter. */
74: private class SavedStreams {
75: Tokenizer source;
76: TokenStream result;
77: };
78:
79: public TokenStream reusableTokenStream(String fieldName,
80: Reader reader) throws IOException {
81: SavedStreams streams = (SavedStreams) getPreviousTokenStream();
82: if (streams == null) {
83: streams = new SavedStreams();
84: streams.source = new LowerCaseTokenizer(reader);
85: streams.result = new StopFilter(streams.source, stopWords);
86: setPreviousTokenStream(streams);
87: } else
88: streams.source.reset(reader);
89: return streams.result;
90: }
91: }
|