001: package org.apache.lucene.analysis.standard;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.*;
021:
022: import java.io.File;
023: import java.io.IOException;
024: import java.io.Reader;
025: import java.util.Set;
026:
027: /**
028: * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
029: * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
030: *
031: * @version $Id: StandardAnalyzer.java 613280 2008-01-18 21:27:10Z gsingers $
032: */
033: public class StandardAnalyzer extends Analyzer {
034: private Set stopSet;
035:
036: /**
037: * Specifies whether deprecated acronyms should be replaced with HOST type.
038: * This is false by default to support backward compatibility.
039: *
040: * @deprecated this should be removed in the next release (3.0).
041: *
042: * See https://issues.apache.org/jira/browse/LUCENE-1068
043: */
044: private boolean replaceInvalidAcronym = false;
045:
046: /** An array containing some common English words that are usually not
047: useful for searching. */
048: public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
049:
050: /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
051: public StandardAnalyzer() {
052: this (STOP_WORDS);
053: }
054:
055: /** Builds an analyzer with the given stop words. */
056: public StandardAnalyzer(Set stopWords) {
057: stopSet = stopWords;
058: }
059:
060: /** Builds an analyzer with the given stop words. */
061: public StandardAnalyzer(String[] stopWords) {
062: stopSet = StopFilter.makeStopSet(stopWords);
063: }
064:
065: /** Builds an analyzer with the stop words from the given file.
066: * @see WordlistLoader#getWordSet(File)
067: */
068: public StandardAnalyzer(File stopwords) throws IOException {
069: stopSet = WordlistLoader.getWordSet(stopwords);
070: }
071:
072: /** Builds an analyzer with the stop words from the given reader.
073: * @see WordlistLoader#getWordSet(Reader)
074: */
075: public StandardAnalyzer(Reader stopwords) throws IOException {
076: stopSet = WordlistLoader.getWordSet(stopwords);
077: }
078:
079: /**
080: *
081: * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
082: *
083: * See https://issues.apache.org/jira/browse/LUCENE-1068
084: *
085: * @deprecated Remove in 3.X and make true the only valid value
086: */
087: public StandardAnalyzer(boolean replaceInvalidAcronym) {
088: this (STOP_WORDS);
089: this .replaceInvalidAcronym = replaceInvalidAcronym;
090: }
091:
092: /**
093: * @param stopwords The stopwords to use
094: * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
095: *
096: * See https://issues.apache.org/jira/browse/LUCENE-1068
097: *
098: * @deprecated Remove in 3.X and make true the only valid value
099: */
100: public StandardAnalyzer(Reader stopwords,
101: boolean replaceInvalidAcronym) throws IOException {
102: this (stopwords);
103: this .replaceInvalidAcronym = replaceInvalidAcronym;
104: }
105:
106: /**
107: * @param stopwords The stopwords to use
108: * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
109: *
110: * See https://issues.apache.org/jira/browse/LUCENE-1068
111: *
112: * @deprecated Remove in 3.X and make true the only valid value
113: */
114: public StandardAnalyzer(File stopwords,
115: boolean replaceInvalidAcronym) throws IOException {
116: this (stopwords);
117: this .replaceInvalidAcronym = replaceInvalidAcronym;
118: }
119:
120: /**
121: *
122: * @param stopwords The stopwords to use
123: * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
124: *
125: * See https://issues.apache.org/jira/browse/LUCENE-1068
126: *
127: * @deprecated Remove in 3.X and make true the only valid value
128: */
129: public StandardAnalyzer(String[] stopwords,
130: boolean replaceInvalidAcronym) throws IOException {
131: this (stopwords);
132: this .replaceInvalidAcronym = replaceInvalidAcronym;
133: }
134:
135: /**
136: * @param stopwords The stopwords to use
137: * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
138: *
139: * See https://issues.apache.org/jira/browse/LUCENE-1068
140: *
141: * @deprecated Remove in 3.X and make true the only valid value
142: */
143: public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym)
144: throws IOException {
145: this (stopwords);
146: this .replaceInvalidAcronym = replaceInvalidAcronym;
147: }
148:
149: /** Constructs a {@link StandardTokenizer} filtered by a {@link
150: StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
151: public TokenStream tokenStream(String fieldName, Reader reader) {
152: StandardTokenizer tokenStream = new StandardTokenizer(reader,
153: replaceInvalidAcronym);
154: tokenStream.setMaxTokenLength(maxTokenLength);
155: TokenStream result = new StandardFilter(tokenStream);
156: result = new LowerCaseFilter(result);
157: result = new StopFilter(result, stopSet);
158: return result;
159: }
160:
161: private static final class SavedStreams {
162: StandardTokenizer tokenStream;
163: TokenStream filteredTokenStream;
164: }
165:
166: /** Default maximum allowed token length */
167: public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
168:
169: private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
170:
171: /**
172: * Set maximum allowed token length. If a token is seen
173: * that exceeds this length then it is discarded. This
174: * setting only takes effect the next time tokenStream or
175: * reusableTokenStream is called.
176: */
177: public void setMaxTokenLength(int length) {
178: maxTokenLength = length;
179: }
180:
181: /**
182: * @see #setMaxTokenLength
183: */
184: public int getMaxTokenLength() {
185: return maxTokenLength;
186: }
187:
188: public TokenStream reusableTokenStream(String fieldName,
189: Reader reader) throws IOException {
190: SavedStreams streams = (SavedStreams) getPreviousTokenStream();
191: if (streams == null) {
192: streams = new SavedStreams();
193: setPreviousTokenStream(streams);
194: streams.tokenStream = new StandardTokenizer(reader);
195: streams.filteredTokenStream = new StandardFilter(
196: streams.tokenStream);
197: streams.filteredTokenStream = new LowerCaseFilter(
198: streams.filteredTokenStream);
199: streams.filteredTokenStream = new StopFilter(
200: streams.filteredTokenStream, stopSet);
201: } else {
202: streams.tokenStream.reset(reader);
203: }
204: streams.tokenStream.setMaxTokenLength(maxTokenLength);
205:
206: streams.tokenStream
207: .setReplaceInvalidAcronym(replaceInvalidAcronym);
208:
209: return streams.filteredTokenStream;
210: }
211:
212: /**
213: *
214: * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
215: *
216: * See https://issues.apache.org/jira/browse/LUCENE-1068
217: */
218: public boolean isReplaceInvalidAcronym() {
219: return replaceInvalidAcronym;
220: }
221:
222: /**
223: *
224: * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
225: *
226: * See https://issues.apache.org/jira/browse/LUCENE-1068
227: */
228: public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
229: this.replaceInvalidAcronym = replaceInvalidAcronym;
230: }
231: }
|