01: package org.apache.lucene.analysis.cjk;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import org.apache.lucene.analysis.Analyzer;
21: import org.apache.lucene.analysis.StopFilter;
22: import org.apache.lucene.analysis.TokenStream;
23:
24: import java.io.Reader;
25: import java.util.Set;
26:
27: /**
28: * Filters CJKTokenizer with StopFilter.
29: *
30: * @author Che, Dong
31: */
32: public class CJKAnalyzer extends Analyzer {
33: //~ Static fields/initializers ---------------------------------------------
34:
35: /**
36: * An array containing some common English words that are not usually
37: * useful for searching and some double-byte interpunctions.
38: */
39: public final static String[] STOP_WORDS = { "a", "and", "are",
40: "as", "at", "be", "but", "by", "for", "if", "in", "into",
41: "is", "it", "no", "not", "of", "on", "or", "s", "such",
42: "t", "that", "the", "their", "then", "there", "these",
43: "they", "this", "to", "was", "will", "with", "", "www" };
44:
45: //~ Instance fields --------------------------------------------------------
46:
47: /**
48: * stop word list
49: */
50: private Set stopTable;
51:
52: //~ Constructors -----------------------------------------------------------
53:
54: /**
55: * Builds an analyzer which removes words in {@link #STOP_WORDS}.
56: */
57: public CJKAnalyzer() {
58: stopTable = StopFilter.makeStopSet(STOP_WORDS);
59: }
60:
61: /**
62: * Builds an analyzer which removes words in the provided array.
63: *
64: * @param stopWords stop word array
65: */
66: public CJKAnalyzer(String[] stopWords) {
67: stopTable = StopFilter.makeStopSet(stopWords);
68: }
69:
70: //~ Methods ----------------------------------------------------------------
71:
72: /**
73: * get token stream from input
74: *
75: * @param fieldName lucene field name
76: * @param reader input reader
77: * @return TokenStream
78: */
79: public final TokenStream tokenStream(String fieldName, Reader reader) {
80: return new StopFilter(new CJKTokenizer(reader), stopTable);
81: }
82: }
|