001: package org.apache.lucene.analysis.cz;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Analyzer;
021: import org.apache.lucene.analysis.LowerCaseFilter;
022: import org.apache.lucene.analysis.StopFilter;
023: import org.apache.lucene.analysis.TokenStream;
024: import org.apache.lucene.analysis.WordlistLoader;
025: import org.apache.lucene.analysis.standard.StandardFilter;
026: import org.apache.lucene.analysis.standard.StandardTokenizer;
027:
028: import java.io.*;
029: import java.util.Hashtable;
030: import java.util.HashSet;
031: import java.util.Set;
032:
033: /**
034: * Analyzer for Czech language. Supports an external list of stopwords (words that
035: * will not be indexed at all).
036: * A default set of stopwords is used unless an alternative list is specified, the
037: * exclusion list is empty by default.
038: *
039: * @author Lukas Zapletal [lzap@root.cz]
040: */
041: public final class CzechAnalyzer extends Analyzer {
042:
043: /**
044: * List of typical stopwords.
045: */
046: public final static String[] CZECH_STOP_WORDS = { "a", "s", "k",
047: "o", "i", "u", "v", "z", "dnes", "cz", "t\u00edmto",
048: "bude\u0161", "budem", "byli", "jse\u0161", "m\u016fj",
049: "sv\u00fdm", "ta", "tomto", "tohle", "tuto", "tyto", "jej",
050: "zda", "pro\u010d", "m\u00e1te", "tato", "kam", "tohoto",
051: "kdo", "kte\u0159\u00ed", "mi", "n\u00e1m", "tom",
052: "tomuto", "m\u00edt", "nic", "proto", "kterou", "byla",
053: "toho", "proto\u017ee", "asi", "ho", "na\u0161i",
054: "napi\u0161te", "re", "co\u017e", "t\u00edm", "tak\u017ee",
055: "sv\u00fdch", "jej\u00ed", "sv\u00fdmi", "jste", "aj",
056: "tu", "tedy", "teto", "bylo", "kde", "ke", "prav\u00e9",
057: "ji", "nad", "nejsou", "\u010di", "pod", "t\u00e9ma",
058: "mezi", "p\u0159es", "ty", "pak", "v\u00e1m", "ani",
059: "kdy\u017e", "v\u0161ak", "neg", "jsem", "tento",
060: "\u010dl\u00e1nku", "\u010dl\u00e1nky", "aby", "jsme",
061: "p\u0159ed", "pta", "jejich", "byl", "je\u0161t\u011b",
062: "a\u017e", "bez", "tak\u00e9", "pouze", "prvn\u00ed",
063: "va\u0161e", "kter\u00e1", "n\u00e1s", "nov\u00fd", "tipy",
064: "pokud", "m\u016f\u017ee", "strana", "jeho", "sv\u00e9",
065: "jin\u00e9", "zpr\u00e1vy", "nov\u00e9", "nen\u00ed",
066: "v\u00e1s", "jen", "podle", "zde", "u\u017e", "b\u00fdt",
067: "v\u00edce", "bude", "ji\u017e", "ne\u017e", "kter\u00fd",
068: "by", "kter\u00e9", "co", "nebo", "ten", "tak", "m\u00e1",
069: "p\u0159i", "od", "po", "jsou", "jak", "dal\u0161\u00ed",
070: "ale", "si", "se", "ve", "to", "jako", "za", "zp\u011bt",
071: "ze", "do", "pro", "je", "na", "atd", "atp", "jakmile",
072: "p\u0159i\u010dem\u017e", "j\u00e1", "on", "ona", "ono",
073: "oni", "ony", "my", "vy", "j\u00ed", "ji", "m\u011b",
074: "mne", "jemu", "tomu", "t\u011bm", "t\u011bmu",
075: "n\u011bmu", "n\u011bmu\u017e", "jeho\u017e",
076: "j\u00ed\u017e", "jeliko\u017e", "je\u017e", "jako\u017e",
077: "na\u010de\u017e", };
078:
079: /**
080: * Contains the stopwords used with the StopFilter.
081: */
082: private Set stoptable;
083:
084: /**
085: * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
086: */
087: public CzechAnalyzer() {
088: stoptable = StopFilter.makeStopSet(CZECH_STOP_WORDS);
089: }
090:
091: /**
092: * Builds an analyzer with the given stop words.
093: */
094: public CzechAnalyzer(String[] stopwords) {
095: stoptable = StopFilter.makeStopSet(stopwords);
096: }
097:
098: public CzechAnalyzer(HashSet stopwords) {
099: stoptable = stopwords;
100: }
101:
102: /**
103: * Builds an analyzer with the given stop words.
104: */
105: public CzechAnalyzer(File stopwords) throws IOException {
106: stoptable = WordlistLoader.getWordSet(stopwords);
107: }
108:
109: /**
110: * Loads stopwords hash from resource stream (file, database...).
111: * @param wordfile File containing the wordlist
112: * @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
113: */
114: public void loadStopWords(InputStream wordfile, String encoding) {
115: if (wordfile == null) {
116: stoptable = new HashSet();
117: return;
118: }
119: try {
120: // clear any previous table (if present)
121: stoptable = new HashSet();
122:
123: InputStreamReader isr;
124: if (encoding == null)
125: isr = new InputStreamReader(wordfile);
126: else
127: isr = new InputStreamReader(wordfile, encoding);
128:
129: LineNumberReader lnr = new LineNumberReader(isr);
130: String word;
131: while ((word = lnr.readLine()) != null) {
132: stoptable.add(word);
133: }
134:
135: } catch (IOException e) {
136: stoptable = null;
137: }
138: }
139:
140: /**
141: * Creates a TokenStream which tokenizes all the text in the provided Reader.
142: *
143: * @return A TokenStream build from a StandardTokenizer filtered with
144: * StandardFilter, LowerCaseFilter, and StopFilter
145: */
146: public final TokenStream tokenStream(String fieldName, Reader reader) {
147: TokenStream result = new StandardTokenizer(reader);
148: result = new StandardFilter(result);
149: result = new LowerCaseFilter(result);
150: result = new StopFilter(result, stoptable);
151: return result;
152: }
153: }
|