001: package org.apache.lucene.analysis.cz;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2004 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: */
056:
057: import org.apache.lucene.analysis.Analyzer;
058: import org.apache.lucene.analysis.LowerCaseFilter;
059: import org.apache.lucene.analysis.StopFilter;
060: import org.apache.lucene.analysis.TokenStream;
061:
062: import org.apache.lucene.analysis.standard.StandardFilter;
063: import org.apache.lucene.analysis.standard.StandardTokenizer;
064:
065: import java.io.*;
066: import java.util.Hashtable;
067: import java.util.HashSet;
068: import java.util.Set;
069: import org.apache.lucene.analysis.de.WordlistLoader;
070:
071: /**
072: * Analyzer for Czech language. Supports an external list of stopwords (words that
073: * will not be indexed at all).
074: * A default set of stopwords is used unless an alternative list is specified, the
075: * exclusion list is empty by default.
076: *
077: * @author Lukas Zapletal [lzap@root.cz]
078: */
079: public final class CzechAnalyzer extends Analyzer {
080:
081: /**
082: * List of typical stopwords.
083: */
084: public final static String[] CZECH_STOP_WORDS = { "a", "s", "k",
085: "o", "i", "u", "v", "z", "dnes", "cz", "t\u00edmto",
086: "bude\u0161", "budem", "byli", "jse\u0161", "m\u016fj",
087: "sv\u00fdm", "ta", "tomto", "tohle", "tuto", "tyto", "jej",
088: "zda", "pro\u010d", "m\u00e1te", "tato", "kam", "tohoto",
089: "kdo", "kte\u0159\u00ed", "mi", "n\u00e1m", "tom",
090: "tomuto", "m\u00edt", "nic", "proto", "kterou", "byla",
091: "toho", "proto\u017ee", "asi", "ho", "na\u0161i",
092: "napi\u0161te", "re", "co\u017e", "t\u00edm", "tak\u017ee",
093: "sv\u00fdch", "jej\u00ed", "sv\u00fdmi", "jste", "aj",
094: "tu", "tedy", "teto", "bylo", "kde", "ke", "prav\u00e9",
095: "ji", "nad", "nejsou", "\u010di", "pod", "t\u00e9ma",
096: "mezi", "p\u0159es", "ty", "pak", "v\u00e1m", "ani",
097: "kdy\u017e", "v\u0161ak", "neg", "jsem", "tento",
098: "\u010dl\u00e1nku", "\u010dl\u00e1nky", "aby", "jsme",
099: "p\u0159ed", "pta", "jejich", "byl", "je\u0161t\u011b",
100: "a\u017e", "bez", "tak\u00e9", "pouze", "prvn\u00ed",
101: "va\u0161e", "kter\u00e1", "n\u00e1s", "nov\u00fd", "tipy",
102: "pokud", "m\u016f\u017ee", "strana", "jeho", "sv\u00e9",
103: "jin\u00e9", "zpr\u00e1vy", "nov\u00e9", "nen\u00ed",
104: "v\u00e1s", "jen", "podle", "zde", "u\u017e", "b\u00fdt",
105: "v\u00edce", "bude", "ji\u017e", "ne\u017e", "kter\u00fd",
106: "by", "kter\u00e9", "co", "nebo", "ten", "tak", "m\u00e1",
107: "p\u0159i", "od", "po", "jsou", "jak", "dal\u0161\u00ed",
108: "ale", "si", "se", "ve", "to", "jako", "za", "zp\u011bt",
109: "ze", "do", "pro", "je", "na", "atd", "atp", "jakmile",
110: "p\u0159i\u010dem\u017e", "j\u00e1", "on", "ona", "ono",
111: "oni", "ony", "my", "vy", "j\u00ed", "ji", "m\u011b",
112: "mne", "jemu", "tomu", "t\u011bm", "t\u011bmu",
113: "n\u011bmu", "n\u011bmu\u017e", "jeho\u017e",
114: "j\u00ed\u017e", "jeliko\u017e", "je\u017e", "jako\u017e",
115: "na\u010de\u017e", };
116:
117: /**
118: * Contains the stopwords used with the StopFilter.
119: */
120: private Set stoptable;
121:
122: /**
123: * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
124: */
125: public CzechAnalyzer() {
126: stoptable = StopFilter.makeStopSet(CZECH_STOP_WORDS);
127: }
128:
129: /**
130: * Builds an analyzer with the given stop words.
131: */
132: public CzechAnalyzer(String[] stopwords) {
133: stoptable = StopFilter.makeStopSet(stopwords);
134: }
135:
136: /**
137: * Builds an analyzer with the given stop words.
138: *
139: * @deprecated
140: */
141: public CzechAnalyzer(Hashtable stopwords) {
142: stoptable = new HashSet(stopwords.keySet());
143: }
144:
145: public CzechAnalyzer(HashSet stopwords) {
146: stoptable = stopwords;
147: }
148:
149: /**
150: * Builds an analyzer with the given stop words.
151: */
152: public CzechAnalyzer(File stopwords) throws IOException {
153: stoptable = WordlistLoader.getWordSet(stopwords);
154: }
155:
156: /**
157: * Loads stopwords hash from resource stream (file, database...).
158: * @param wordfile File containing the wordlist
159: * @param encoding Encoding used (win-1250, iso-8859-2, ...}, null for default system encoding
160: */
161: public void loadStopWords(InputStream wordfile, String encoding) {
162: if (wordfile == null) {
163: stoptable = new HashSet();
164: return;
165: }
166: try {
167: // clear any previous table (if present)
168: stoptable = new HashSet();
169:
170: InputStreamReader isr;
171: if (encoding == null)
172: isr = new InputStreamReader(wordfile);
173: else
174: isr = new InputStreamReader(wordfile, encoding);
175:
176: LineNumberReader lnr = new LineNumberReader(isr);
177: String word;
178: while ((word = lnr.readLine()) != null) {
179: stoptable.add(word);
180: }
181:
182: } catch (IOException e) {
183: stoptable = null;
184: }
185: }
186:
187: /**
188: * Creates a TokenStream which tokenizes all the text in the provided Reader.
189: *
190: * @return A TokenStream build from a StandardTokenizer filtered with
191: * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
192: */
193: public final TokenStream tokenStream(String fieldName, Reader reader) {
194: TokenStream result = new StandardTokenizer(reader);
195: result = new StandardFilter(result);
196: result = new LowerCaseFilter(result);
197: result = new StopFilter(result, stoptable);
198: return result;
199: }
200: }
|