package org.apache.lucene.analysis.de;

// This file is encoded in UTF-8

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Analyzer for the German language. Supports an external list of stopwords
 * (words that will not be indexed at all) and an external list of exclusions
 * (words that will not be stemmed, but indexed).
 * A default set of stopwords is used unless an alternative list is specified;
 * the exclusion list is empty by default.
 *
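 * <p>
 * A minimal usage sketch (the field name and sample text below are
 * illustrative; the loop assumes the Token-based TokenStream API of this
 * release):
 * </p>
 * <pre>
 *   Analyzer analyzer = new GermanAnalyzer();
 *   TokenStream stream =
 *       analyzer.tokenStream("content", new StringReader("Der Hund bellt"));
 *   Token token;
 *   while ((token = stream.next()) != null) {
 *     System.out.println(token.termText());
 *   }
 * </pre>
 *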
 * @version $Id: GermanAnalyzer.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class GermanAnalyzer extends Analyzer {

  /**
   * List of typical German stopwords.
   */
  public final static String[] GERMAN_STOP_WORDS = { "einer", "eine",
      "eines", "einem", "einen", "der", "die", "das", "dass",
      "daß", "du", "er", "sie", "es", "was", "wer", "wie",
      "wir", "und", "oder", "ohne", "mit", "am", "im", "in",
      "aus", "auf", "ist", "sein", "war", "wird", "ihr", "ihre",
      "ihres", "als", "für", "von", "dich", "dir", "mich",
      "mir", "mein", "kein", "durch", "wegen" };

  /**
   * Contains the stopwords used with the StopFilter.
   */
  private Set stopSet = new HashSet();

  /**
   * Contains words that should be indexed but not stemmed.
   */
  private Set exclusionSet = new HashSet();

  /**
   * Builds an analyzer with the default stop words
   * (<code>GERMAN_STOP_WORDS</code>).
   */
  public GermanAnalyzer() {
    stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
  }

  /**
   * Builds an analyzer with the given stop words.
   */
  public GermanAnalyzer(String[] stopwords) {
    stopSet = StopFilter.makeStopSet(stopwords);
  }

  /**
   * Builds an analyzer with the given stop words, taken from the keys
   * of the given Hashtable.
   */
  public GermanAnalyzer(Hashtable stopwords) {
    stopSet = new HashSet(stopwords.keySet());
  }

  /**
   * Builds an analyzer with the stop words read from the given file.
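   * <p>
   * The file is expected to contain one stop word per line, for example
   * (the words shown are illustrative, assuming WordlistLoader's
   * line-based parsing):
   * </p>
   * <pre>
   *   aber
   *   alle
   *   wenn
   * </pre>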
   */
  public GermanAnalyzer(File stopwords) throws IOException {
    stopSet = WordlistLoader.getWordSet(stopwords);
  }

  /**
   * Builds an exclusion list from an array of Strings.
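   * <p>
   * A brief sketch of the effect (the sample word is illustrative); entries
   * should be lower-case, since tokens reach the stem filter only after
   * LowerCaseFilter has been applied:
   * </p>
   * <pre>
   *   GermanAnalyzer analyzer = new GermanAnalyzer();
   *   analyzer.setStemExclusionTable(new String[] { "blumen" });
   *   // "blumen" is now indexed as-is instead of being stemmed
   * </pre>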
   */
  public void setStemExclusionTable(String[] exclusionlist) {
    exclusionSet = StopFilter.makeStopSet(exclusionlist);
  }

  /**
   * Builds an exclusion list from the keys of the given Hashtable.
   */
  public void setStemExclusionTable(Hashtable exclusionlist) {
    exclusionSet = new HashSet(exclusionlist.keySet());
  }

  /**
   * Builds an exclusion list from the words contained in the given file.
   */
  public void setStemExclusionTable(File exclusionlist)
      throws IOException {
    exclusionSet = WordlistLoader.getWordSet(exclusionlist);
  }

  /**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   *
   * @return A TokenStream built from a StandardTokenizer filtered with
   *         StandardFilter, LowerCaseFilter, StopFilter, and GermanStemFilter
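   * <p>
   * For example (the final stemmed forms are omitted here because they
   * depend on GermanStemmer, but the stop word removal works as shown):
   * </p>
   * <pre>
   *   "Der Hund und die Katze"
   *   StandardTokenizer : [Der] [Hund] [und] [die] [Katze]
   *   StandardFilter    : [Der] [Hund] [und] [die] [Katze]   (no change here)
   *   LowerCaseFilter   : [der] [hund] [und] [die] [katze]
   *   StopFilter        : [hund] [katze]   ("der", "und", "die" are stop words)
   *   GermanStemFilter  : stemmed forms of [hund] and [katze]
   * </pre>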
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // break the text into tokens on standard word boundaries
    TokenStream result = new StandardTokenizer(reader);
    // normalize tokens (e.g. remove 's and dots within acronyms)
    result = new StandardFilter(result);
    // lower-case before stop word matching; the stop set is lower-case
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    // stem the remaining tokens, leaving words in the exclusion set unstemmed
    result = new GermanStemFilter(result, exclusionSet);
    return result;
  }
}
|