package org.apache.lucene.analysis.nl;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Analyzer for the Dutch language. Supports an external list of stopwords (words that
 * will not be indexed at all), an external list of exclusions (words that will
 * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
 * the algorithm (dictionary stemming).
 * A default set of stopwords is used unless an alternative list is specified; the
 * exclusion list is empty by default.
 *
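 * <p>
 * A minimal usage sketch (the field name and sample text are purely illustrative;
 * in practice the analyzer is usually handed to an IndexWriter or QueryParser):
 * <pre>
 *   Analyzer analyzer = new DutchAnalyzer();
 *   TokenStream stream = analyzer.tokenStream("content",
 *       new java.io.StringReader("Zij fietsen samen naar de stad"));
 * </pre>
 *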
 * @author Edwin de Jonge
 */
public class DutchAnalyzer extends Analyzer {
  /**
   * List of typical Dutch stopwords.
   */
  public final static String[] DUTCH_STOP_WORDS = {
    "de", "en", "van", "ik", "te", "dat", "die", "in", "een", "hij", "het",
    "niet", "zijn", "is", "was", "op", "aan", "met", "als",
    "voor", "had", "er", "maar", "om", "hem", "dan", "zou",
    "of", "wat", "mijn", "men", "dit", "zo", "door", "over",
    "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit",
    "der", "daar", "haar", "naar", "heb", "hoe", "heeft",
    "hebben", "deze", "u", "want", "nog", "zal", "me", "zij",
    "nu", "ge", "geen", "omdat", "iets", "worden", "toch",
    "al", "waren", "veel", "meer", "doen", "toen", "moet",
    "ben", "zonder", "kan", "hun", "dus", "alles", "onder",
    "ja", "eens", "hier", "wie", "werd", "altijd", "doch",
    "wordt", "wezen", "kunnen", "ons", "zelf", "tegen", "na",
    "reeds", "wil", "kon", "niets", "uw", "iemand", "geweest",
    "andere"
  };

  /**
   * Contains the stopwords used with the StopFilter.
   */
  private Set stoptable = new HashSet();

  /**
   * Contains words that should be indexed but not stemmed.
   */
  private Set excltable = new HashSet();

  /**
   * Contains word/stem pairs that overrule the stemming algorithm
   * (dictionary stemming).
   */
  private Map stemdict = new HashMap();

  /**
   * Builds an analyzer with the default stop words ({@link #DUTCH_STOP_WORDS})
   * and a few default entries for the stem override dictionary.
   */
  public DutchAnalyzer() {
    stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
    stemdict.put("fiets", "fiets");         // otherwise stemmed to "fiet"
    stemdict.put("bromfiets", "bromfiets"); // otherwise stemmed to "bromfiet"
    stemdict.put("ei", "eier");
    stemdict.put("kind", "kinder");
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param stopwords an array of stopwords to use instead of the default set
   */
  public DutchAnalyzer(String[] stopwords) {
    stoptable = StopFilter.makeStopSet(stopwords);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param stopwords a set of stopwords to use instead of the default set
   */
  public DutchAnalyzer(HashSet stopwords) {
    stoptable = stopwords;
  }

  /**
   * Builds an analyzer with the stop words read from the given file.
   *
   * @param stopwords a file containing the stopword list to use
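   *   The file is read with WordlistLoader.getWordSet (as in the constructor
   *   body below); as a sketch, a plain text file with one word per line is
   *   assumed, e.g.:
   *   <pre>
   *     de
   *     en
   *     van
   *   </pre>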
   */
  public DutchAnalyzer(File stopwords) {
    try {
      stoptable = org.apache.lucene.analysis.WordlistLoader
          .getWordSet(stopwords);
    } catch (IOException e) {
      // TODO: throw IOException
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds an exclusion list from an array of Strings.
   *
   * @param exclusionlist an array of words that should not be stemmed
   */
  public void setStemExclusionTable(String[] exclusionlist) {
    excltable = StopFilter.makeStopSet(exclusionlist);
  }

  /**
   * Builds an exclusion list from a HashSet.
   */
  public void setStemExclusionTable(HashSet exclusionlist) {
    excltable = exclusionlist;
  }

  /**
   * Builds an exclusion list from the words contained in the given file.
   */
  public void setStemExclusionTable(File exclusionlist) {
    try {
      excltable = org.apache.lucene.analysis.WordlistLoader
          .getWordSet(exclusionlist);
    } catch (IOException e) {
      // TODO: throw IOException
      throw new RuntimeException(e);
    }
  }

  /**
   * Reads a stem dictionary file that overrules the stemming algorithm.
   * This is a text file that contains one
   * <tt>word<b>\t</b>stem</tt> pair per line, i.e. two tab-separated words.
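   * <p>
   * For example, a dictionary file with the two entries below (mirroring two of
   * the defaults installed by the no-argument constructor) keeps "fiets" from
   * being stemmed to "fiet" and maps "ei" to the stem "eier":
   * <pre>
   *   fiets\tfiets
   *   ei\teier
   * </pre>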
   */
  public void setStemDictionary(File stemdictFile) {
    try {
      stemdict = org.apache.lucene.analysis.WordlistLoader
          .getStemDict(stemdictFile);
    } catch (IOException e) {
      // TODO: throw IOException
      throw new RuntimeException(e);
    }
  }

  /**
   * Creates a TokenStream that tokenizes all the text in the provided Reader.
   *
   * @return a TokenStream built from a StandardTokenizer filtered with
   *         StandardFilter, StopFilter and DutchStemFilter
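   * <p>
   * An illustrative way to consume the returned stream, assuming the classic
   * TokenStream.next()/Token.termText() API of this Lucene generation (the field
   * name and sample text are only examples):
   * <pre>
   *   TokenStream ts = new DutchAnalyzer().tokenStream("content",
   *       new java.io.StringReader("Hij heeft twee fietsen"));
   *   for (Token t = ts.next(); t != null; t = ts.next()) {
   *     System.out.println(t.termText());
   *   }
   * </pre>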
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new StopFilter(result, stoptable);
    result = new DutchStemFilter(result, excltable, stemdict);
    return result;
  }
}