001: package org.apache.lucene.analysis.fr;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Analyzer;
021: import org.apache.lucene.analysis.LowerCaseFilter;
022: import org.apache.lucene.analysis.StopFilter;
023: import org.apache.lucene.analysis.TokenStream;
024: import org.apache.lucene.analysis.WordlistLoader;
025: import org.apache.lucene.analysis.standard.StandardFilter;
026: import org.apache.lucene.analysis.standard.StandardTokenizer;
027:
028: import java.io.File;
029: import java.io.IOException;
030: import java.io.Reader;
031: import java.util.HashSet;
032: import java.util.Hashtable;
033: import java.util.Set;
034:
035: /**
036: * Analyzer for French language. Supports an external list of stopwords (words that
037: * will not be indexed at all) and an external list of exclusions (word that will
038: * not be stemmed, but indexed).
039: * A default set of stopwords is used unless an alternative list is specified, the
040: * exclusion list is empty by default.
041: *
042: *
043: * @version $Id: FrenchAnalyzer.java 564236 2007-08-09 15:21:19Z gsingers $
044: */
045: public final class FrenchAnalyzer extends Analyzer {
046:
047: /**
048: * Extended list of typical French stopwords.
049: */
050: public final static String[] FRENCH_STOP_WORDS = { "a", "afin",
051: "ai", "ainsi", "après", "attendu", "au", "aujourd",
052: "auquel", "aussi", "autre", "autres", "aux", "auxquelles",
053: "auxquels", "avait", "avant", "avec", "avoir", "c", "car",
054: "ce", "ceci", "cela", "celle", "celles", "celui",
055: "cependant", "certain", "certaine", "certaines",
056: "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
057: "combien", "comme", "comment", "concernant", "contre", "d",
058: "dans", "de", "debout", "dedans", "dehors", "delà",
059: "depuis", "derrière", "des", "désormais", "desquelles",
060: "desquels", "dessous", "dessus", "devant", "devers",
061: "devra", "divers", "diverse", "diverses", "doit", "donc",
062: "dont", "du", "duquel", "durant", "dès", "elle", "elles",
063: "en", "entre", "environ", "est", "et", "etc", "etre", "eu",
064: "eux", "excepté", "hormis", "hors", "hélas", "hui", "il",
065: "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
066: "le", "lequel", "les", "lesquelles", "lesquels", "leur",
067: "leurs", "lorsque", "lui", "là", "ma", "mais", "malgré",
068: "me", "merci", "mes", "mien", "mienne", "miennes", "miens",
069: "moi", "moins", "mon", "moyennant", "même", "mêmes", "n",
070: "ne", "ni", "non", "nos", "notre", "nous", "néanmoins",
071: "nôtre", "nôtres", "on", "ont", "ou", "outre", "où",
072: "par", "parmi", "partant", "pas", "passé", "pendant",
073: "plein", "plus", "plusieurs", "pour", "pourquoi", "proche",
074: "près", "puisque", "qu", "quand", "que", "quel", "quelle",
075: "quelles", "quels", "qui", "quoi", "quoique", "revoici",
076: "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
077: "seront", "ses", "si", "sien", "sienne", "siennes",
078: "siens", "sinon", "soi", "soit", "son", "sont", "sous",
079: "suivant", "sur", "ta", "te", "tes", "tien", "tienne",
080: "tiennes", "tiens", "toi", "ton", "tous", "tout", "toute",
081: "toutes", "tu", "un", "une", "va", "vers", "voici",
082: "voilà", "vos", "votre", "vous", "vu", "vôtre",
083: "vôtres", "y", "à", "ça", "ès", "été", "être", "ô" };
084:
085: /**
086: * Contains the stopwords used with the StopFilter.
087: */
088: private Set stoptable = new HashSet();
089: /**
090: * Contains words that should be indexed but not stemmed.
091: */
092: private Set excltable = new HashSet();
093:
094: /**
095: * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
096: */
097: public FrenchAnalyzer() {
098: stoptable = StopFilter.makeStopSet(FRENCH_STOP_WORDS);
099: }
100:
101: /**
102: * Builds an analyzer with the given stop words.
103: */
104: public FrenchAnalyzer(String[] stopwords) {
105: stoptable = StopFilter.makeStopSet(stopwords);
106: }
107:
108: /**
109: * Builds an analyzer with the given stop words.
110: * @throws IOException
111: */
112: public FrenchAnalyzer(File stopwords) throws IOException {
113: stoptable = new HashSet(WordlistLoader.getWordSet(stopwords));
114: }
115:
116: /**
117: * Builds an exclusionlist from an array of Strings.
118: */
119: public void setStemExclusionTable(String[] exclusionlist) {
120: excltable = StopFilter.makeStopSet(exclusionlist);
121: }
122:
123: /**
124: * Builds an exclusionlist from a Hashtable.
125: */
126: public void setStemExclusionTable(Hashtable exclusionlist) {
127: excltable = new HashSet(exclusionlist.keySet());
128: }
129:
130: /**
131: * Builds an exclusionlist from the words contained in the given file.
132: * @throws IOException
133: */
134: public void setStemExclusionTable(File exclusionlist)
135: throws IOException {
136: excltable = new HashSet(WordlistLoader
137: .getWordSet(exclusionlist));
138: }
139:
140: /**
141: * Creates a TokenStream which tokenizes all the text in the provided Reader.
142: *
143: * @return A TokenStream build from a StandardTokenizer filtered with
144: * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
145: */
146: public final TokenStream tokenStream(String fieldName, Reader reader) {
147:
148: if (fieldName == null)
149: throw new IllegalArgumentException(
150: "fieldName must not be null");
151: if (reader == null)
152: throw new IllegalArgumentException(
153: "reader must not be null");
154:
155: TokenStream result = new StandardTokenizer(reader);
156: result = new StandardFilter(result);
157: result = new StopFilter(result, stoptable);
158: result = new FrenchStemFilter(result, excltable);
159: // Convert to lowercase after stemming!
160: result = new LowerCaseFilter(result);
161: return result;
162: }
163: }
|