001: package org.apache.lucene.analysis.fr;
002:
003: /* ====================================================================
004:
005: * The Apache Software License, Version 1.1
006:
007: *
008:
009: * Copyright (c) 2004 The Apache Software Foundation. All rights
010:
011: * reserved.
012:
013: *
014:
015: * Redistribution and use in source and binary forms, with or without
016:
017: * modification, are permitted provided that the following conditions
018:
019: * are met:
020:
021: *
022:
023: * 1. Redistributions of source code must retain the above copyright
024:
025: * notice, this list of conditions and the following disclaimer.
026:
027: *
028:
029: * 2. Redistributions in binary form must reproduce the above copyright
030:
031: * notice, this list of conditions and the following disclaimer in
032:
033: * the documentation and/or other materials provided with the
034:
035: * distribution.
036:
037: *
038:
039: * 3. The end-user documentation included with the redistribution,
040:
041: * if any, must include the following acknowledgment:
042:
043: * "This product includes software developed by the
044:
045: * Apache Software Foundation (http://www.apache.org/)."
046:
047: * Alternately, this acknowledgment may appear in the software itself,
048:
049: * if and wherever such third-party acknowledgments normally appear.
050:
051: *
052:
053: * 4. The names "Apache" and "Apache Software Foundation" and
054:
055: * "Apache Lucene" must not be used to endorse or promote products
056:
057: * derived from this software without prior written permission. For
058:
059: * written permission, please contact apache@apache.org.
060:
061: *
062:
063: * 5. Products derived from this software may not be called "Apache",
064:
065: * "Apache Lucene", nor may "Apache" appear in their name, without
066:
067: * prior written permission of the Apache Software Foundation.
068:
069: *
070:
071: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
072:
073: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
074:
075: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
076:
077: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
078:
079: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
080:
081: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
082:
083: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
084:
085: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
086:
087: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
088:
089: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
090:
091: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
092:
093: * SUCH DAMAGE.
094:
095: * ====================================================================
096:
097: *
098:
099: * This software consists of voluntary contributions made by many
100:
101: * individuals on behalf of the Apache Software Foundation. For more
102:
103: * information on the Apache Software Foundation, please see
104:
105: * <http://www.apache.org/>.
106:
107: */
108:
109: import org.apache.lucene.analysis.Analyzer;
110:
111: import org.apache.lucene.analysis.LowerCaseFilter;
112:
113: import org.apache.lucene.analysis.StopFilter;
114:
115: import org.apache.lucene.analysis.TokenStream;
116:
117: //import org.apache.lucene.analysis.WordlistLoader;
118:
119: import org.apache.lucene.analysis.standard.StandardFilter;
120:
121: import org.apache.lucene.analysis.standard.StandardTokenizer;
122:
123: import java.io.File;
124:
125: import java.io.IOException;
126:
127: import java.io.Reader;
128:
129: import java.util.HashSet;
130:
131: import java.util.Hashtable;
132:
133: import java.util.Set;
134:
135: import org.apache.lucene.analysis.de.WordlistLoader;
136:
137: /**
138:
139: * Analyzer for french language. Supports an external list of stopwords (words that
140:
141: * will not be indexed at all) and an external list of exclusions (word that will
142:
143: * not be stemmed, but indexed).
144:
145: * A default set of stopwords is used unless an other list is specified, the
146:
147: * exclusionlist is empty by default.
148:
149: *
150:
151: * @author Patrick Talbot (based on Gerhard Schwarz work for German)
152:
153: * @version $Id: FrenchAnalyzer.java,v 1.1 2005/06/02 01:36:00 jfendler Exp $
154:
155: */
156:
157: public final class FrenchAnalyzer
158:
159: extends Analyzer {
160:
161: /**
162:
163: * Extended list of typical french stopwords.
164:
165: */
166:
167: public final static String[] FRENCH_STOP_WORDS = {
168:
169: "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd",
170: "auquel",
171:
172: "aussi", "autre", "autres", "aux", "auxquelles",
173: "auxquels", "avait", "avant",
174:
175: "avec", "avoir", "c", "car", "ce", "ceci", "cela", "celle",
176: "celles", "celui", "cependant",
177:
178: "certain", "certaine", "certaines", "certains", "ces",
179: "cet", "cette", "ceux",
180:
181: "chez", "ci", "combien", "comme", "comment", "concernant",
182: "contre", "d", "dans",
183:
184: "de", "debout", "dedans", "dehors", "delà", "depuis",
185: "derrière", "des", "désormais",
186:
187: "desquelles", "desquels", "dessous", "dessus", "devant",
188: "devers", "devra",
189:
190: "divers", "diverse", "diverses", "doit", "donc", "dont",
191: "du", "duquel", "durant", "dès",
192:
193: "elle", "elles", "en", "entre", "environ", "est", "et",
194: "etc", "etre", "eu", "eux",
195:
196: "excepté", "hormis", "hors", "hélas", "hui", "il", "ils",
197: "j", "je", "jusqu", "jusque",
198:
199: "l", "la", "laquelle", "le", "lequel", "les", "lesquelles",
200: "lesquels", "leur", "leurs",
201:
202: "lorsque", "lui", "là", "ma", "mais", "malgré", "me",
203: "merci", "mes", "mien", "mienne",
204:
205: "miennes", "miens", "moi", "moins", "mon", "moyennant",
206: "même", "mêmes", "n", "ne", "ni", "non",
207:
208: "nos", "notre", "nous", "néanmoins", "nôtre", "nôtres",
209: "on", "ont", "ou", "outre",
210:
211: "où", "par", "parmi", "partant", "pas", "passé", "pendant",
212: "plein", "plus", "plusieurs",
213:
214: "pour", "pourquoi", "proche", "près", "puisque", "qu",
215: "quand", "que", "quel", "quelle",
216:
217: "quelles", "quels", "qui", "quoi", "quoique", "revoici",
218: "revoilà", "s", "sa", "sans",
219:
220: "sauf", "se", "selon", "seront", "ses", "si", "sien",
221: "sienne", "siennes", "siens", "sinon",
222:
223: "soi", "soit", "son", "sont", "sous", "suivant", "sur",
224: "ta", "te", "tes", "tien",
225:
226: "tienne", "tiennes", "tiens", "toi", "ton", "tous", "tout",
227: "toute", "toutes", "tu", "un",
228:
229: "une", "va", "vers", "voici", "voilà", "vos", "votre",
230: "vous", "vu", "vôtre", "vôtres",
231:
232: "y", "à", "ça", "ès", "été", "être", "ô", "l'"
233:
234: };
235:
236: /**
237:
238: * Contains the stopwords used with the StopFilter.
239:
240: */
241:
242: private Set stoptable = new HashSet();
243:
244: /**
245:
246: * Contains words that should be indexed but not stemmed.
247:
248: */
249:
250: private Set excltable = new HashSet();
251:
252: /**
253:
254: * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
255:
256: */
257:
258: public FrenchAnalyzer() {
259:
260: stoptable = StopFilter.makeStopSet(FRENCH_STOP_WORDS);
261:
262: }
263:
264: /**
265:
266: * Builds an analyzer with the given stop words.
267:
268: */
269:
270: public FrenchAnalyzer(String[] stopwords) {
271:
272: stoptable = StopFilter.makeStopSet(stopwords);
273:
274: }
275:
276: /**
277:
278: * Builds an analyzer with the given stop words.
279:
280: *
281:
282: * @deprecated
283:
284: */
285:
286: public FrenchAnalyzer(Hashtable stopwords) {
287:
288: stoptable = new HashSet(stopwords.keySet());
289:
290: }
291:
292: /**
293:
294: * Builds an analyzer with the given stop words.
295:
296: * @throws IOException
297:
298: */
299:
300: public FrenchAnalyzer(File stopwords) throws IOException {
301:
302: stoptable = new HashSet(WordlistLoader.getWordSet(stopwords));
303:
304: }
305:
306: /**
307:
308: * Builds an exclusionlist from an array of Strings.
309:
310: */
311:
312: public void setStemExclusionTable(String[] exclusionlist) {
313:
314: excltable = StopFilter.makeStopSet(exclusionlist);
315:
316: }
317:
318: /**
319:
320: * Builds an exclusionlist from a Hashtable.
321:
322: */
323:
324: public void setStemExclusionTable(Hashtable exclusionlist) {
325:
326: excltable = new HashSet(exclusionlist.keySet());
327:
328: }
329:
330: /**
331:
332: * Builds an exclusionlist from the words contained in the given file.
333:
334: * @throws IOException
335:
336: */
337:
338: public void setStemExclusionTable(File exclusionlist)
339: throws IOException {
340:
341: excltable = new HashSet(WordlistLoader
342: .getWordSet(exclusionlist));
343:
344: }
345:
346: /**
347:
348: * Creates a TokenStream which tokenizes all the text in the provided Reader.
349:
350: *
351:
352: * @return A TokenStream build from a StandardTokenizer filtered with
353:
354: * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
355:
356: */
357:
358: public final TokenStream tokenStream(String fieldName, Reader reader) {
359:
360: if (fieldName == null)
361:
362: throw new IllegalArgumentException(
363: "fieldName must not be null");
364:
365: if (reader == null)
366:
367: throw new IllegalArgumentException("readermust not be null");
368:
369: TokenStream result = new StandardTokenizer(reader);
370:
371: result = new StandardFilter(result);
372:
373: result = new StopFilter(result, stoptable);
374:
375: result = new FrenchStemFilter(result, excltable);
376:
377: // Convert to lowercase after stemming!
378:
379: result = new LowerCaseFilter(result);
380:
381: return result;
382:
383: }
384:
385: }
|