001: /*
002:
003: * LIUS - Lucene Index Update and Search
004: * http://sourceforge.net/projects/lius/
005: *
006: * Copyright (c) 2005, Laval University Library. All rights reserved.
007: *
008: * This library is free software; you can redistribute it and/or
009: * modify it under the terms of the GNU Lesser General Public
010: * License as published by the Free Software Foundation; either
011: * version 2.1 of the License, or (at your option) any later version.
012: *
013: * This library is distributed in the hope that it will be useful,
014: * but WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * Lesser General Public License for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public
019: * License along with this library; if not, write to the Free Software
020: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
021: */
022: package org.apache.lucene.analysis.lius.unicode;
023:
024: import java.io.Reader;
025: import java.util.Set;
026:
027: import org.apache.lucene.analysis.Analyzer;
028: import org.apache.lucene.analysis.LowerCaseFilter;
029: import org.apache.lucene.analysis.StopFilter;
030: import org.apache.lucene.analysis.TokenStream;
031: import org.apache.lucene.analysis.standard.StandardFilter;
032: import org.apache.lucene.analysis.standard.StandardTokenizer;
033:
034: /**
035: * <p>
036: * Titre : UTF8AccentRemoverAnalyzer
037: * </p>
038: * <p>
039: * Description : Class used by Lius framework (http://www.bibl.ulaval.ca/lius)
040: * </p>
041: * <p>
042: * Société : Universite Laval library
043: * </p>
044: *
045: * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
046: * @version 1.0
047: */
048:
049: public class UTF8AccentRemoverAnalyzer extends Analyzer {
050: /** Creates a new instance of AccentUnicodeAnalyzer */
051: public UTF8AccentRemoverAnalyzer() {
052: this (FRENCH_STOP_WORDS);
053: }
054:
055: private Set stopSet;
056:
057: /** Builds an analyzer with the given stop words. */
058: public UTF8AccentRemoverAnalyzer(String[] stopWords) {
059: stopSet = StopFilter.makeStopSet(stopWords);
060: }
061:
062: public final static String[] FRENCH_STOP_WORDS = { "a", "afin",
063: "ai", "ainsi", "après", "attendu", "au", "aujourd",
064: "auquel", "aussi", "autre", "autres", "aux", "auxquelles",
065: "auxquels", "avait", "avant", "avec", "avoir", "c", "car",
066: "ce", "ceci", "cela", "celle", "celles", "celui",
067: "cependant", "certain", "certaine", "certaines",
068: "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
069: "combien", "comme", "comment", "concernant", "contre", "d",
070: "dans", "de", "debout", "dedans", "dehors", "delà",
071: "depuis", "derrière", "des", "désormais", "desquelles",
072: "desquels", "dessous", "dessus", "devant", "devers",
073: "devra", "divers", "diverse", "diverses", "doit", "donc",
074: "dont", "du", "duquel", "durant", "dès", "elle", "elles",
075: "en", "entre", "environ", "est", "et", "etc", "etre", "eu",
076: "eux", "excepté", "hormis", "hors", "hélas", "hui", "il",
077: "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
078: "le", "lequel", "les", "lesquelles", "lesquels", "leur",
079: "leurs", "lorsque", "lui", "là", "ma", "mais", "malgré",
080: "me", "merci", "mes", "mien", "mienne", "miennes", "miens",
081: "moi", "moins", "mon", "moyennant", "même", "mêmes", "n",
082: "ne", "ni", "non", "nos", "notre", "nous", "néanmoins",
083: "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par",
084: "parmi", "partant", "pas", "passé", "pendant", "plein",
085: "plus", "plusieurs", "pour", "pourquoi", "proche", "près",
086: "puisque", "qu", "quand", "que", "quel", "quelle",
087: "quelles", "quels", "qui", "quoi", "quoique", "revoici",
088: "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
089: "seront", "ses", "si", "sien", "sienne", "siennes",
090: "siens", "sinon", "soi", "soit", "son", "sont", "sous",
091: "suivant", "sur", "ta", "te", "tes", "tien", "tienne",
092: "tiennes", "tiens", "toi", "ton", "tous", "tout", "toute",
093: "toutes", "tu", "un", "une", "va", "vers", "voici",
094: "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres",
095: "y", "à", "ça", "ès", "été", "être", "ô", "l'"
096:
097: };
098:
099: /**
100: *
101: * Constructs a {@link StandardTokenizer}filtered by a {@link
102: * StandardFilter}, a {@link LowerCaseFilter}and a {@link StopFilter}.
103: *
104: * @param fieldName
105: * @param reader
106: * @return
107: */
108: public TokenStream tokenStream(String fieldName, Reader reader) {
109: TokenStream result = new StandardTokenizer(reader);
110: result = new StandardFilter(result);
111: result = new LowerCaseFilter(result);
112: result = new StopFilter(result, stopSet);
113: result = (TokenStream) new UTF8AccentRemoverFilter(result);
114: return result;
115: }
116: }
|