package org.apache.lucene.analysis.br;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;

/**
 * Analyzer for the Brazilian Portuguese language. Supports an external list of
 * stopwords (words that will not be indexed at all) and an external list of
 * exclusions (words that will not be stemmed, but indexed).
 *
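 * <p>
 * A minimal usage sketch. The field name <code>"text"</code> and the sample
 * sentence are illustrative only, and this assumes the classic Lucene
 * <code>TokenStream</code> API in which <code>next()</code> returns a
 * <code>Token</code> (or <code>null</code> at end of stream):
 * </p>
 * <pre>
 *   Analyzer analyzer = new BrazilianAnalyzer();
 *   TokenStream ts = analyzer.tokenStream("text", new StringReader("As meninas cantavam"));
 *   Token token;
 *   while ((token = ts.next()) != null) {   // next() may throw IOException
 *     System.out.println(token.termText());
 *   }
 * </pre>
 *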
 * @author João Kramer
 */
public final class BrazilianAnalyzer extends Analyzer {

  /**
   * List of typical Brazilian Portuguese stopwords.
   */
  public final static String[] BRAZILIAN_STOP_WORDS = { "a", "ainda",
      "alem", "ambas", "ambos", "antes", "ao", "aonde", "aos",
      "apos", "aquele", "aqueles", "as", "assim", "com", "como",
      "contra", "contudo", "cuja", "cujas", "cujo", "cujos",
      "da", "das", "de", "dela", "dele", "deles", "demais",
      "depois", "desde", "desta", "deste", "dispoe", "dispoem",
      "diversa", "diversas", "diversos", "do", "dos", "durante",
      "e", "ela", "elas", "ele", "eles", "em", "entao", "entre",
      "essa", "essas", "esse", "esses", "esta", "estas", "este",
      "estes", "ha", "isso", "isto", "logo", "mais", "mas",
      "mediante", "menos", "mesma", "mesmas", "mesmo", "mesmos",
      "na", "nas", "nao", "nem", "nesse", "neste", "nos",
      "o", "os", "ou", "outra", "outras", "outro", "outros",
      "pela", "pelas", "pelo", "pelos", "perante", "pois",
      "por", "porque", "portanto", "proprio", "proprios", "quais",
      "qual", "qualquer", "quando", "quanto", "que", "quem",
      "quer", "se", "seja", "sem", "sendo", "seu", "seus", "sob",
      "sobre", "sua", "suas", "tal", "tambem", "teu", "teus",
      "toda", "todas", "todo", "todos", "tua", "tuas", "tudo",
      "um", "uma", "umas", "uns" };

  /**
   * Contains the stopwords used with the StopFilter.
   */
  private Set stoptable = new HashSet();

  /**
   * Contains words that should be indexed but not stemmed.
   */
  private Set excltable = new HashSet();

  /**
   * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
   */
  public BrazilianAnalyzer() {
    stoptable = StopFilter.makeStopSet(BRAZILIAN_STOP_WORDS);
  }

  /**
   * Builds an analyzer with the given stop words.
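   * <p>
   * For example (the words shown are arbitrary placeholders):
   * </p>
   * <pre>
   *   Analyzer a = new BrazilianAnalyzer(new String[] { "de", "para" });
   * </pre>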
   */
  public BrazilianAnalyzer(String[] stopwords) {
    stoptable = StopFilter.makeStopSet(stopwords);
  }

  /**
   * Builds an analyzer with the given stop words, taken from the keys of the
   * given Hashtable.
   */
  public BrazilianAnalyzer(Hashtable stopwords) {
    stoptable = new HashSet(stopwords.keySet());
  }

  /**
   * Builds an analyzer with the stop words read from the given file.
   */
  public BrazilianAnalyzer(File stopwords) throws IOException {
    stoptable = WordlistLoader.getWordSet(stopwords);
  }

  /**
   * Builds an exclusion list from an array of Strings.
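   * <p>
   * A sketch (the words are hypothetical; any word in the exclusion list is
   * indexed without being stemmed):
   * </p>
   * <pre>
   *   BrazilianAnalyzer a = new BrazilianAnalyzer();
   *   a.setStemExclusionTable(new String[] { "brasilia", "petrobras" });
   * </pre>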
   */
  public void setStemExclusionTable(String[] exclusionlist) {
    excltable = StopFilter.makeStopSet(exclusionlist);
  }

  /**
   * Builds an exclusion list from a Hashtable, taking its keys as the words.
   */
  public void setStemExclusionTable(Hashtable exclusionlist) {
    excltable = new HashSet(exclusionlist.keySet());
  }

  /**
   * Builds an exclusion list from the words contained in the given file.
   */
  public void setStemExclusionTable(File exclusionlist) throws IOException {
    excltable = WordlistLoader.getWordSet(exclusionlist);
  }

  /**
   * Creates a TokenStream which tokenizes all the text in the provided Reader.
   *
   * @return A TokenStream built from a StandardTokenizer filtered with
   *         StandardFilter, StopFilter, BrazilianStemFilter and LowerCaseFilter.
   */
  public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
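    // Note: stop words are removed before lowercasing, so only tokens that
    // already match the (lowercase) stop table exactly are filtered out.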
    result = new StopFilter(result, stoptable);
    result = new BrazilianStemFilter(result, excltable);
    // Convert to lowercase after stemming!
    result = new LowerCaseFilter(result);
    return result;
  }
}