001: package org.apache.lucene.analysis.ru;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.Reader;
021: import java.util.HashSet;
022: import java.util.Hashtable;
023: import java.util.Set;
024:
025: import org.apache.lucene.analysis.Analyzer;
026: import org.apache.lucene.analysis.StopFilter;
027: import org.apache.lucene.analysis.TokenStream;
028:
029: /**
030: * Analyzer for Russian language. Supports an external list of stopwords (words that
031: * will not be indexed at all).
032: * A default set of stopwords is used unless an alternative list is specified.
033: *
034: *
035: * @version $Id: RussianAnalyzer.java 564236 2007-08-09 15:21:19Z gsingers $
036: */
037: public final class RussianAnalyzer extends Analyzer {
038: // letters (currently unused letters are commented out)
039: private final static char A = 0;
040: private final static char B = 1;
041: private final static char V = 2;
042: private final static char G = 3;
043: private final static char D = 4;
044: private final static char E = 5;
045: private final static char ZH = 6;
046: private final static char Z = 7;
047: private final static char I = 8;
048: private final static char I_ = 9;
049: private final static char K = 10;
050: private final static char L = 11;
051: private final static char M = 12;
052: private final static char N = 13;
053: private final static char O = 14;
054: private final static char P = 15;
055: private final static char R = 16;
056: private final static char S = 17;
057: private final static char T = 18;
058: private final static char U = 19;
059: //private final static char F = 20;
060: private final static char X = 21;
061: //private final static char TS = 22;
062: private final static char CH = 23;
063: private final static char SH = 24;
064: private final static char SHCH = 25;
065: //private final static char HARD = 26;
066: private final static char Y = 27;
067: private final static char SOFT = 28;
068: private final static char AE = 29;
069: private final static char IU = 30;
070: private final static char IA = 31;
071:
072: /**
073: * List of typical Russian stopwords.
074: */
075: private static char[][] RUSSIAN_STOP_WORDS = { { A }, { B, E, Z },
076: { B, O, L, E, E }, { B, Y }, { B, Y, L }, { B, Y, L, A },
077: { B, Y, L, I }, { B, Y, L, O }, { B, Y, T, SOFT }, { V },
078: { V, A, M }, { V, A, S }, { V, E, S, SOFT }, { V, O },
079: { V, O, T }, { V, S, E }, { V, S, E, G, O },
080: { V, S, E, X }, { V, Y }, { G, D, E }, { D, A },
081: { D, A, ZH, E }, { D, L, IA }, { D, O }, { E, G, O },
082: { E, E }, { E, I_, }, { E, IU }, { E, S, L, I },
083: { E, S, T, SOFT }, { E, SHCH, E }, { ZH, E }, { Z, A },
084: { Z, D, E, S, SOFT }, { I }, { I, Z }, { I, L, I },
085: { I, M }, { I, X }, { K }, { K, A, K }, { K, O },
086: { K, O, G, D, A }, { K, T, O }, { L, I }, { L, I, B, O },
087: { M, N, E }, { M, O, ZH, E, T }, { M, Y }, { N, A },
088: { N, A, D, O }, { N, A, SH }, { N, E }, { N, E, G, O },
089: { N, E, E }, { N, E, T }, { N, I }, { N, I, X }, { N, O },
090: { N, U }, { O }, { O, B }, { O, D, N, A, K, O }, { O, N },
091: { O, N, A }, { O, N, I }, { O, N, O }, { O, T },
092: { O, CH, E, N, SOFT }, { P, O }, { P, O, D }, { P, R, I },
093: { S }, { S, O }, { T, A, K }, { T, A, K, ZH, E },
094: { T, A, K, O, I_ }, { T, A, M }, { T, E }, { T, E, M },
095: { T, O }, { T, O, G, O }, { T, O, ZH, E }, { T, O, I_ },
096: { T, O, L, SOFT, K, O }, { T, O, M }, { T, Y }, { U },
097: { U, ZH, E }, { X, O, T, IA }, { CH, E, G, O },
098: { CH, E, I_ }, { CH, E, M }, { CH, T, O },
099: { CH, T, O, B, Y }, { CH, SOFT, E }, { CH, SOFT, IA },
100: { AE, T, A }, { AE, T, I }, { AE, T, O }, { IA } };
101:
102: /**
103: * Contains the stopwords used with the StopFilter.
104: */
105: private Set stopSet = new HashSet();
106:
107: /**
108: * Charset for Russian letters.
109: * Represents encoding for 32 lowercase Russian letters.
110: * Predefined charsets can be taken from RussianCharSets class
111: */
112: private char[] charset;
113:
114: public RussianAnalyzer() {
115: charset = RussianCharsets.UnicodeRussian;
116: stopSet = StopFilter
117: .makeStopSet(makeStopWords(RussianCharsets.UnicodeRussian));
118: }
119:
120: /**
121: * Builds an analyzer.
122: */
123: public RussianAnalyzer(char[] charset) {
124: this .charset = charset;
125: stopSet = StopFilter.makeStopSet(makeStopWords(charset));
126: }
127:
128: /**
129: * Builds an analyzer with the given stop words.
130: */
131: public RussianAnalyzer(char[] charset, String[] stopwords) {
132: this .charset = charset;
133: stopSet = StopFilter.makeStopSet(stopwords);
134: }
135:
136: // Takes russian stop words and translates them to a String array, using
137: // the given charset
138: private static String[] makeStopWords(char[] charset) {
139: String[] res = new String[RUSSIAN_STOP_WORDS.length];
140: for (int i = 0; i < res.length; i++) {
141: char[] theStopWord = RUSSIAN_STOP_WORDS[i];
142: // translate the word, using the charset
143: StringBuffer theWord = new StringBuffer();
144: for (int j = 0; j < theStopWord.length; j++) {
145: theWord.append(charset[theStopWord[j]]);
146: }
147: res[i] = theWord.toString();
148: }
149: return res;
150: }
151:
152: /**
153: * Builds an analyzer with the given stop words.
154: * @todo create a Set version of this ctor
155: */
156: public RussianAnalyzer(char[] charset, Hashtable stopwords) {
157: this .charset = charset;
158: stopSet = new HashSet(stopwords.keySet());
159: }
160:
161: /**
162: * Creates a TokenStream which tokenizes all the text in the provided Reader.
163: *
164: * @return A TokenStream build from a RussianLetterTokenizer filtered with
165: * RussianLowerCaseFilter, StopFilter, and RussianStemFilter
166: */
167: public TokenStream tokenStream(String fieldName, Reader reader) {
168: TokenStream result = new RussianLetterTokenizer(reader, charset);
169: result = new RussianLowerCaseFilter(result, charset);
170: result = new StopFilter(result, stopSet);
171: result = new RussianStemFilter(result, charset);
172: return result;
173: }
174: }
|