001: package org.apache.lucene.analysis.ru;
002:
003: /**
004:
005: * Copyright 2004 The Apache Software Foundation
006:
007: *
008:
009: * Licensed under the Apache License, Version 2.0 (the "License");
010:
011: * you may not use this file except in compliance with the License.
012:
013: * You may obtain a copy of the License at
014:
015: *
016:
017: * http://www.apache.org/licenses/LICENSE-2.0
018:
019: *
020:
021: * Unless required by applicable law or agreed to in writing, software
022:
023: * distributed under the License is distributed on an "AS IS" BASIS,
024:
025: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
026:
027: * See the License for the specific language governing permissions and
028:
029: * limitations under the License.
030:
031: */
032:
033: import org.apache.lucene.analysis.Analyzer;
034:
035: import org.apache.lucene.analysis.StopFilter;
036:
037: import org.apache.lucene.analysis.TokenStream;
038:
039: import java.io.Reader;
040:
041: import java.util.Hashtable;
042:
043: import java.util.Set;
044:
045: import java.util.HashSet;
046:
047: /**
048:
049: * Analyzer for Russian language. Supports an external list of stopwords (words that
050:
051: * will not be indexed at all).
052:
053: * A default set of stopwords is used unless an alternative list is specified.
054:
055: *
056:
057: * @author Boris Okner, b.okner@rogers.com
058:
059: * @version $Id: RussianAnalyzer.java,v 1.1 2005/06/02 01:35:59 jfendler Exp $
060:
061: */
062:
063: public final class RussianAnalyzer extends Analyzer
064:
065: {
066:
067: // letters (currently unused letters are commented out)
068:
069: private final static char A = 0;
070:
071: private final static char B = 1;
072:
073: private final static char V = 2;
074:
075: private final static char G = 3;
076:
077: private final static char D = 4;
078:
079: private final static char E = 5;
080:
081: private final static char ZH = 6;
082:
083: private final static char Z = 7;
084:
085: private final static char I = 8;
086:
087: private final static char I_ = 9;
088:
089: private final static char K = 10;
090:
091: private final static char L = 11;
092:
093: private final static char M = 12;
094:
095: private final static char N = 13;
096:
097: private final static char O = 14;
098:
099: private final static char P = 15;
100:
101: private final static char R = 16;
102:
103: private final static char S = 17;
104:
105: private final static char T = 18;
106:
107: private final static char U = 19;
108:
109: //private final static char F = 20;
110:
111: private final static char X = 21;
112:
113: //private final static char TS = 22;
114:
115: private final static char CH = 23;
116:
117: private final static char SH = 24;
118:
119: private final static char SHCH = 25;
120:
121: //private final static char HARD = 26;
122:
123: private final static char Y = 27;
124:
125: private final static char SOFT = 28;
126:
127: private final static char AE = 29;
128:
129: private final static char IU = 30;
130:
131: private final static char IA = 31;
132:
133: /**
134:
135: * List of typical Russian stopwords.
136:
137: */
138:
139: private static char[][] RUSSIAN_STOP_WORDS = {
140:
141: { A },
142:
143: { B, E, Z },
144:
145: { B, O, L, E, E },
146:
147: { B, Y },
148:
149: { B, Y, L },
150:
151: { B, Y, L, A },
152:
153: { B, Y, L, I },
154:
155: { B, Y, L, O },
156:
157: { B, Y, T, SOFT },
158:
159: { V },
160:
161: { V, A, M },
162:
163: { V, A, S },
164:
165: { V, E, S, SOFT },
166:
167: { V, O },
168:
169: { V, O, T },
170:
171: { V, S, E },
172:
173: { V, S, E, G, O },
174:
175: { V, S, E, X },
176:
177: { V, Y },
178:
179: { G, D, E },
180:
181: { D, A },
182:
183: { D, A, ZH, E },
184:
185: { D, L, IA },
186:
187: { D, O },
188:
189: { E, G, O },
190:
191: { E, E },
192:
193: { E, I_, },
194:
195: { E, IU },
196:
197: { E, S, L, I },
198:
199: { E, S, T, SOFT },
200:
201: { E, SHCH, E },
202:
203: { ZH, E },
204:
205: { Z, A },
206:
207: { Z, D, E, S, SOFT },
208:
209: { I },
210:
211: { I, Z },
212:
213: { I, L, I },
214:
215: { I, M },
216:
217: { I, X },
218:
219: { K },
220:
221: { K, A, K },
222:
223: { K, O },
224:
225: { K, O, G, D, A },
226:
227: { K, T, O },
228:
229: { L, I },
230:
231: { L, I, B, O },
232:
233: { M, N, E },
234:
235: { M, O, ZH, E, T },
236:
237: { M, Y },
238:
239: { N, A },
240:
241: { N, A, D, O },
242:
243: { N, A, SH },
244:
245: { N, E },
246:
247: { N, E, G, O },
248:
249: { N, E, E },
250:
251: { N, E, T },
252:
253: { N, I },
254:
255: { N, I, X },
256:
257: { N, O },
258:
259: { N, U },
260:
261: { O },
262:
263: { O, B },
264:
265: { O, D, N, A, K, O },
266:
267: { O, N },
268:
269: { O, N, A },
270:
271: { O, N, I },
272:
273: { O, N, O },
274:
275: { O, T },
276:
277: { O, CH, E, N, SOFT },
278:
279: { P, O },
280:
281: { P, O, D },
282:
283: { P, R, I },
284:
285: { S },
286:
287: { S, O },
288:
289: { T, A, K },
290:
291: { T, A, K, ZH, E },
292:
293: { T, A, K, O, I_ },
294:
295: { T, A, M },
296:
297: { T, E },
298:
299: { T, E, M },
300:
301: { T, O },
302:
303: { T, O, G, O },
304:
305: { T, O, ZH, E },
306:
307: { T, O, I_ },
308:
309: { T, O, L, SOFT, K, O },
310:
311: { T, O, M },
312:
313: { T, Y },
314:
315: { U },
316:
317: { U, ZH, E },
318:
319: { X, O, T, IA },
320:
321: { CH, E, G, O },
322:
323: { CH, E, I_ },
324:
325: { CH, E, M },
326:
327: { CH, T, O },
328:
329: { CH, T, O, B, Y },
330:
331: { CH, SOFT, E },
332:
333: { CH, SOFT, IA },
334:
335: { AE, T, A },
336:
337: { AE, T, I },
338:
339: { AE, T, O },
340:
341: { IA }
342:
343: };
344:
345: /**
346:
347: * Contains the stopwords used with the StopFilter.
348:
349: */
350:
351: private Set stopSet = new HashSet();
352:
353: /**
354:
355: * Charset for Russian letters.
356:
357: * Represents encoding for 32 lowercase Russian letters.
358:
359: * Predefined charsets can be taken from RussianCharSets class
360:
361: */
362:
363: private char[] charset;
364:
365: public RussianAnalyzer() {
366:
367: charset = RussianCharsets.UnicodeRussian;
368:
369: stopSet = StopFilter.makeStopSet(
370:
371: makeStopWords(RussianCharsets.UnicodeRussian));
372:
373: }
374:
375: /**
376:
377: * Builds an analyzer.
378:
379: */
380:
381: public RussianAnalyzer(char[] charset)
382:
383: {
384:
385: this .charset = charset;
386:
387: stopSet = StopFilter.makeStopSet(makeStopWords(charset));
388:
389: }
390:
391: /**
392:
393: * Builds an analyzer with the given stop words.
394:
395: */
396:
397: public RussianAnalyzer(char[] charset, String[] stopwords)
398:
399: {
400:
401: this .charset = charset;
402:
403: stopSet = StopFilter.makeStopSet(stopwords);
404:
405: }
406:
407: // Takes russian stop words and translates them to a String array, using
408:
409: // the given charset
410:
411: private static String[] makeStopWords(char[] charset)
412:
413: {
414:
415: String[] res = new String[RUSSIAN_STOP_WORDS.length];
416:
417: for (int i = 0; i < res.length; i++)
418:
419: {
420:
421: char[] theStopWord = RUSSIAN_STOP_WORDS[i];
422:
423: // translate the word, using the charset
424:
425: StringBuffer theWord = new StringBuffer();
426:
427: for (int j = 0; j < theStopWord.length; j++)
428:
429: {
430:
431: theWord.append(charset[theStopWord[j]]);
432:
433: }
434:
435: res[i] = theWord.toString();
436:
437: }
438:
439: return res;
440:
441: }
442:
443: /**
444:
445: * Builds an analyzer with the given stop words.
446:
447: * @todo create a Set version of this ctor
448:
449: */
450:
451: public RussianAnalyzer(char[] charset, Hashtable stopwords)
452:
453: {
454:
455: this .charset = charset;
456:
457: stopSet = new HashSet(stopwords.keySet());
458:
459: }
460:
461: /**
462:
463: * Creates a TokenStream which tokenizes all the text in the provided Reader.
464:
465: *
466:
467: * @return A TokenStream build from a RussianLetterTokenizer filtered with
468:
469: * RussianLowerCaseFilter, StopFilter, and RussianStemFilter
470:
471: */
472:
473: public TokenStream tokenStream(String fieldName, Reader reader)
474:
475: {
476:
477: TokenStream result = new RussianLetterTokenizer(reader, charset);
478:
479: result = new RussianLowerCaseFilter(result, charset);
480:
481: result = new StopFilter(result, stopSet);
482:
483: result = new RussianStemFilter(result, charset);
484:
485: return result;
486:
487: }
488:
489: }
|