001: package org.apache.lucene.analysis.ru;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 * <p>
 * The algorithm is written against letter indices (the constants {@code A} .. {@code IA}
 * below) rather than concrete characters. The {@code charset} array translates such an index
 * into the character that is actually compared with the input word, so it must have an entry
 * for every index used by the ending tables (0 .. 31).
 * <p>
 * {@link #stem(String)} stores the region positions of the current word in instance fields,
 * so a single instance must not be used by several threads at the same time without
 * external synchronization.
 *
 * @version $Id: RussianStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
 */
class RussianStemmer {
    // translation table: letter index (see constants below) -> character of the input encoding
    private char[] charset;

    // positions of RV, R1 and R2 respectively (offsets into the word; 0 means "not found")
    private int RV, R1, R2;

    // letters (currently unused letters are commented out)
    private final static char A = 0;
    //private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    //private final static char D = 4;
    private final static char E = 5;
    //private final static char ZH = 6;
    //private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    //private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    //private final static char P = 15;
    //private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    //private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    // stem definitions
    //
    // Every ending table lists shorter endings before longer ones. findEnding() scans a
    // table from its last entry to its first, so the longest matching ending wins.
    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static final char[][] perfectiveGerundEndings1 = { { V },
            { V, SH, I }, { V, SH, I, S, SOFT } };

    private static final char[][] perfectiveGerund1Predessors = { { A },
            { IA } };

    private static final char[][] perfectiveGerundEndings2 = { { I, V },
            { Y, V }, { I, V, SH, I }, { Y, V, SH, I },
            { I, V, SH, I, S, SOFT }, { Y, V, SH, I, S, SOFT } };

    private static final char[][] adjectiveEndings = { { E, E }, { I, E },
            { Y, E }, { O, E }, { E, I_ }, { I, I_ }, { Y, I_ },
            { O, I_ }, { E, M }, { I, M }, { Y, M }, { O, M },
            { I, X }, { Y, X }, { U, IU }, { IU, IU }, { A, IA },
            { IA, IA }, { O, IU }, { E, IU }, { I, M, I }, { Y, M, I },
            { E, G, O }, { O, G, O }, { E, M, U }, { O, M, U } };

    private static final char[][] participleEndings1 = { { SHCH }, { E, M },
            { N, N }, { V, SH }, { IU, SHCH } };

    private static final char[][] participleEndings2 = { { I, V, SH },
            { Y, V, SH }, { U, IU, SHCH } };

    private static final char[][] participle1Predessors = { { A }, { IA } };

    private static final char[][] reflexiveEndings = { { S, IA }, { S, SOFT } };

    private static final char[][] verbEndings1 = { { I_ }, { L }, { N },
            { L, O }, { N, O }, { E, T }, { IU, T }, { L, A },
            { N, A }, { L, I }, { E, M }, { N, Y }, { E, T, E },
            { I_, T, E }, { T, SOFT }, { E, SH, SOFT }, { N, N, O } };

    private static final char[][] verbEndings2 = { { IU }, { U, IU },
            { E, N }, { E, I_ }, { IA, T }, { U, I_ }, { I, L },
            { Y, L }, { I, M }, { Y, M }, { I, T }, { Y, T },
            { I, L, A }, { Y, L, A }, { E, N, A }, { I, T, E },
            { I, L, I }, { Y, L, I }, { I, L, O }, { Y, L, O },
            { E, N, O }, { U, E, T }, { U, IU, T }, { E, N, Y },
            { I, T, SOFT }, { Y, T, SOFT }, { I, SH, SOFT },
            { E, I_, T, E }, { U, I_, T, E } };

    private static final char[][] verb1Predessors = { { A }, { IA } };

    private static final char[][] nounEndings = { { A }, { U }, { I_ },
            { O }, { E }, { Y }, { I }, { SOFT }, { IA },
            { E, V }, { O, V }, { I, E }, { SOFT, E }, { IA, X },
            { I, IU }, { E, I }, { I, I }, { E, I_ }, { O, I_ },
            { E, M }, { A, M }, { O, M }, { A, X }, { SOFT, IU },
            { I, IA }, { SOFT, IA }, { I, I_ }, { IA, M },
            { IA, M, I }, { A, M, I }, { I, E, I_ }, { I, IA, M },
            { I, E, M }, { I, IA, X }, { I, IA, M, I } };

    private static final char[][] superlativeEndings = { { E, I_, SH },
            { E, I_, SH, E } };

    private static final char[][] derivationalEndings = { { O, S, T },
            { O, S, T, SOFT } };

    // a doubled N at the end of the stemming zone, see undoubleN()
    private static final char[][] doubleN = { { N, N } };

    /**
     * Creates a stemmer without a charset. A charset has to be supplied through
     * {@link #setCharset(char[])} before a non-empty word is stemmed.
     */
    public RussianStemmer() {
        super();
    }

    /**
     * Creates a stemmer that uses the given charset.
     *
     * @param charset table translating letter indices into characters; the array is
     *                stored as is, not copied
     */
    public RussianStemmer(char[] charset) {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if an adjective ending was found and removed
     */
    private boolean adjectival(StringBuffer stemmingZone) {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings)) {
            return false;
        }
        // if adjective ending was found, try for participle ending; whether one is found
        // does not influence the result, only the stemming zone
        if (!findAndRemoveEnding(stemmingZone, participleEndings1,
                participle1Predessors)) {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Derivational endings: removes such an ending only if it lies completely in R2.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a derivational ending was removed
     */
    private boolean derivational(StringBuffer stemmingZone) {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0) {
            // no derivational ending found
            return false;
        }
        if (R2 == 0) {
            // markPositions() left R2 at 0, i.e. the word has no R2 region at all, so no
            // ending can lie inside it. Without this guard R2 - RV would be negative and
            // the check below would always succeed.
            return false;
        }
        // The stemming zone starts at RV, hence R2 - RV is the offset of R2 inside the zone.
        int endingStart = stemmingZone.length() - endingLength;
        if (R2 - RV > endingStart) {
            // the ending starts before R2
            return false;
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Finds ending among given ending class and returns the length of ending found(0, if not found).
     * The ending has to end exactly at <code>startIndex</code> of the stemming zone.
     * The ending class is scanned from its last entry to its first one.
     * Creation date: (17/03/2002 8:18:34 PM)
     * @param stemmingZone text to search in
     * @param startIndex index of the last character that may belong to the ending
     * @param theEndingClass candidate endings, given as letter indices
     * @return length of the first matching ending, 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex,
            char[][] theEndingClass) {
        for (int i = theEndingClass.length - 1; i >= 0; i--) {
            char[] theEnding = theEndingClass[i];
            // skip endings that are longer than stemmingZone[0..startIndex]
            if (startIndex < theEnding.length - 1) {
                continue;
            }
            if (endsWith(stemmingZone, startIndex, theEnding)) {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Compares the given ending, right to left, with the characters of the stemming zone
     * that end at <code>startIndex</code>. The caller guarantees that the ending fits.
     */
    private boolean endsWith(StringBuffer stemmingZone, int startIndex,
            char[] theEnding) {
        int stemmingIndex = startIndex;
        for (int j = theEnding.length - 1; j >= 0; j--) {
            if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Same as {@link #findEnding(StringBuffer, int, char[][])}, looking at the very end of
     * the stemming zone.
     */
    private int findEnding(StringBuffer stemmingZone,
            char[][] theEndingClass) {
        return findEnding(stemmingZone, stemmingZone.length() - 1,
                theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     * Creation date: (17/03/2002 8:18:34 PM)
     * @return true if an ending was found and removed
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
            char[][] theEndingClass) {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0) {
            // not found
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * The predessor itself is kept.
     * Creation date: (17/03/2002 8:18:34 PM)
     * @return true if an ending with a matching predessor was found and removed
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
            char[][] theEndingClass, char[][] thePredessors) {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0) {
            // not found
            return false;
        }
        // the predessor has to end right in front of the ending; if the ending covers the
        // whole zone the index is -1 and findEnding() reports "not found"
        int predessorEnd = stemmingZone.length() - endingLength - 1;
        if (findEnding(stemmingZone, predessorEnd, thePredessors) == 0) {
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * A position is left at 0 when the corresponding region would be empty; once one
     * region is empty the following ones are not searched for.
     * Creation date: (16/03/2002 3:40:11 PM)
     */
    private void markPositions(String word) {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int i = 0;
        // find RV: it starts right after the first vowel
        while (word.length() > i && !isVowel(word.charAt(i))) {
            i++;
        }
        if (word.length() - 1 < ++i) {
            return; // RV zone is empty
        }
        RV = i;
        // find R1: skip the vowels following RV and the one letter after them
        while (word.length() > i && isVowel(word.charAt(i))) {
            i++;
        }
        if (word.length() - 1 < ++i) {
            return; // R1 zone is empty
        }
        R1 = i;
        // find R2: starting at R1, skip up to and including the next vowel ...
        while (word.length() > i && !isVowel(word.charAt(i))) {
            i++;
        }
        if (word.length() - 1 < ++i) {
            return; // R2 zone is empty
        }
        // ... then any further vowels and the one letter after them
        while (word.length() > i && isVowel(word.charAt(i))) {
            i++;
        }
        if (word.length() - 1 < ++i) {
            return; // R2 zone is empty
        }
        R2 = i;
    }

    /**
     * Checks if character is a vowel, i.e. equals the charset character of one of the
     * indices listed in <code>vowels</code>.
     * Creation date: (16/03/2002 10:47:03 PM)
     * @return boolean
     * @param letter char
     */
    private boolean isVowel(char letter) {
        for (int i = 0; i < vowels.length; i++) {
            if (letter == charset[vowels[i]]) {
                return true;
            }
        }
        return false;
    }

    /**
     * Noun endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a noun ending was removed
     */
    private boolean noun(StringBuffer stemmingZone) {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings. Endings of the first group are removed only when preceded
     * by one of <code>perfectiveGerund1Predessors</code>; the second group is tried if the
     * first one did not apply.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a perfective gerund ending was removed
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone) {
        return findAndRemoveEnding(stemmingZone,
                perfectiveGerundEndings1, perfectiveGerund1Predessors)
                || findAndRemoveEnding(stemmingZone,
                        perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a reflexive ending was removed
     */
    private boolean reflexive(StringBuffer stemmingZone) {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes the last character of the stemming zone if it is the letter I.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a character was removed
     */
    private boolean removeI(StringBuffer stemmingZone) {
        return removeTrailingLetter(stemmingZone, I);
    }

    /**
     * Removes the last character of the stemming zone if it is the letter SOFT.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a character was removed
     */
    private boolean removeSoft(StringBuffer stemmingZone) {
        return removeTrailingLetter(stemmingZone, SOFT);
    }

    /**
     * Cuts the last character off the stemming zone if the zone is not empty and ends
     * with the charset character of the given letter index.
     */
    private boolean removeTrailingLetter(StringBuffer stemmingZone, char letter) {
        int last = stemmingZone.length() - 1;
        if (last < 0 || stemmingZone.charAt(last) != charset[letter]) {
            return false;
        }
        stemmingZone.setLength(last);
        return true;
    }

    /**
     * Replaces the charset used for all following calls of {@link #stem(String)}.
     * Creation date: (16/03/2002 10:58:42 PM)
     * @param newCharset table translating letter indices into characters; stored as is,
     *                   not copied
     */
    public void setCharset(char[] newCharset) {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     * Only the part of the word behind position RV is modified; if no RV region is found
     * the word is returned unchanged.
     * Creation date: (16/03/2002 3:36:48 PM)
     * @return java.lang.String
     * @param input java.lang.String
     */
    public String stem(String input) {
        markPositions(input);
        if (RV == 0) {
            return input; //RV wasn't detected, nothing to stem
        }
        // stemming goes on in RV
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
        // Step 1
        if (!perfectiveGerund(stemmingZone)) {
            reflexive(stemmingZone);
            // apply adjectival(); if that fails, apply verb(); if that fails, apply noun()
            if (!adjectival(stemmingZone) && !verb(stemmingZone)) {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a superlative ending was removed
     */
    private boolean superlative(StringBuffer stemmingZone) {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: if the stemming zone ends with two letters N, one of them is removed.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a letter was removed
     */
    private boolean undoubleN(StringBuffer stemmingZone) {
        if (findEnding(stemmingZone, doubleN) == 0) {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Verb endings. Endings of the first group are removed only when preceded by one of
     * <code>verb1Predessors</code>; the second group is tried if the first one did not apply.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     * @return true if a verb ending was removed
     */
    private boolean verb(StringBuffer stemmingZone) {
        return findAndRemoveEnding(stemmingZone, verbEndings1,
                verb1Predessors)
                || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets.
     * Creates a fresh stemmer for every call.
     *
     * @param theWord word to stem
     * @param charset table translating letter indices into characters
     * @return the stem of <code>theWord</code>
     */
    public static String stem(String theWord, char[] charset) {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}
|