0001: package org.apache.lucene.analysis.ru;
0002:
0003: /**
0004:
0005: * Copyright 2004 The Apache Software Foundation
0006:
0007: *
0008:
0009: * Licensed under the Apache License, Version 2.0 (the "License");
0010:
0011: * you may not use this file except in compliance with the License.
0012:
0013: * You may obtain a copy of the License at
0014:
0015: *
0016:
0017: * http://www.apache.org/licenses/LICENSE-2.0
0018:
0019: *
0020:
0021: * Unless required by applicable law or agreed to in writing, software
0022:
0023: * distributed under the License is distributed on an "AS IS" BASIS,
0024:
0025: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0026:
0027: * See the License for the specific language governing permissions and
0028:
0029: * limitations under the License.
0030:
0031: */
0032:
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net
 * for a detailed description).
 *
 * <p>Letters are not stored as characters but as indices (the constants
 * {@code A} .. {@code IA}) into a caller-supplied {@code charset} array, so the
 * same ending tables work with any encoding of the Russian alphabet.
 *
 * <p>An instance keeps the region positions of the word being stemmed in its
 * fields, so a single instance must not be used by several threads at once.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id: RussianStemmer.java,v 1.1 2005/06/02 01:35:59 jfendler Exp $
 */
class RussianStemmer
{
    /** Maps the letter indices below to the actual characters of the encoding in use. */
    private char[] charset;

    /**
     * Start positions of the RV, R1 and R2 regions of the current word.
     * RV == 0 means "no RV region"; R1 and R2 equal the word length when the
     * corresponding region is empty.
     */
    private int RV, R1, R2;

    // letters (currently unused letters are commented out)
    private static final char A = 0;
    //private static final char B = 1;
    private static final char V = 2;
    private static final char G = 3;
    //private static final char D = 4;
    private static final char E = 5;
    //private static final char ZH = 6;
    //private static final char Z = 7;
    private static final char I = 8;
    private static final char I_ = 9;
    //private static final char K = 10;
    private static final char L = 11;
    private static final char M = 12;
    private static final char N = 13;
    private static final char O = 14;
    //private static final char P = 15;
    //private static final char R = 16;
    private static final char S = 17;
    private static final char T = 18;
    private static final char U = 19;
    //private static final char F = 20;
    private static final char X = 21;
    //private static final char TS = 22;
    //private static final char CH = 23;
    private static final char SH = 24;
    private static final char SHCH = 25;
    //private static final char HARD = 26;
    private static final char Y = 27;
    private static final char SOFT = 28;
    private static final char AE = 29;
    private static final char IU = 30;
    private static final char IA = 31;

    // stem definitions
    //
    // findEnding() scans each table from its last entry to its first, so an
    // entry placed later in a table takes precedence over an earlier one.

    private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    /** Perfective gerund endings that must be preceded by one of perfectiveGerund1Predessors. */
    private static final char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] perfectiveGerund1Predessors = {
        { A },
        { IA }
    };

    /** Perfective gerund endings that need no particular predecessor. */
    private static final char[][] perfectiveGerundEndings2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    /** Participle endings that must be preceded by one of participle1Predessors. */
    private static final char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    /** Participle endings that need no particular predecessor. */
    private static final char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] participle1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    /** Verb endings that must be preceded by one of verb1Predessors. */
    private static final char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    /** Verb endings that need no particular predecessor. */
    private static final char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] verb1Predessors = {
        { A },
        { IA }
    };

    private static final char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        // Single-letter IU ending of the Snowball noun class. In stem() a
        // trailing IU is already removed by verb() (verbEndings2) before
        // noun() is consulted, so this entry is listed for completeness.
        { IU },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    /** The doubled N that undoubleN() reduces to a single N. */
    private static final char[][] doubleNEndings = {
        { N, N }
    };

    /**
     * Creates a stemmer without a charset. {@link #setCharset(char[])} must be
     * called before {@link #stem(String)}, otherwise stemming fails with a
     * NullPointerException.
     */
    public RussianStemmer()
    {
        super();
    }

    /**
     * Creates a stemmer for the given charset.
     *
     * @param charset characters of the Russian alphabet, indexed by the letter
     *                constants of this class (A = 0 .. IA = 31)
     */
    public RussianStemmer(char[] charset)
    {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an adjective ending was removed
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
        {
            return false;
        }
        // An adjective ending was found: additionally strip a participle ending
        // if there is one. Whether that succeeds does not affect the result.
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    }

    /**
     * Removes a derivational ending, provided the whole ending lies in R2.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an ending was removed
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
        {
            // no derivational ending found
            return false;
        }
        // The zone starts at RV, so R2 expressed in zone coordinates is R2 - RV.
        // The ending must start at or after that position.
        int endingStart = stemmingZone.length() - endingLength;
        if (R2 - RV > endingStart)
        {
            return false;
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Finds an ending of the given class that ends at startIndex of the zone.
     * The class is scanned from its last entry to its first and the first match wins.
     *
     * @param stemmingZone   text to search in
     * @param startIndex     index of the last character that may belong to the ending
     * @param theEndingClass candidate endings, given as letter indices
     * @return the length of the ending found, or 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex,
            char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // skip endings that are longer than the text available up to startIndex
            if (startIndex < theEnding.length - 1)
            {
                continue;
            }
            // compare backwards, from the last letter of the ending
            boolean match = true;
            int stemmingIndex = startIndex;
            for (int j = theEnding.length - 1; j >= 0 && match; j--)
            {
                match = stemmingZone.charAt(stemmingIndex--) == charset[theEnding[j]];
            }
            if (match)
            {
                return theEnding.length;
            }
        }
        return 0;
    }

    /**
     * Finds an ending of the given class at the very end of the zone.
     *
     * @return the length of the ending found, or 0 if none matches
     */
    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     *
     * @return true if an ending was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
            char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            // not found
            return false;
        }
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * Only the first ending found is checked; if its predecessor does not match,
     * no other ending is tried.
     *
     * @return true if an ending with a matching predecessor was found and cut off
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
            char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            // not found
            return false;
        }
        // look for a predecessor that ends right before the ending
        int predessorLength = findEnding(stemmingZone,
                stemmingZone.length() - endingLength - 1,
                thePredessors);
        if (predessorLength == 0)
        {
            return false;
        }
        // cut the ending found (the predecessor itself stays)
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Returns the index of the first character at or after from that is not
     * part of a run of vowels (vowelRun == true) or non-vowels (vowelRun == false).
     */
    private int skipRun(String word, int from, boolean vowelRun)
    {
        int i = from;
        while (i < word.length() && isVowel(word.charAt(i)) == vowelRun)
        {
            i++;
        }
        return i;
    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * RV is left at 0 when the word has no RV region. R1 and R2 are set to the
     * word length when the respective region is empty, so that no ending can
     * be considered to lie inside an empty region.
     */
    private void markPositions(String word)
    {
        int lastIndex = word.length() - 1;
        RV = 0;
        R1 = word.length();
        R2 = word.length();

        // RV starts after the first vowel
        int i = skipRun(word, 0, false) + 1;
        if (lastIndex < i)
        {
            return; // RV zone is empty
        }
        RV = i;

        // R1 starts after the first non-vowel that follows the vowels at RV
        i = skipRun(word, i, true) + 1;
        if (lastIndex < i)
        {
            return; // R1 zone is empty
        }
        R1 = i;

        // R2 starts after the first non-vowel that follows a vowel inside R1
        i = skipRun(word, i, false) + 1;
        if (lastIndex < i)
        {
            return; // R2 zone is empty
        }
        i = skipRun(word, i, true) + 1;
        if (lastIndex < i)
        {
            return; // R2 zone is empty
        }
        R2 = i;
    }

    /**
     * Checks if character is a vowel.
     *
     * @param letter character in the encoding described by charset
     * @return true if the letter is one of the vowels
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Noun endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an ending was removed
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an ending was removed
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone,
                perfectiveGerundEndings1,
                perfectiveGerund1Predessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an ending was removed
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Removes the last character of the zone if it is the given letter.
     *
     * @param letter letter index (one of the constants of this class)
     * @return true if a character was removed
     */
    private boolean removeTrailing(StringBuffer stemmingZone, char letter)
    {
        int length = stemmingZone.length();
        if (length > 0 && stemmingZone.charAt(length - 1) == charset[letter])
        {
            stemmingZone.setLength(length - 1);
            return true;
        }
        return false;
    }

    /**
     * Removes a trailing I from the stemming zone.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if the letter was removed
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        return removeTrailing(stemmingZone, I);
    }

    /**
     * Removes a trailing soft sign from the stemming zone.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if the letter was removed
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        return removeTrailing(stemmingZone, SOFT);
    }

    /**
     * Sets the charset used to translate letter indices into characters.
     *
     * @param newCharset characters of the Russian alphabet, indexed by the
     *                   letter constants of this class
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Finds the stem for given Russian word.
     *
     * @param input the word, written with the characters of the current charset
     * @return the stem, or the input itself if it has no RV region
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
        {
            return input; //RV wasn't detected, nothing to stem
        }
        // stemming goes on in RV
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        // Step 1: a perfective gerund ending, or otherwise an optional reflexive
        // ending followed by the first of adjectival / verb / noun that applies
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an ending was removed
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N: a zone ending in NN loses its last N.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if a letter was removed
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        if (findEnding(stemmingZone, doubleNEndings) == 0)
        {
            return false;
        }
        // drop only one of the two letters
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Verb endings.
     *
     * @param stemmingZone the RV part of the word; shortened in place on a match
     * @return true if an ending was removed
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone,
                verbEndings1,
                verb1Predessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets.
     * Uses a fresh stemmer instance for every call.
     *
     * @param theWord the word to stem
     * @param charset characters of the Russian alphabet, indexed by the letter
     *                constants of this class
     * @return the stem of the word
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}
|