0001: package org.apache.lucene.analysis.br;
0002:
0003: /**
0004: * Licensed to the Apache Software Foundation (ASF) under one or more
0005: * contributor license agreements. See the NOTICE file distributed with
0006: * this work for additional information regarding copyright ownership.
0007: * The ASF licenses this file to You under the Apache License, Version 2.0
0008: * (the "License"); you may not use this file except in compliance with
0009: * the License. You may obtain a copy of the License at
0010: *
0011: * http://www.apache.org/licenses/LICENSE-2.0
0012: *
0013: * Unless required by applicable law or agreed to in writing, software
0014: * distributed under the License is distributed on an "AS IS" BASIS,
0015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0016: * See the License for the specific language governing permissions and
0017: * limitations under the License.
0018: */
0019:
/**
 * A stemmer for Brazilian Portuguese words.
 *
 * <p>The stemmer lower-cases a term, folds accented letters to plain ASCII letters,
 * trims at most one leading and one trailing punctuation mark, and then strips noun,
 * adjective and verb endings in five steps.
 *
 * <p>An instance keeps the working state of the most recent {@link #stem(String)} call
 * in its fields (they back {@link #log()}), so one instance must not be used by several
 * threads at the same time.
 */
public class BrazilianStemmer {

  /** Region selector used in {@link #STEP1_RULES}: the ending must lie in R1. */
  private static final String REGION_R1 = "R1";

  /** Region selector used in {@link #STEP1_RULES}: the ending must lie in R2. */
  private static final String REGION_R2 = "R2";

  /** Region selector used in {@link #STEP1_RULES}: the ending must lie in RV. */
  private static final String REGION_RV = "RV";

  /** Characters of which one is trimmed from each end of a term before stemming. */
  private static final String PUNCTUATION = "\"'-,;.?!";

  /**
   * Step 1 rules, tried in order; the first rule that applies wins.
   *
   * <p>Columns: ending, region the ending must lie in, replacement text (empty means
   * "delete"), and the text that must directly precede the ending in the term
   * ({@code null} when there is no such requirement).
   */
  private static final String[][] STEP1_RULES = {
    {"uciones", REGION_R2, "u", null},
    {"imentos", REGION_R2, "", null},
    {"amentos", REGION_R2, "", null},
    {"adores", REGION_R2, "", null},
    {"adoras", REGION_R2, "", null},
    {"logias", REGION_R2, "log", null},
    {"encias", REGION_R2, "ente", null},
    {"amente", REGION_R1, "", null},
    {"idades", REGION_R2, "", null},
    {"acoes", REGION_R2, "", null},
    {"imento", REGION_R2, "", null},
    {"amento", REGION_R2, "", null},
    {"adora", REGION_R2, "", null},
    {"ismos", REGION_R2, "", null},
    {"istas", REGION_R2, "", null},
    {"logia", REGION_R2, "log", null},
    {"ucion", REGION_R2, "u", null},
    {"encia", REGION_R2, "ente", null},
    {"mente", REGION_R2, "", null},
    {"idade", REGION_R2, "", null},
    {"acao", REGION_R2, "", null},
    {"ezas", REGION_R2, "", null},
    {"icos", REGION_R2, "", null},
    {"icas", REGION_R2, "", null},
    {"ismo", REGION_R2, "", null},
    {"avel", REGION_R2, "", null},
    {"ivel", REGION_R2, "", null},
    {"ista", REGION_R2, "", null},
    {"osos", REGION_R2, "", null},
    {"osas", REGION_R2, "", null},
    {"ador", REGION_R2, "", null},
    {"ivas", REGION_R2, "", null},
    {"ivos", REGION_R2, "", null},
    {"iras", REGION_RV, "ir", "e"},
    {"eza", REGION_R2, "", null},
    {"ico", REGION_R2, "", null},
    {"ica", REGION_R2, "", null},
    {"oso", REGION_R2, "", null},
    {"osa", REGION_R2, "", null},
    {"iva", REGION_R2, "", null},
    {"ivo", REGION_R2, "", null},
    {"ira", REGION_RV, "ir", "e"},
  };

  /**
   * Verb endings removed by step 2, longest first. Endings of equal length can never
   * match the same word, so only the ordering between lengths matters.
   */
  private static final String[] STEP2_SUFFIXES = {
    // length 7
    "issemos", "essemos", "assemos", "ariamos", "eriamos", "iriamos",
    // length 6
    "iremos", "eremos", "aremos", "avamos", "iramos", "eramos", "aramos",
    "asseis", "esseis", "isseis", "arieis", "erieis", "irieis",
    // length 5
    "irmos", "iamos", "armos", "ermos", "areis", "ereis", "ireis",
    "asses", "esses", "isses", "astes", "assem", "essem", "issem",
    "ardes", "erdes", "irdes", "ariam", "eriam", "iriam",
    "arias", "erias", "irias", "estes", "istes", "aveis",
    // length 4
    "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este", "iste",
    "arei", "erei", "irei", "aram", "eram", "iram", "avam",
    "arem", "erem", "irem", "ando", "endo", "indo", "arao", "erao", "irao",
    "adas", "idas", "aras", "eras", "iras", "avas", "ares", "eres", "ires",
    "ados", "idos", "amos", "emos", "imos", "ieis",
    // length 3
    "ada", "ida", "ara", "era", "ava", "iam", "ado", "ido",
    "ias", "ais", "eis", "ira", "ear",
    // length 2
    "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is", "eu", "iu", "ou",
  };

  /** Residual endings removed by step 4, tried in this order. */
  private static final String[] STEP4_SUFFIXES = {"os", "a", "i", "o"};

  /** Debug label of the last stemmed term: the raw input, a semicolon, the normalized term. */
  private String TERM;

  /** Changed term: the normalized input that the steps shorten in place. */
  private String CT;

  /** Region R1 of the normalized term, or {@code null} if it does not exist. */
  private String R1;

  /** Region R2 (the R1 of R1), or {@code null} if it does not exist. */
  private String R2;

  /** Region RV of the normalized term, or {@code null} if it does not exist. */
  private String RV;

  public BrazilianStemmer() {
  }

  /**
   * Stems the given term to a unique <i>discriminator</i>.
   *
   * <p>R1, R2 and RV are computed once from the normalized term and are not recomputed
   * while the steps shorten it, so steps 3 and 5 look at the endings of the original
   * normalized term.
   *
   * @param term the term that should be stemmed; may be {@code null}
   * @return {@code null} if {@code term} is {@code null} or its normalized form is not
   *     between 3 and 29 characters long; the normalized term unchanged if it contains
   *     a non-letter character; otherwise the stem
   */
  protected String stem(String term) {
    // Forget the previous call so that log() never mixes the state of two terms.
    TERM = null;
    R1 = null;
    R2 = null;
    RV = null;

    if (term == null) {
      CT = null;
      return null;
    }

    createCT(term);

    if (!isIndexable(CT)) {
      return null;
    }
    if (!isStemmable(CT)) {
      return CT;
    }

    R1 = getR1(CT);
    R2 = getR1(R1);
    RV = getRV(CT);
    TERM = term + ";" + CT;

    // Step 2 (verb endings) only runs when step 1 (standard endings) changed nothing.
    boolean altered = step1() || step2();

    if (altered) {
      step3();
    } else {
      step4();
    }

    step5();

    return CT;
  }

  /**
   * Checks whether a term can be processed by the stemming steps.
   *
   * @return true if, and only if, every character of the term is a letter
   */
  private boolean isStemmable(String term) {
    for (int k = 0; k < term.length(); k++) {
      if (!Character.isLetter(term.charAt(k))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Checks whether a term has an acceptable length.
   *
   * @return true if the term is between 3 and 29 characters long
   */
  private boolean isIndexable(String term) {
    return (term.length() < 30) && (term.length() > 2);
  }

  /**
   * Tells whether a character is one of 'a', 'e', 'i', 'o', 'u'.
   *
   * @return true if the character is a lower-case unaccented vowel
   */
  private boolean isVowel(char value) {
    return "aeiou".indexOf(value) >= 0;
  }

  /**
   * Returns the first index at or after {@code from} and before {@code limit} whose
   * character is a vowel (when {@code wantVowel} is true) or a non-vowel (when false).
   *
   * @return the index found, or a value not smaller than {@code limit} if there is none
   */
  private int scan(String value, int from, int limit, boolean wantVowel) {
    int pos = from;
    while (pos < limit && isVowel(value.charAt(pos)) != wantVowel) {
      pos++;
    }
    return pos;
  }

  /**
   * Gets R1, the region after the first non-vowel following a vowel.
   *
   * <p>The last character is never considered as the boundary, so an R1 that would be
   * empty is reported as {@code null}.
   *
   * @return null if {@code value} is null or has no such region, else the region
   */
  private String getR1(String value) {
    if (value == null) {
      return null;
    }

    final int last = value.length() - 1;
    int pos = scan(value, 0, last, true);  // first vowel
    pos = scan(value, pos, last, false);   // first non-vowel from there on

    return pos < last ? value.substring(pos + 1) : null;
  }

  /**
   * Gets RV.
   *
   * <p>If the second letter is a consonant, RV is the region after the next following
   * vowel. If the first two letters are vowels, RV is the region after the next
   * consonant. When neither search succeeds, RV is the region after the third letter,
   * provided the word has more than three letters.
   *
   * @return null if {@code value} is null or has no such region, else the region
   */
  private String getRV(String value) {
    if (value == null) {
      return null;
    }

    final int last = value.length() - 1;

    if ((last > 0) && !isVowel(value.charAt(1))) {
      int vowelAt = scan(value, 2, last, true);
      if (vowelAt < last) {
        return value.substring(vowelAt + 1);
      }
    }

    if ((last > 1) && isVowel(value.charAt(0)) && isVowel(value.charAt(1))) {
      int consonantAt = scan(value, 2, last, false);
      if (consonantAt < last) {
        return value.substring(consonantAt + 1);
      }
    }

    return last > 2 ? value.substring(3) : null;
  }

  /**
   * Normalizes a term: lower-cases it and replaces accented vowels, c-cedilla and
   * n-tilde by their unaccented letters.
   *
   * @return null if {@code value} is null, else the normalized string
   */
  private String changeTerm(String value) {
    if (value == null) {
      return null;
    }

    // Locale.ROOT keeps the result independent of the JVM default locale
    // (a Turkish default locale would otherwise lower-case 'I' to a dotless i).
    String lowered = value.toLowerCase(java.util.Locale.ROOT);
    StringBuilder folded = new StringBuilder(lowered.length());
    for (int k = 0; k < lowered.length(); k++) {
      folded.append(foldAccent(lowered.charAt(k)));
    }
    return folded.toString();
  }

  /**
   * Maps one lower-case accented letter to its unaccented letter. The letters are
   * written as Unicode escapes so the source compiles under any file encoding.
   *
   * @return the unaccented letter, or {@code c} itself if it is not in the table
   */
  private static char foldAccent(char c) {
    switch (c) {
      case '\u00e1': // a acute
      case '\u00e2': // a circumflex
      case '\u00e3': // a tilde
        return 'a';
      case '\u00e9': // e acute
      case '\u00ea': // e circumflex
        return 'e';
      case '\u00ed': // i acute
        return 'i';
      case '\u00f3': // o acute
      case '\u00f4': // o circumflex
      case '\u00f5': // o tilde
        return 'o';
      case '\u00fa': // u acute
      case '\u00fc': // u diaeresis
        return 'u';
      case '\u00e7': // c cedilla
        return 'c';
      case '\u00f1': // n tilde
        return 'n';
      default:
        return c;
    }
  }

  /**
   * Checks whether a string ends with a suffix.
   *
   * @return true if both arguments are non-null and {@code value} ends with
   *     {@code suffix}
   */
  private boolean suffix(String value, String suffix) {
    return (value != null) && (suffix != null) && value.endsWith(suffix);
  }

  /**
   * Replaces a string suffix by another.
   *
   * @return {@code value} with {@code toReplace} swapped for {@code changeTo}, or
   *     {@code value} itself if any argument is null or nothing was removed
   */
  private String replaceSuffix(String value, String toReplace,
      String changeTo) {
    if ((value == null) || (toReplace == null) || (changeTo == null)) {
      return value;
    }

    String stripped = removeSuffix(value, toReplace);
    return value.equals(stripped) ? value : stripped + changeTo;
  }

  /**
   * Removes a string suffix.
   *
   * @return {@code value} without {@code toRemove}, or {@code value} itself if it is
   *     null or does not end with {@code toRemove}
   */
  private String removeSuffix(String value, String toRemove) {
    if (!suffix(value, toRemove)) {
      return value;
    }
    return value.substring(0, value.length() - toRemove.length());
  }

  /**
   * Checks whether a suffix is directly preceded by a given string.
   *
   * @return true if {@code value} ends with {@code preceded} followed by {@code suffix}
   */
  private boolean suffixPreceded(String value, String suffix,
      String preceded) {
    if ((preceded == null) || !suffix(value, suffix)) {
      return false;
    }
    return suffix(removeSuffix(value, suffix), preceded);
  }

  /**
   * Creates CT (the changed term): normalizes {@code term} and trims at most one
   * punctuation mark from its start and one from its end. A term that is, or becomes,
   * shorter than two characters is left as it is.
   */
  private void createCT(String term) {
    CT = changeTerm(term);

    if ((CT == null) || (CT.length() < 2)) {
      return;
    }

    if (PUNCTUATION.indexOf(CT.charAt(0)) >= 0) {
      CT = CT.substring(1);
    }

    if (CT.length() < 2) {
      return;
    }

    if (PUNCTUATION.indexOf(CT.charAt(CT.length() - 1)) >= 0) {
      CT = CT.substring(0, CT.length() - 1);
    }
  }

  /**
   * Returns the region field selected by one of the {@code REGION_*} constants.
   */
  private String region(String name) {
    if (REGION_R1.equals(name)) {
      return R1;
    }
    if (REGION_R2.equals(name)) {
      return R2;
    }
    return RV;
  }

  /**
   * Standard suffix removal: applies the first rule of {@link #STEP1_RULES} whose
   * ending terminates both CT and the rule's region (and, where the rule asks for it,
   * is preceded by the required text).
   *
   * @return false if no ending was removed
   */
  private boolean step1() {
    if (CT == null) {
      return false;
    }

    for (int k = 0; k < STEP1_RULES.length; k++) {
      String[] rule = STEP1_RULES[k];
      String ending = rule[0];

      if (!suffix(CT, ending) || !suffix(region(rule[1]), ending)) {
        continue;
      }
      if ((rule[3] != null) && !suffixPreceded(CT, ending, rule[3])) {
        continue;
      }

      CT = replaceSuffix(CT, ending, rule[2]);
      return true;
    }

    return false;
  }

  /**
   * Verb suffixes: searches for the longest ending of {@link #STEP2_SUFFIXES} in RV
   * and, if found, deletes it from CT.
   *
   * @return false if no ending was removed
   */
  private boolean step2() {
    if (RV == null) {
      return false;
    }

    for (int k = 0; k < STEP2_SUFFIXES.length; k++) {
      String ending = STEP2_SUFFIXES[k];
      if (suffix(RV, ending)) {
        CT = removeSuffix(CT, ending);
        return true;
      }
    }

    return false;
  }

  /**
   * Deletes the suffix 'i' from CT if RV ends with 'i' preceded by 'c'.
   */
  private void step3() {
    if (RV == null) {
      return;
    }

    if (suffix(RV, "i") && suffixPreceded(RV, "i", "c")) {
      CT = removeSuffix(CT, "i");
    }
  }

  /**
   * Residual suffix: if RV ends with one of the endings "os", "a", "i", "o", deletes
   * the first one that matches from CT. Accented variants need no entry because
   * changeTerm has already folded them to plain letters.
   */
  private void step4() {
    if (RV == null) {
      return;
    }

    for (int k = 0; k < STEP4_SUFFIXES.length; k++) {
      String ending = STEP4_SUFFIXES[k];
      if (suffix(RV, ending)) {
        CT = removeSuffix(CT, ending);
        return;
      }
    }
  }

  /**
   * If RV ends with 'e', deletes a final 'e' from CT; when that 'e' is preceded in RV
   * by "gu" (or "ci"), also deletes a then-final 'u' (or 'i') from CT.
   */
  private void step5() {
    if ((RV == null) || !suffix(RV, "e")) {
      return;
    }

    CT = removeSuffix(CT, "e");

    if (suffixPreceded(RV, "e", "gu")) {
      CT = removeSuffix(CT, "u");
    } else if (suffixPreceded(RV, "e", "ci")) {
      CT = removeSuffix(CT, "i");
    }
  }

  /**
   * For log and debug purposes.
   *
   * @return a string showing TERM, CT, RV, R1 and R2 of the last stemmed term
   */
  public String log() {
    return " (TERM = " + TERM + ")" + " (CT = " + CT + ")"
        + " (RV = " + RV + ")" + " (R1 = " + R1 + ")"
        + " (R2 = " + R2 + ")";
  }

}
|