0001: package org.apache.lucene.analysis.fr;
0002:
0003: /* ====================================================================
0004:
0005: * The Apache Software License, Version 1.1
0006:
0007: *
0008:
0009: * Copyright (c) 2001 The Apache Software Foundation. All rights
0010:
0011: * reserved.
0012:
0013: *
0014:
0015: * Redistribution and use in source and binary forms, with or without
0016:
0017: * modification, are permitted provided that the following conditions
0018:
0019: * are met:
0020:
0021: *
0022:
0023: * 1. Redistributions of source code must retain the above copyright
0024:
0025: * notice, this list of conditions and the following disclaimer.
0026:
0027: *
0028:
0029: * 2. Redistributions in binary form must reproduce the above copyright
0030:
0031: * notice, this list of conditions and the following disclaimer in
0032:
0033: * the documentation and/or other materials provided with the
0034:
0035: * distribution.
0036:
0037: *
0038:
0039: * 3. The end-user documentation included with the redistribution,
0040:
0041: * if any, must include the following acknowledgment:
0042:
0043: * "This product includes software developed by the
0044:
0045: * Apache Software Foundation (http://www.apache.org/)."
0046:
0047: * Alternately, this acknowledgment may appear in the software itself,
0048:
0049: * if and wherever such third-party acknowledgments normally appear.
0050:
0051: *
0052:
0053: * 4. The names "Apache" and "Apache Software Foundation" and
0054:
0055: * "Apache Lucene" must not be used to endorse or promote products
0056:
0057: * derived from this software without prior written permission. For
0058:
0059: * written permission, please contact apache@apache.org.
0060:
0061: *
0062:
0063: * 5. Products derived from this software may not be called "Apache",
0064:
0065: * "Apache Lucene", nor may "Apache" appear in their name, without
0066:
0067: * prior written permission of the Apache Software Foundation.
0068:
0069: *
0070:
0071: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0072:
0073: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0074:
0075: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0076:
0077: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0078:
0079: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0080:
0081: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0082:
0083: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0084:
0085: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0086:
0087: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0088:
0089: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0090:
0091: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0092:
0093: * SUCH DAMAGE.
0094:
0095: * ====================================================================
0096:
0097: *
0098:
0099: * This software consists of voluntary contributions made by many
0100:
0101: * individuals on behalf of the Apache Software Foundation. For more
0102:
0103: * information on the Apache Software Foundation, please see
0104:
0105: * <http://www.apache.org/>.
0106:
0107: */
0108:
/**
 * A stemmer for French words. The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details.
 *
 * <p>An instance keeps the word being stemmed, and the regions derived from
 * it, in mutable fields that are overwritten by every call to
 * {@link #stem(String)}; an instance must therefore not be shared between
 * threads without external synchronization.</p>
 *
 * <p>While stemming, the letters <code>i</code>, <code>u</code> and
 * <code>y</code> are temporarily written in upper case when they must be
 * treated as consonants. Apart from a trailing <code>Y</code> (see step 3)
 * these markers are not converted back, so a returned stem may contain
 * upper-case <code>I</code>, <code>U</code> or <code>Y</code>.</p>
 *
 * Changelog: 2005/06/06: changed isVowel(char) for unicode support (jf@teamskill.de)
 *
 * @author Patrick Talbot
 */
public class FrenchStemmer {

    // ------------------------------------------------------------------
    // Suffix tables. Accented letters are written as Unicode escapes so
    // that the compiled strings do not depend on the source file encoding.
    // Within each table the order matters: the first matching entry wins,
    // so longer suffixes are listed before the shorter ones they contain.
    // ------------------------------------------------------------------

    /** Step 1: suffixes that are simply removed when found in R2. */
    private static final String[] STEP1_DELETE = {
        "ances", "iqUes", "ismes", "ables", "istes",
        "ance", "iqUe", "isme", "able", "iste" };

    /** Step 1: replaced by "log" when found in R2. */
    private static final String[] LOGIE = { "logies", "logie" };

    /** Step 1: replaced by "u" when found in R2. */
    private static final String[] USION = { "usions", "utions", "usion", "ution" };

    /** Step 1: replaced by "ent" when found in R2. */
    private static final String[] ENCE = { "ences", "ence" };

    /** Step 1: "-ation" family. */
    private static final String[] ATION = {
        "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };

    /** Step 1: "-ement" family. */
    private static final String[] EMENT = { "ements", "ement" };

    /** Step 1: "-issement" family. */
    private static final String[] ISSEMENT = { "issements", "issement" };

    /** Step 1: "-ite" family (final letter is e-acute). */
    private static final String[] ITE = { "it\u00e9s", "it\u00e9" };

    /** Step 1: "-if" family. */
    private static final String[] IF = { "ifs", "ives", "if", "ive" };

    private static final String[] EAUX = { "eaux" };
    private static final String[] AUX = { "aux" };
    private static final String[] EUSE = { "euses", "euse" };
    private static final String[] EUX = { "eux" };
    private static final String[] AMMENT = { "amment" };
    private static final String[] EMMENT = { "emment" };
    private static final String[] MENT = { "ments", "ment" };

    /** Step 2a: verb suffixes beginning with i (or i-circumflex). */
    private static final String[] STEP2A = {
        "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira",
        "irent", "iriez", "irez", "irions", "irons", "iront",
        "issaIent", "issais", "issantes", "issante", "issants", "issant",
        "issait", "issions", "issons", "issiez", "issez", "issent",
        "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i" };

    /** Step 2b: verb suffixes that are simply removed when found in RV. */
    private static final String[] STEP2B_DELETE = {
        "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", "\u00e8rent",
        "era", "\u00e9es", "iez", "\u00e9e", "\u00e9s", "er", "ez", "\u00e9" };

    /** Step 2b: verb suffixes removed together with a preceding "e", if any. */
    private static final String[] STEP2B_A = {
        "assions", "assiez", "assent", "asses", "asse", "aIent",
        "antes", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant",
        "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as",
        "ai", "Ai", "a" };

    private static final String[] IONS = { "ions" };
    private static final String[] ION = { "ion" };

    /** Step 4: "-ier" family, replaced by "i" (third letter may be e-grave). */
    private static final String[] IER = { "I\u00e8re", "i\u00e8re", "Ier", "ier" };

    private static final String[] E = { "e" };

    /** Step 4: e-diaeresis, removed when preceded by "gu". */
    private static final String[] E_DIAERESIS = { "\u00eb" };

    /**
     * Buffer for the terms while stemming them.
     */
    private final StringBuffer sb = new StringBuffer();

    /**
     * A temporary buffer, used to reconstruct R2
     */
    private final StringBuffer tb = new StringBuffer();

    /**
     * Region R0 is equal to the whole buffer
     */
    private String R0;

    /**
     * Region RV
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."
     */
    private String RV;

    /**
     * Region R1
     * "R1 is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R1;

    /**
     * Region R2
     * "R2 is the region after the first non-vowel in R1 following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R2;

    /**
     * Set to true if we need to perform step 2
     */
    private boolean suite;

    /**
     * Set to true if the buffer was modified
     */
    private boolean modified;

    /**
     * Stems the given term to a unique <code>discriminator</code>.
     *
     * <p>Terms rejected by {@link #isStemmable(String)} are returned unchanged.
     * All other terms are lower-cased and run through steps 1 to 6 of the
     * algorithm.</p>
     *
     * @param term the term that should be stemmed; must not be null
     * @return discriminator for <code>term</code>
     */
    protected String stem(String term) {
        if (!isStemmable(term)) {
            return term;
        }

        // Lower-case with a fixed locale: the default-locale variant maps
        // 'I' to a dotless i under e.g. a Turkish default locale, which
        // would corrupt every suffix comparison below.
        term = term.toLowerCase(java.util.Locale.FRENCH);

        // Reset the working buffer and the per-word flags.
        sb.setLength(0);
        sb.append(term);
        modified = false;
        suite = false;

        treatVowels(sb);
        setStrings();

        step1();

        // Step 2 runs when step 1 changed nothing, or when it removed one
        // of the amment / emment / ment(s) suffixes (flagged by "suite").
        if ((!modified || suite) && RV != null) {
            suite = step2a();
            if (!suite) {
                step2b();
            }
        }

        if (modified || suite) {
            step3();
        } else {
            step4();
        }

        step5();
        step6();

        return sb.toString();
    }

    /**
     * Sets the search region Strings<br>
     * it needs to be done each time the buffer was modified
     */
    private void setStrings() {
        R0 = sb.toString();
        RV = retrieveRV(sb);
        R1 = retrieveR(sb);
        if (R1 != null) {
            // R2 is obtained by applying the R1 rule to R1 itself.
            tb.setLength(0);
            tb.append(R1);
            R2 = retrieveR(tb);
        } else {
            R2 = null;
        }
    }

    /**
     * First step of the Porter Algorithm (standard suffix removal)<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * <p>The region fields are re-read for every call below because each
     * successful edit refreshes them through {@link #setStrings()}.</p>
     */
    private void step1() {
        deleteFrom(R2, STEP1_DELETE);

        replaceFrom(R2, LOGIE, "log");
        replaceFrom(R2, USION, "u");
        replaceFrom(R2, ENCE, "ent");

        deleteButSuffixFromElseReplace(R2, ATION, "ic", true, R0, "iqU");

        deleteButSuffixFromElseReplace(R2, EMENT, "eus", false, R0, "eux");
        deleteButSuffixFrom(R2, EMENT, "ativ", false);
        deleteButSuffixFrom(R2, EMENT, "iv", false);
        deleteButSuffixFrom(R2, EMENT, "abl", false);
        deleteButSuffixFrom(R2, EMENT, "iqU", false);

        deleteFromIfTestVowelBeforeIn(R1, ISSEMENT, false, R0);
        deleteFrom(RV, EMENT);

        deleteButSuffixFromElseReplace(R2, ITE, "abil", false, R0, "abl");
        deleteButSuffixFromElseReplace(R2, ITE, "ic", false, R0, "iqU");
        deleteButSuffixFrom(R2, ITE, "iv", true);

        deleteButSuffixFromElseReplace(R2, IF, "icat", false, R0, "iqU");
        deleteButSuffixFromElseReplace(R2, IF, "at", true, R2, "iqU");

        replaceFrom(R0, EAUX, "eau");
        replaceFrom(R1, AUX, "al");

        deleteButSuffixFromElseReplace(R2, EUSE, "", true, R1, "eux");
        deleteFrom(R2, EUX);

        // If one of the next edits is performed, step 2a must still run.
        if (replaceFrom(RV, AMMENT, "ant")) {
            suite = true;
        }
        if (replaceFrom(RV, EMMENT, "ent")) {
            suite = true;
        }
        if (deleteFromIfTestVowelBeforeIn(RV, MENT, true, RV)) {
            suite = true;
        }
    }

    /**
     * Second step (A) of the Porter Algorithm<br>
     * Will be performed if nothing changed from the first step
     * or changes were done in the amment, emment, ments or ment suffixes<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * @return boolean - true if something changed in the StringBuffer
     */
    private boolean step2a() {
        return deleteFromIfTestVowelBeforeIn(RV, STEP2A, false, RV);
    }

    /**
     * Second step (B) of the Porter Algorithm<br>
     * Will be performed if step 2 A was performed unsuccessfully<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step2b() {
        deleteFrom(RV, STEP2B_DELETE);
        deleteButSuffixFrom(RV, STEP2B_A, "e", true);
        deleteFrom(R2, IONS);
    }

    /**
     * Third step of the Porter Algorithm<br>
     * Replaces a final Y marker by i, or a final c-cedilla by c.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step3() {
        int last = sb.length() - 1;
        if (last < 0) {
            return;
        }
        char ch = sb.charAt(last);
        if (ch == 'Y') {
            sb.setCharAt(last, 'i');
            setStrings();
        } else if (ch == '\u00e7') { // c-cedilla
            sb.setCharAt(last, 'c');
            setStrings();
        }
    }

    /**
     * Fourth step of the Porter Algorithm (residual suffixes)<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step4() {
        // Drop a final 's' unless the letter before it is one of
        // a, i, o, u, e-grave or s.
        int len = sb.length();
        if (len > 1 && sb.charAt(len - 1) == 's') {
            char b = sb.charAt(len - 2);
            if (b != 'a' && b != 'i' && b != 'o' && b != 'u'
                    && b != '\u00e8' && b != 's') {
                sb.setLength(len - 1);
                setStrings();
            }
        }

        // "ion" is removed from R2 only when preceded by s or t in RV.
        if (!deleteFromIfPrecededIn(R2, ION, RV, "s")) {
            deleteFromIfPrecededIn(R2, ION, RV, "t");
        }

        replaceFrom(RV, IER, "i");
        deleteFrom(RV, E);
        deleteFromIfPrecededIn(RV, E_DIAERESIS, R0, "gu");
    }

    /**
     * Fifth step of the Porter Algorithm (un-doubling)<br>
     * Removes the last letter of a word ending in enn, onn, ett, ell or eill.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step5() {
        if (R0 == null) {
            return;
        }
        if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett")
                || R0.endsWith("ell") || R0.endsWith("eill")) {
            sb.setLength(sb.length() - 1);
            setStrings();
        }
    }

    /**
     * Sixth (and last!) step of the Porter Algorithm (un-accenting)<br>
     * If the word ends with one or more non-vowels and the vowel right before
     * them is e-acute or e-grave, that vowel is replaced by a plain e.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step6() {
        if (R0 == null || R0.length() == 0) {
            return;
        }
        int last = R0.length() - 1;

        // Walk back over the trailing run of non-vowels.
        int i = last;
        while (i >= 0 && !isVowel(R0.charAt(i))) {
            i--;
        }

        // Nothing to do without a vowel, or without a non-vowel after it.
        if (i < 0 || i == last) {
            return;
        }

        char ch = R0.charAt(i);
        if (ch == '\u00e9' || ch == '\u00e8') { // e-acute, e-grave
            sb.setCharAt(i, 'e');
        }
    }

    /**
     * Removes the last <code>count</code> characters from the working buffer.
     *
     * @param count number of trailing characters to drop
     */
    private void dropLast(int count) {
        sb.setLength(sb.length() - count);
    }

    /**
     * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string.
     * Unlike the other editing helpers this one does not raise the
     * <code>modified</code> flag.
     *
     * @param source the primary source zone for search (may be null)
     * @param search the strings to search for suppression
     * @param from the secondary source zone for search (may be null)
     * @param prefix the prefix to add to the search string to test
     * @return boolean - true if modified
     */
    private boolean deleteFromIfPrecededIn(String source, String[] search,
            String from, String prefix) {
        if (source == null || from == null) {
            return false;
        }
        for (int i = 0; i < search.length; i++) {
            String suffix = search[i];
            if (source.endsWith(suffix) && from.endsWith(prefix + suffix)) {
                dropLast(suffix.length());
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
     *
     * @param source the primary source zone for search (may be null)
     * @param search the strings to search for suppression
     * @param vowel true if we need a vowel before the search string
     * @param from the secondary source zone for search (where vowel could be); may be null
     * @return boolean - true if modified
     */
    private boolean deleteFromIfTestVowelBeforeIn(String source, String[] search,
            boolean vowel, String from) {
        if (source == null || from == null) {
            return false;
        }
        for (int i = 0; i < search.length; i++) {
            String suffix = search[i];
            // The tested letter must itself lie inside "from".
            if (!source.endsWith(suffix) || suffix.length() + 1 > from.length()) {
                continue;
            }
            char before = sb.charAt(sb.length() - (suffix.length() + 1));
            if (isVowel(before) == vowel) {
                dropLast(suffix.length());
                modified = true;
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by the prefix
     * (the prefix is deleted as well).
     *
     * @param source the primary source zone for search (may be null)
     * @param search the strings to search for suppression
     * @param prefix the prefix to add to the search string to test
     * @param without true if it will be deleted even without prefix found
     */
    private void deleteButSuffixFrom(String source, String[] search,
            String prefix, boolean without) {
        if (source == null) {
            return;
        }
        for (int i = 0; i < search.length; i++) {
            String suffix = search[i];
            int cut;
            if (source.endsWith(prefix + suffix)) {
                cut = prefix.length() + suffix.length();
            } else if (without && source.endsWith(suffix)) {
                cut = suffix.length();
            } else {
                continue;
            }
            dropLast(cut);
            modified = true;
            setStrings();
            return;
        }
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by prefix<br>
     * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
     * or delete the suffix if specified
     *
     * @param source the primary source zone for search (may be null)
     * @param search the strings to search for suppression
     * @param prefix the prefix to add to the search string to test
     * @param without true if it will be deleted even without prefix found
     * @param from the secondary source zone for search (may be null)
     * @param replace the text that replaces prefix + suffix when they are
     *        found in "from" but not in "source"
     */
    private void deleteButSuffixFromElseReplace(String source, String[] search,
            String prefix, boolean without, String from, String replace) {
        if (source == null) {
            return;
        }
        for (int i = 0; i < search.length; i++) {
            String suffix = search[i];
            String combined = prefix + suffix;
            if (source.endsWith(combined)) {
                dropLast(combined.length());
            } else if (from != null && from.endsWith(combined)) {
                int len = sb.length();
                sb.replace(len - combined.length(), len, replace);
            } else if (without && source.endsWith(suffix)) {
                dropLast(suffix.length());
            } else {
                continue;
            }
            modified = true;
            setStrings();
            return;
        }
    }

    /**
     * Replace a search string with another within the source zone
     *
     * @param source the source zone for search (may be null)
     * @param search the strings to search for replacement
     * @param replace the replacement string
     * @return boolean - true if modified
     */
    private boolean replaceFrom(String source, String[] search, String replace) {
        if (source == null) {
            return false;
        }
        for (int i = 0; i < search.length; i++) {
            String suffix = search[i];
            if (source.endsWith(suffix)) {
                int len = sb.length();
                sb.replace(len - suffix.length(), len, replace);
                modified = true;
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a search string within the source zone
     *
     * @param source the source zone for search (may be null)
     * @param suffix the strings to search for suppression
     */
    private void deleteFrom(String source, String[] suffix) {
        if (source == null) {
            return;
        }
        for (int i = 0; i < suffix.length; i++) {
            if (source.endsWith(suffix[i])) {
                dropLast(suffix[i].length());
                modified = true;
                setStrings();
                return;
            }
        }
    }

    /**
     * Test if a char is a french vowel, including accentuated ones.
     * Only lower-case letters are recognised, so the upper-case markers
     * I, U and Y written by {@link #treatVowels(StringBuffer)} count as
     * non-vowels.
     *
     * @param ch the char to test
     * @return boolean - true if the char is a vowel
     */
    private boolean isVowel(char ch) {
        switch (ch) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case '\u00e2': // a-circumflex
            case '\u00e0': // a-grave
            case '\u00eb': // e-diaeresis
            case '\u00e9': // e-acute
            case '\u00ea': // e-circumflex
            case '\u00e8': // e-grave
            case '\u00ef': // i-diaeresis
            case '\u00ee': // i-circumflex
            case '\u00f4': // o-circumflex
            case '\u00fc': // u-diaeresis
            case '\u00f9': // u-grave
            case '\u00fb': // u-circumflex
                return true;
            default:
                return false;
        }
    }

    /**
     * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
     * "R is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"<br>
     *
     * @param buffer the in buffer
     * @return the resulting string, or null when the region is empty
     */
    private String retrieveR(StringBuffer buffer) {
        int len = buffer.length();

        // Locate the first vowel ...
        int c = 0;
        while (c < len && !isVowel(buffer.charAt(c))) {
            c++;
        }
        // ... then the first non-vowel after it.
        while (c < len && isVowel(buffer.charAt(c))) {
            c++;
        }

        // c is now that non-vowel, or len when either search failed.
        if (c + 1 < len) {
            return buffer.substring(c + 1, len);
        }
        return null;
    }

    /**
     * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."<br>
     * Words of three letters or fewer have no RV. Note that when no vowel
     * exists after the first letter, this implementation returns everything
     * after the first letter.
     *
     * @param buffer the in buffer
     * @return the resulting string, or null when the region is empty
     */
    private String retrieveRV(StringBuffer buffer) {
        int len = buffer.length();
        if (len <= 3) {
            return null;
        }
        if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) {
            return buffer.substring(3, len);
        }

        // First vowel that is not the first letter; stays 0 when none exists.
        int pos = 0;
        for (int c = 1; c < len; c++) {
            if (isVowel(buffer.charAt(c))) {
                pos = c;
                break;
            }
        }
        if (pos + 1 < len) {
            return buffer.substring(pos + 1, len);
        }
        return null;
    }

    /**
     * Turns u and i preceded AND followed by a vowel to UpperCase<br>
     * Turns y preceded OR followed by a vowel to UpperCase<br>
     * Turns u preceded by q to UpperCase<br>
     * The buffer is edited in place from left to right, so a letter that has
     * already been turned to upper case no longer counts as a vowel for the
     * letter that follows it.
     *
     * @param buffer the buffer to treat
     * @return the treated buffer (the same instance that was passed in)
     */
    private StringBuffer treatVowels(StringBuffer buffer) {
        for (int c = 0; c < buffer.length(); c++) {
            char ch = buffer.charAt(c);
            int lastIndex = buffer.length() - 1;

            if (c == 0) {
                // First letter: only y followed by a vowel is marked.
                if (lastIndex > 0 && ch == 'y' && isVowel(buffer.charAt(1))) {
                    buffer.setCharAt(c, 'Y');
                }
            } else if (c == lastIndex) {
                // Last letter: only the preceding letter can be inspected.
                char prev = buffer.charAt(c - 1);
                if (ch == 'u' && prev == 'q') {
                    buffer.setCharAt(c, 'U');
                }
                if (ch == 'y' && isVowel(prev)) {
                    buffer.setCharAt(c, 'Y');
                }
            } else {
                char prev = buffer.charAt(c - 1);
                char next = buffer.charAt(c + 1);
                if (ch == 'u') {
                    if (prev == 'q' || (isVowel(prev) && isVowel(next))) {
                        buffer.setCharAt(c, 'U');
                    }
                } else if (ch == 'i') {
                    if (isVowel(prev) && isVowel(next)) {
                        buffer.setCharAt(c, 'I');
                    }
                } else if (ch == 'y') {
                    if (isVowel(prev) || isVowel(next)) {
                        buffer.setCharAt(c, 'Y');
                    }
                }
            }
        }
        return buffer;
    }

    /**
     * Checks a term if it can be processed correctly.
     *
     * @param term the term to check; must not be null
     * @return boolean - true if, and only if, the given term consists of
     *         letters only and contains no upper-case letter other than,
     *         optionally, its first character.
     */
    private boolean isStemmable(String term) {
        for (int c = 0; c < term.length(); c++) {
            char ch = term.charAt(c);
            // Discard terms that contain non-letter characters.
            if (!Character.isLetter(ch)) {
                return false;
            }
            // An upper-case letter past the first position is either a
            // second capital or a lone capital inside the word; both
            // disqualify the term.
            if (c > 0 && Character.isUpperCase(ch)) {
                return false;
            }
        }
        return true;
    }

}
|