Source Code Cross Referenced for FrenchStemmer.java in » Search-Engine » Lius-0.4 » org » apache » lucene » analysis » fr » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » Lius 0.4 » org.apache.lucene.analysis.fr
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        package org.apache.lucene.analysis.fr;
0002:
0003:        /* ====================================================================
0004:
0005:         * The Apache Software License, Version 1.1
0006:
0007:         *
0008:
0009:         * Copyright (c) 2001 The Apache Software Foundation.  All rights
0010:
0011:         * reserved.
0012:
0013:         *
0014:
0015:         * Redistribution and use in source and binary forms, with or without
0016:
0017:         * modification, are permitted provided that the following conditions
0018:
0019:         * are met:
0020:
0021:         *
0022:
0023:         * 1. Redistributions of source code must retain the above copyright
0024:
0025:         *    notice, this list of conditions and the following disclaimer.
0026:
0027:         *
0028:
0029:         * 2. Redistributions in binary form must reproduce the above copyright
0030:
0031:         *    notice, this list of conditions and the following disclaimer in
0032:
0033:         *    the documentation and/or other materials provided with the
0034:
0035:         *    distribution.
0036:
0037:         *
0038:
0039:         * 3. The end-user documentation included with the redistribution,
0040:
0041:         *    if any, must include the following acknowledgment:
0042:
0043:         *       "This product includes software developed by the
0044:
0045:         *        Apache Software Foundation (http://www.apache.org/)."
0046:
0047:         *    Alternately, this acknowledgment may appear in the software itself,
0048:
0049:         *    if and wherever such third-party acknowledgments normally appear.
0050:
0051:         *
0052:
0053:         * 4. The names "Apache" and "Apache Software Foundation" and
0054:
0055:         *    "Apache Lucene" must not be used to endorse or promote products
0056:
0057:         *    derived from this software without prior written permission. For
0058:
0059:         *    written permission, please contact apache@apache.org.
0060:
0061:         *
0062:
0063:         * 5. Products derived from this software may not be called "Apache",
0064:
0065:         *    "Apache Lucene", nor may "Apache" appear in their name, without
0066:
0067:         *    prior written permission of the Apache Software Foundation.
0068:
0069:         *
0070:
0071:         * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0072:
0073:         * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0074:
0075:         * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0076:
0077:         * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0078:
0079:         * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0080:
0081:         * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0082:
0083:         * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0084:
0085:         * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0086:
0087:         * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0088:
0089:         * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0090:
0091:         * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0092:
0093:         * SUCH DAMAGE.
0094:
0095:         * ====================================================================
0096:
0097:         *
0098:
0099:         * This software consists of voluntary contributions made by many
0100:
0101:         * individuals on behalf of the Apache Software Foundation.  For more
0102:
0103:         * information on the Apache Software Foundation, please see
0104:
0105:         * <http://www.apache.org/>.
0106:
0107:         */
0108:
0109:        /**
0110:
0111:         * A stemmer for French words. The algorithm is based on the work of
0112:
0113:         * Dr Martin Porter on his snowball project<br>
0114:
0115:         * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
0116:
0117:         * (French stemming algorithm) for details
0118:
0119:         *
0120:
0121:         * Changelog: 2005/06/06: changed isVowel(char) for unicode support (jf@teamskill.de) 
0122:
0123:         *
0124:
0125:         * @author    Patrick Talbot
0126:
0127:         */
0128:
0129:        public class FrenchStemmer {
0130:
/**
 * Buffer for the terms while stemming them.
 * It is reset and refilled at the start of every stem() call and is the
 * only place where the word is actually edited.
 */
private StringBuffer sb = new StringBuffer();

/**
 * A temporary buffer, used to reconstruct R2.
 * setStrings() copies R1 into it before asking for the next region.
 */
private StringBuffer tb = new StringBuffer();

/**
 * Region R0 is equal to the whole buffer
 * (a String snapshot of sb taken by setStrings()).
 */
private String R0;

/**
 * Region RV
 * "If the word begins with two vowels, RV is the region after the third letter,
 * otherwise the region after the first vowel not at the beginning of the word,
 * or the end of the word if these positions cannot be found."
 * stem() checks this field for null before running step 2.
 */
private String RV;

/**
 * Region R1
 * "R1 is the region after the first non-vowel following a vowel
 * or is the null region at the end of the word if there is no such non-vowel"
 * setStrings() treats a null value as "no R1" and then also clears R2.
 */
private String R1;

/**
 * Region R2
 * "R2 is the region after the first non-vowel in R1 following a vowel
 * or is the null region at the end of the word if there is no such non-vowel"
 */
private String R2;

/**
 * Set to true if we need to perform step 2.
 * step1() raises it after handling amment / emment / ment(s); stem() then
 * overwrites it with the result of step2a().
 */
private boolean suite;

/**
 * Set to true if the buffer was modified.
 * Cleared at the start of stem() and raised by the suffix helpers that
 * edit sb.
 */
private boolean modified;
0208:
0209:            /**
0210:
0211:             * Stemms the given term to a unique <tt>discriminator</tt>.
0212:
0213:             *
0214:
0215:             * @param term  java.langString The term that should be stemmed
0216:
0217:             * @return java.lang.String  Discriminator for <tt>term</tt>
0218:
0219:             */
0220:
0221:            protected String stem(String term) {
0222:
0223:                if (!isStemmable(term)) {
0224:
0225:                    return term;
0226:
0227:                }
0228:
0229:                // Use lowercase for medium stemming.
0230:
0231:                term = term.toLowerCase();
0232:
0233:                // Reset the StringBuffer.
0234:
0235:                sb.delete(0, sb.length());
0236:
0237:                sb.insert(0, term);
0238:
0239:                // reset the booleans
0240:
0241:                modified = false;
0242:
0243:                suite = false;
0244:
0245:                sb = treatVowels(sb);
0246:
0247:                setStrings();
0248:
0249:                step1();
0250:
0251:                if (!modified || suite) {
0252:
0253:                    if (RV != null) {
0254:
0255:                        suite = step2a();
0256:
0257:                        if (!suite)
0258:
0259:                            step2b();
0260:
0261:                    }
0262:
0263:                }
0264:
0265:                if (modified || suite)
0266:
0267:                    step3();
0268:
0269:                else
0270:
0271:                    step4();
0272:
0273:                step5();
0274:
0275:                step6();
0276:
0277:                return sb.toString();
0278:
0279:            }
0280:
0281:            /**
0282:
0283:             * Sets the search region Strings<br>
0284:
0285:             * it needs to be done each time the buffer was modified
0286:
0287:             */
0288:
0289:            private void setStrings() {
0290:
0291:                // set the strings
0292:
0293:                R0 = sb.toString();
0294:
0295:                RV = retrieveRV(sb);
0296:
0297:                R1 = retrieveR(sb);
0298:
0299:                if (R1 != null) {
0300:
0301:                    tb.delete(0, tb.length());
0302:
0303:                    tb.insert(0, R1);
0304:
0305:                    R2 = retrieveR(tb);
0306:
0307:                }
0308:
0309:                else
0310:
0311:                    R2 = null;
0312:
0313:            }
0314:
0315:            /**
0316:
0317:             * First step of the Porter Algorithmn<br>
0318:
0319:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0320:
0321:             */
0322:
0323:            private void step1() {
0324:
0325:                String[] suffix = {
0326:
0327:                "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe",
0328:                        "isme",
0329:
0330:                        "able", "iste" };
0331:
0332:                deleteFrom(R2, suffix);
0333:
0334:                replaceFrom(R2, new String[] { "logies", "logie" }
0335:
0336:                , "log");
0337:
0338:                replaceFrom(R2, new String[] { "usions", "utions", "usion",
0339:                        "ution" }
0340:
0341:                , "u");
0342:
0343:                replaceFrom(R2, new String[] { "ences", "ence" }
0344:
0345:                , "ent");
0346:
0347:                String[] search = {
0348:
0349:                "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
0350:
0351:                deleteButSuffixFromElseReplace(R2, search, "ic", true, R0,
0352:                        "iqU");
0353:
0354:                deleteButSuffixFromElseReplace(R2, new String[] { "ements",
0355:                        "ement" }
0356:
0357:                , "eus", false, R0, "eux");
0358:
0359:                deleteButSuffixFrom(R2, new String[] { "ements", "ement" }
0360:
0361:                , "ativ", false);
0362:
0363:                deleteButSuffixFrom(R2, new String[] { "ements", "ement" }
0364:
0365:                , "iv", false);
0366:
0367:                deleteButSuffixFrom(R2, new String[] { "ements", "ement" }
0368:
0369:                , "abl", false);
0370:
0371:                deleteButSuffixFrom(R2, new String[] { "ements", "ement" }
0372:
0373:                , "iqU", false);
0374:
0375:                deleteFromIfTestVowelBeforeIn(R1, new String[] { "issements",
0376:                        "issement" }
0377:
0378:                , false, R0);
0379:
0380:                deleteFrom(RV, new String[] { "ements", "ement" });
0381:
0382:                deleteButSuffixFromElseReplace(R2,
0383:                        new String[] { "it�s", "it�" }
0384:
0385:                        , "abil", false, R0, "abl");
0386:
0387:                deleteButSuffixFromElseReplace(R2,
0388:                        new String[] { "it�s", "it�" }
0389:
0390:                        , "ic", false, R0, "iqU");
0391:
0392:                deleteButSuffixFrom(R2, new String[] { "it�s", "it�" }
0393:
0394:                , "iv", true);
0395:
0396:                String[] autre = {
0397:
0398:                "ifs", "ives", "if", "ive" };
0399:
0400:                deleteButSuffixFromElseReplace(R2, autre, "icat", false, R0,
0401:                        "iqU");
0402:
0403:                deleteButSuffixFromElseReplace(R2, autre, "at", true, R2, "iqU");
0404:
0405:                replaceFrom(R0, new String[] { "eaux" }
0406:
0407:                , "eau");
0408:
0409:                replaceFrom(R1, new String[] { "aux" }
0410:
0411:                , "al");
0412:
0413:                deleteButSuffixFromElseReplace(R2, new String[] { "euses",
0414:                        "euse" }
0415:
0416:                , "", true, R1, "eux");
0417:
0418:                deleteFrom(R2, new String[] { "eux" });
0419:
0420:                // if one of the next steps is performed, we will need to perform step2a
0421:
0422:                boolean temp = false;
0423:
0424:                temp = replaceFrom(RV, new String[] { "amment" }
0425:
0426:                , "ant");
0427:
0428:                if (temp == true)
0429:
0430:                    suite = true;
0431:
0432:                temp = replaceFrom(RV, new String[] { "emment" }
0433:
0434:                , "ent");
0435:
0436:                if (temp == true)
0437:
0438:                    suite = true;
0439:
0440:                temp = deleteFromIfTestVowelBeforeIn(RV, new String[] {
0441:                        "ments", "ment" }
0442:
0443:                , true, RV);
0444:
0445:                if (temp == true)
0446:
0447:                    suite = true;
0448:
0449:            }
0450:
0451:            /**
0452:
0453:             * Second step (A) of the Porter Algorithmn<br>
0454:
0455:             * Will be performed if nothing changed from the first step
0456:
0457:             * or changed were done in the amment, emment, ments or ment suffixes<br>
0458:
0459:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0460:
0461:             *
0462:
0463:             * @return boolean - true if something changed in the StringBuffer
0464:
0465:             */
0466:
0467:            private boolean step2a() {
0468:
0469:                String[] search = {
0470:
0471:                "�mes", "�tes", "iraIent", "irait", "irais", "irai", "iras",
0472:                        "ira",
0473:
0474:                        "irent", "iriez", "irez", "irions", "irons", "iront",
0475:
0476:                        "issaIent", "issais", "issantes", "issante", "issants",
0477:                        "issant",
0478:
0479:                        "issait", "issais", "issions", "issons", "issiez",
0480:                        "issez", "issent",
0481:
0482:                        "isses", "isse", "ir", "is", "�t", "it", "ies", "ie",
0483:                        "i" };
0484:
0485:                return deleteFromIfTestVowelBeforeIn(RV, search, false, RV);
0486:
0487:            }
0488:
0489:            /**
0490:
0491:             * Second step (B) of the Porter Algorithmn<br>
0492:
0493:             * Will be performed if step 2 A was performed unsuccessfully<br>
0494:
0495:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0496:
0497:             */
0498:
0499:            private void step2b() {
0500:
0501:                String[] suffix = {
0502:
0503:                "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
0504:                        "�rent",
0505:
0506:                        "era", "�es", "iez", "�e", "�s", "er", "ez", "�"
0507:
0508:                };
0509:
0510:                deleteFrom(RV, suffix);
0511:
0512:                String[] search = {
0513:
0514:                "assions", "assiez", "assent", "asses", "asse", "aIent",
0515:
0516:                "antes", "aIent", "Aient", "ante", "�mes", "�tes", "ants",
0517:                        "ant",
0518:
0519:                        "ait", "a�t", "ais", "Ait", "A�t", "Ais", "�t", "as",
0520:                        "ai", "Ai", "a" };
0521:
0522:                deleteButSuffixFrom(RV, search, "e", true);
0523:
0524:                deleteFrom(R2, new String[] { "ions" });
0525:
0526:            }
0527:
0528:            /**
0529:
0530:             * Third step of the Porter Algorithmn<br>
0531:
0532:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0533:
0534:             */
0535:
0536:            private void step3() {
0537:
0538:                if (sb.length() > 0) {
0539:
0540:                    char ch = sb.charAt(sb.length() - 1);
0541:
0542:                    if (ch == 'Y') {
0543:
0544:                        sb.setCharAt(sb.length() - 1, 'i');
0545:
0546:                        setStrings();
0547:
0548:                    }
0549:
0550:                    else if (ch == '�') {
0551:
0552:                        sb.setCharAt(sb.length() - 1, 'c');
0553:
0554:                        setStrings();
0555:
0556:                    }
0557:
0558:                }
0559:
0560:            }
0561:
0562:            /**
0563:
0564:             * Fourth step of the Porter Algorithmn<br>
0565:
0566:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0567:
0568:             */
0569:
0570:            private void step4() {
0571:
0572:                if (sb.length() > 1) {
0573:
0574:                    char ch = sb.charAt(sb.length() - 1);
0575:
0576:                    if (ch == 's') {
0577:
0578:                        char b = sb.charAt(sb.length() - 2);
0579:
0580:                        if (b != 'a' && b != 'i' && b != 'o' && b != 'u'
0581:                                && b != '�' &&
0582:
0583:                                b != 's')
0584:
0585:                        {
0586:
0587:                            sb.delete(sb.length() - 1, sb.length());
0588:
0589:                            setStrings();
0590:
0591:                        }
0592:
0593:                    }
0594:
0595:                }
0596:
0597:                boolean found = deleteFromIfPrecededIn(R2,
0598:                        new String[] { "ion" }
0599:
0600:                        , RV, "s");
0601:
0602:                if (!found)
0603:
0604:                    found = deleteFromIfPrecededIn(R2, new String[] { "ion" }
0605:
0606:                    , RV, "t");
0607:
0608:                replaceFrom(RV, new String[] { "I�re", "i�re", "Ier", "ier" }
0609:
0610:                , "i");
0611:
0612:                deleteFrom(RV, new String[] { "e" });
0613:
0614:                deleteFromIfPrecededIn(RV, new String[] { "�" }
0615:
0616:                , R0, "gu");
0617:
0618:            }
0619:
0620:            /**
0621:
0622:             * Fifth step of the Porter Algorithmn<br>
0623:
0624:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0625:
0626:             */
0627:
0628:            private void step5() {
0629:
0630:                if (R0 != null) {
0631:
0632:                    if (R0.endsWith("enn") || R0.endsWith("onn")
0633:                            || R0.endsWith("ett") ||
0634:
0635:                            R0.endsWith("ell") || R0.endsWith("eill")) {
0636:
0637:                        sb.delete(sb.length() - 1, sb.length());
0638:
0639:                        setStrings();
0640:
0641:                    }
0642:
0643:                }
0644:
0645:            }
0646:
0647:            /**
0648:
0649:             * Sixth (and last!) step of the Porter Algorithmn<br>
0650:
0651:             * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
0652:
0653:             */
0654:
0655:            private void step6() {
0656:
0657:                if (R0 != null && R0.length() > 0) {
0658:
0659:                    boolean seenVowel = false;
0660:
0661:                    boolean seenConson = false;
0662:
0663:                    int pos = -1;
0664:
0665:                    for (int i = R0.length() - 1; i > -1; i--) {
0666:
0667:                        char ch = R0.charAt(i);
0668:
0669:                        if (isVowel(ch)) {
0670:
0671:                            if (!seenVowel) {
0672:
0673:                                if (ch == '�' || ch == '�')
0674:
0675:                                {
0676:
0677:                                    pos = i;
0678:
0679:                                    break;
0680:
0681:                                }
0682:
0683:                            }
0684:
0685:                            seenVowel = true;
0686:
0687:                        }
0688:
0689:                        else {
0690:
0691:                            if (seenVowel)
0692:
0693:                                break;
0694:
0695:                            else
0696:
0697:                                seenConson = true;
0698:
0699:                        }
0700:
0701:                    }
0702:
0703:                    if (pos > -1 && seenConson && !seenVowel)
0704:
0705:                        sb.setCharAt(pos, 'e');
0706:
0707:                }
0708:
0709:            }
0710:
0711:            /**
0712:
0713:             * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
0714:
0715:             *
0716:
0717:             * @param source java.lang.String - the primary source zone for search
0718:
0719:             * @param search java.lang.String[] - the strings to search for suppression
0720:
0721:             * @param from java.lang.String - the secondary source zone for search
0722:
0723:             * @param prefix java.lang.String - the prefix to add to the search string to test
0724:
0725:             * @return boolean - true if modified
0726:
0727:             */
0728:
0729:            private boolean deleteFromIfPrecededIn(String source,
0730:                    String[] search,
0731:
0732:                    String from, String prefix) {
0733:
0734:                boolean found = false;
0735:
0736:                if (source != null) {
0737:
0738:                    for (int i = 0; i < search.length; i++) {
0739:
0740:                        if (source.endsWith(search[i])) {
0741:
0742:                            if (from != null
0743:                                    && from.endsWith(prefix + search[i])) {
0744:
0745:                                sb.delete(sb.length() - search[i].length(), sb
0746:                                        .length());
0747:
0748:                                found = true;
0749:
0750:                                setStrings();
0751:
0752:                                break;
0753:
0754:                            }
0755:
0756:                        }
0757:
0758:                    }
0759:
0760:                }
0761:
0762:                return found;
0763:
0764:            }
0765:
0766:            /**
0767:
0768:             * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
0769:
0770:             *
0771:
0772:             * @param source java.lang.String - the primary source zone for search
0773:
0774:             * @param search java.lang.String[] - the strings to search for suppression
0775:
0776:             * @param vowel boolean - true if we need a vowel before the search string
0777:
0778:             * @param from java.lang.String - the secondary source zone for search (where vowel could be)
0779:
0780:             * @return boolean - true if modified
0781:
0782:             */
0783:
0784:            private boolean deleteFromIfTestVowelBeforeIn(String source,
0785:                    String[] search,
0786:
0787:                    boolean vowel, String from) {
0788:
0789:                boolean found = false;
0790:
0791:                if (source != null && from != null) {
0792:
0793:                    for (int i = 0; i < search.length; i++) {
0794:
0795:                        if (source.endsWith(search[i])) {
0796:
0797:                            if ((search[i].length() + 1) <= from.length()) {
0798:
0799:                                boolean test = isVowel(sb.charAt(sb.length() -
0800:
0801:                                (search[i].length() + 1)));
0802:
0803:                                if (test == vowel) {
0804:
0805:                                    sb.delete(sb.length() - search[i].length(),
0806:                                            sb.length());
0807:
0808:                                    modified = true;
0809:
0810:                                    found = true;
0811:
0812:                                    setStrings();
0813:
0814:                                    break;
0815:
0816:                                }
0817:
0818:                            }
0819:
0820:                        }
0821:
0822:                    }
0823:
0824:                }
0825:
0826:                return found;
0827:
0828:            }
0829:
0830:            /**
0831:
0832:             * Delete a suffix searched in zone "source" if preceded by the prefix
0833:
0834:             *
0835:
0836:             * @param source java.lang.String - the primary source zone for search
0837:
0838:             * @param search java.lang.String[] - the strings to search for suppression
0839:
0840:             * @param prefix java.lang.String - the prefix to add to the search string to test
0841:
0842:             * @param without boolean - true if it will be deleted even without prefix found
0843:
0844:             */
0845:
0846:            private void deleteButSuffixFrom(String source, String[] search,
0847:
0848:            String prefix, boolean without) {
0849:
0850:                if (source != null) {
0851:
0852:                    for (int i = 0; i < search.length; i++) {
0853:
0854:                        if (source.endsWith(prefix + search[i])) {
0855:
0856:                            sb.delete(sb.length()
0857:                                    - (prefix.length() + search[i].length()),
0858:
0859:                            sb.length());
0860:
0861:                            modified = true;
0862:
0863:                            setStrings();
0864:
0865:                            break;
0866:
0867:                        }
0868:
0869:                        else if (without && source.endsWith(search[i])) {
0870:
0871:                            sb.delete(sb.length() - search[i].length(), sb
0872:                                    .length());
0873:
0874:                            modified = true;
0875:
0876:                            setStrings();
0877:
0878:                            break;
0879:
0880:                        }
0881:
0882:                    }
0883:
0884:                }
0885:
0886:            }
0887:
0888:            /**
0889:
0890:             * Delete a suffix searched in zone "source" if preceded by prefix<br>
0891:
0892:             * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
0893:
0894:             * or delete the suffix if specified
0895:
0896:             *
0897:
0898:             * @param source java.lang.String - the primary source zone for search
0899:
0900:             * @param search java.lang.String[] - the strings to search for suppression
0901:
0902:             * @param prefix java.lang.String - the prefix to add to the search string to test
0903:
0904:             * @param without boolean - true if it will be deleted even without prefix found
0905:
0906:             */
0907:
0908:            private void deleteButSuffixFromElseReplace(String source,
0909:                    String[] search,
0910:
0911:                    String prefix, boolean without,
0912:
0913:                    String from, String replace) {
0914:
0915:                if (source != null) {
0916:
0917:                    for (int i = 0; i < search.length; i++) {
0918:
0919:                        if (source.endsWith(prefix + search[i])) {
0920:
0921:                            sb.delete(sb.length()
0922:                                    - (prefix.length() + search[i].length()),
0923:
0924:                            sb.length());
0925:
0926:                            modified = true;
0927:
0928:                            setStrings();
0929:
0930:                            break;
0931:
0932:                        }
0933:
0934:                        else if (from != null
0935:                                && from.endsWith(prefix + search[i])) {
0936:
0937:                            sb.replace(sb.length()
0938:                                    - (prefix.length() + search[i].length()),
0939:
0940:                            sb.length(), replace);
0941:
0942:                            modified = true;
0943:
0944:                            setStrings();
0945:
0946:                            break;
0947:
0948:                        }
0949:
0950:                        else if (without && source.endsWith(search[i])) {
0951:
0952:                            sb.delete(sb.length() - search[i].length(), sb
0953:                                    .length());
0954:
0955:                            modified = true;
0956:
0957:                            setStrings();
0958:
0959:                            break;
0960:
0961:                        }
0962:
0963:                    }
0964:
0965:                }
0966:
0967:            }
0968:
0969:            /**
0970:
0971:             * Replace a search string with another within the source zone
0972:
0973:             *
0974:
0975:             * @param source java.lang.String - the source zone for search
0976:
0977:             * @param search java.lang.String[] - the strings to search for replacement
0978:
0979:             * @param replace java.lang.String - the replacement string
0980:
0981:             */
0982:
0983:            private boolean replaceFrom(String source, String[] search,
0984:                    String replace) {
0985:
0986:                boolean found = false;
0987:
0988:                if (source != null) {
0989:
0990:                    for (int i = 0; i < search.length; i++) {
0991:
0992:                        if (source.endsWith(search[i])) {
0993:
0994:                            sb.replace(sb.length() - search[i].length(), sb
0995:                                    .length(), replace);
0996:
0997:                            modified = true;
0998:
0999:                            found = true;
1000:
1001:                            setStrings();
1002:
1003:                            break;
1004:
1005:                        }
1006:
1007:                    }
1008:
1009:                }
1010:
1011:                return found;
1012:
1013:            }
1014:
1015:            /**
1016:
1017:             * Delete a search string within the source zone
1018:
1019:             *
1020:
1021:             * @param source the source zone for search
1022:
1023:             * @param suffix the strings to search for suppression
1024:
1025:             */
1026:
1027:            private void deleteFrom(String source, String[] suffix) {
1028:
1029:                if (source != null) {
1030:
1031:                    for (int i = 0; i < suffix.length; i++) {
1032:
1033:                        if (source.endsWith(suffix[i])) {
1034:
1035:                            sb.delete(sb.length() - suffix[i].length(), sb
1036:                                    .length());
1037:
1038:                            modified = true;
1039:
1040:                            setStrings();
1041:
1042:                            break;
1043:
1044:                        }
1045:
1046:                    }
1047:
1048:                }
1049:
1050:            }
1051:
1052:            /**
1053:
1054:             * Test if a char is a french vowel, including accentuated ones
1055:
1056:             *
1057:
1058:             * @param ch the char to test
1059:
1060:             * @return boolean - true if the char is a vowel
1061:
1062:             */
1063:
1064:            private boolean isVowel(char ch) {
1065:
1066:                switch (ch) {
1067:
1068:                case 'o':
1069:
1070:                case 'u':
1071:
1072:                case 'y':
1073:
1074:                case '\u00e2':
1075:
1076:                case '\u00e0':
1077:
1078:                case '\u00eb':
1079:
1080:                case '\u00e9':
1081:
1082:                case '\u00ea':
1083:
1084:                case '\u00e8':
1085:
1086:                case '\u00ef':
1087:
1088:                case '\u00ee':
1089:
1090:                case '\u00f4':
1091:
1092:                case '\u00fc':
1093:
1094:                case '\u00f9':
1095:
1096:                case '\u00fb':
1097:
1098:                    return true;
1099:
1100:                default:
1101:
1102:                    return false;
1103:
1104:                }
1105:
1106:            }
1107:
1108:            /**
1109:
1110:             * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
1111:
1112:             * "R is the region after the first non-vowel following a vowel
1113:
1114:             * or is the null region at the end of the word if there is no such non-vowel"<br>
1115:
1116:             * @param buffer java.lang.StringBuffer - the in buffer
1117:
1118:             * @return java.lang.String - the resulting string
1119:
1120:             */
1121:
1122:            private String retrieveR(StringBuffer buffer) {
1123:
1124:                int len = buffer.length();
1125:
1126:                int pos = -1;
1127:
1128:                for (int c = 0; c < len; c++) {
1129:
1130:                    if (isVowel(buffer.charAt(c))) {
1131:
1132:                        pos = c;
1133:
1134:                        break;
1135:
1136:                    }
1137:
1138:                }
1139:
1140:                if (pos > -1) {
1141:
1142:                    int consonne = -1;
1143:
1144:                    for (int c = pos; c < len; c++) {
1145:
1146:                        if (!isVowel(buffer.charAt(c))) {
1147:
1148:                            consonne = c;
1149:
1150:                            break;
1151:
1152:                        }
1153:
1154:                    }
1155:
1156:                    if (consonne > -1 && (consonne + 1) < len)
1157:
1158:                        return buffer.substring(consonne + 1, len);
1159:
1160:                    else
1161:
1162:                        return null;
1163:
1164:                }
1165:
1166:                else
1167:
1168:                    return null;
1169:
1170:            }
1171:
1172:            /**
1173:
1174:             * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
1175:
1176:             * "If the word begins with two vowels, RV is the region after the third letter,
1177:
1178:             * otherwise the region after the first vowel not at the beginning of the word,
1179:
1180:             * or the end of the word if these positions cannot be found."<br>
1181:
1182:             * @param buffer java.lang.StringBuffer - the in buffer
1183:
1184:             * @return java.lang.String - the resulting string
1185:
1186:             */
1187:
1188:            private String retrieveRV(StringBuffer buffer) {
1189:
1190:                int len = buffer.length();
1191:
1192:                if (buffer.length() > 3) {
1193:
1194:                    if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) {
1195:
1196:                        return buffer.substring(3, len);
1197:
1198:                    }
1199:
1200:                    else {
1201:
1202:                        int pos = 0;
1203:
1204:                        for (int c = 1; c < len; c++) {
1205:
1206:                            if (isVowel(buffer.charAt(c))) {
1207:
1208:                                pos = c;
1209:
1210:                                break;
1211:
1212:                            }
1213:
1214:                        }
1215:
1216:                        if (pos + 1 < len)
1217:
1218:                            return buffer.substring(pos + 1, len);
1219:
1220:                        else
1221:
1222:                            return null;
1223:
1224:                    }
1225:
1226:                }
1227:
1228:                else
1229:
1230:                    return null;
1231:
1232:            }
1233:
1234:            /**
1235:
1236:             * Turns u and i preceded AND followed by a vowel to UpperCase<br>
1237:
1238:             * Turns y preceded OR followed by a vowel to UpperCase<br>
1239:
1240:             * Turns u preceded by q to UpperCase<br>
1241:
1242:             *
1243:
1244:             * @param buffer java.util.StringBuffer - the buffer to treat
1245:
1246:             * @return java.util.StringBuffer - the treated buffer
1247:
1248:             */
1249:
1250:            private StringBuffer treatVowels(StringBuffer buffer) {
1251:
1252:                for (int c = 0; c < buffer.length(); c++) {
1253:
1254:                    char ch = buffer.charAt(c);
1255:
1256:                    if (c == 0) { // first char
1257:
1258:                        if (buffer.length() > 1) {
1259:
1260:                            if (ch == 'y' && isVowel(buffer.charAt(c + 1)))
1261:
1262:                                buffer.setCharAt(c, 'Y');
1263:
1264:                        }
1265:
1266:                    }
1267:
1268:                    else if (c == buffer.length() - 1) { // last char
1269:
1270:                        if (ch == 'u' && buffer.charAt(c - 1) == 'q')
1271:
1272:                            buffer.setCharAt(c, 'U');
1273:
1274:                        if (ch == 'y' && isVowel(buffer.charAt(c - 1)))
1275:
1276:                            buffer.setCharAt(c, 'Y');
1277:
1278:                    }
1279:
1280:                    else { // other cases
1281:
1282:                        if (ch == 'u') {
1283:
1284:                            if (buffer.charAt(c - 1) == 'q')
1285:
1286:                                buffer.setCharAt(c, 'U');
1287:
1288:                            else if (isVowel(buffer.charAt(c - 1))
1289:                                    && isVowel(buffer.charAt(c + 1)))
1290:
1291:                                buffer.setCharAt(c, 'U');
1292:
1293:                        }
1294:
1295:                        if (ch == 'i') {
1296:
1297:                            if (isVowel(buffer.charAt(c - 1))
1298:                                    && isVowel(buffer.charAt(c + 1)))
1299:
1300:                                buffer.setCharAt(c, 'I');
1301:
1302:                        }
1303:
1304:                        if (ch == 'y') {
1305:
1306:                            if (isVowel(buffer.charAt(c - 1))
1307:                                    || isVowel(buffer.charAt(c + 1)))
1308:
1309:                                buffer.setCharAt(c, 'Y');
1310:
1311:                        }
1312:
1313:                    }
1314:
1315:                }
1316:
1317:                return buffer;
1318:
1319:            }
1320:
1321:            /**
1322:
1323:             * Checks a term if it can be processed correctly.
1324:
1325:             *
1326:
1327:             * @return boolean - true if, and only if, the given term consists in letters.
1328:
1329:             */
1330:
1331:            private boolean isStemmable(String term) {
1332:
1333:                boolean upper = false;
1334:
1335:                int first = -1;
1336:
1337:                for (int c = 0; c < term.length(); c++) {
1338:
1339:                    // Discard terms that contain non-letter characters.
1340:
1341:                    if (!Character.isLetter(term.charAt(c))) {
1342:
1343:                        return false;
1344:
1345:                    }
1346:
1347:                    // Discard terms that contain multiple uppercase letters.
1348:
1349:                    if (Character.isUpperCase(term.charAt(c))) {
1350:
1351:                        if (upper) {
1352:
1353:                            return false;
1354:
1355:                        }
1356:
1357:                        // First encountered uppercase letter, set flag and save
1358:
1359:                        // position.
1360:
1361:                        else {
1362:
1363:                            first = c;
1364:
1365:                            upper = true;
1366:
1367:                        }
1368:
1369:                    }
1370:
1371:                }
1372:
1373:                // Discard the term if it contains a single uppercase letter that
1374:
1375:                // is not starting the term.
1376:
1377:                if (first > 0) {
1378:
1379:                    return false;
1380:
1381:                }
1382:
1383:                return true;
1384:
1385:            }
1386:
1387:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.