Source Code Cross Referenced for FrenchStemmer.java in » Net » lucene-connector » org » apache » lucene » analysis » fr » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Net » lucene connector » org.apache.lucene.analysis.fr 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package org.apache.lucene.analysis.fr;
002:
003:        /**
004:         * Licensed to the Apache Software Foundation (ASF) under one or more
005:         * contributor license agreements.  See the NOTICE file distributed with
006:         * this work for additional information regarding copyright ownership.
007:         * The ASF licenses this file to You under the Apache License, Version 2.0
008:         * (the "License"); you may not use this file except in compliance with
009:         * the License.  You may obtain a copy of the License at
010:         *
011:         *     http://www.apache.org/licenses/LICENSE-2.0
012:         *
013:         * Unless required by applicable law or agreed to in writing, software
014:         * distributed under the License is distributed on an "AS IS" BASIS,
015:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016:         * See the License for the specific language governing permissions and
017:         * limitations under the License.
018:         */
019:
/**
 * A stemmer for French words. The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details.
 *
 * <p>All working state (the term buffer, the region strings and the two
 * flags) lives in instance fields that {@link #stem(String)} overwrites on
 * every call, so a single instance must not be used by several threads at
 * the same time.</p>
 *
 * <p>Every non-ASCII letter in this file is written as a Unicode escape so
 * that the class compiles to the same bytecode whatever source encoding the
 * compiler is told to use.</p>
 *
 * @author    Patrick Talbot
 */
public class FrenchStemmer {

    /**
     * Locale used for lower-casing. It is fixed so that the result does not
     * depend on the JVM default locale (a Turkish default locale would turn
     * a capital I into a dotless i).
     */
    private static final java.util.Locale LOCALE = java.util.Locale.FRENCH;

    /**
     * All letters treated as vowels: a e i o u y, then a-circumflex,
     * a-grave, e-diaeresis, e-acute, e-circumflex, e-grave, i-diaeresis,
     * i-circumflex, o-circumflex, u-diaeresis, u-grave, u-circumflex.
     */
    private static final String VOWELS =
            "aeiouy\u00e2\u00e0\u00eb\u00e9\u00ea\u00e8\u00ef\u00ee\u00f4\u00fc\u00f9\u00fb";

    // ------------------------------------------------------------------
    // Suffix tables. Within a table the order matters: the helpers stop at
    // the first entry that applies, so longer suffixes come before the
    // shorter ones they contain.
    // ------------------------------------------------------------------

    private static final String[] STEP1_DELETE = { "ances", "iqUes", "ismes",
            "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };

    private static final String[] STEP1_LOGIE = { "logies", "logie" };

    private static final String[] STEP1_USION = { "usions", "utions", "usion",
            "ution" };

    private static final String[] STEP1_ENCE = { "ences", "ence" };

    private static final String[] STEP1_ATION = { "atrices", "ateurs",
            "ations", "atrice", "ateur", "ation" };

    private static final String[] STEP1_EMENT = { "ements", "ement" };

    private static final String[] STEP1_ISSEMENT = { "issements", "issement" };

    /** "ites" / "ite" with an acute accent on the e. */
    private static final String[] STEP1_ITE = { "it\u00e9s", "it\u00e9" };

    private static final String[] STEP1_IF = { "ifs", "ives", "if", "ive" };

    private static final String[] STEP1_EAUX = { "eaux" };

    private static final String[] STEP1_AUX = { "aux" };

    private static final String[] STEP1_EUSE = { "euses", "euse" };

    private static final String[] STEP1_EUX = { "eux" };

    private static final String[] STEP1_AMMENT = { "amment" };

    private static final String[] STEP1_EMMENT = { "emment" };

    private static final String[] STEP1_MENT = { "ments", "ment" };

    /** Verb suffixes beginning with i; \u00ee is i-circumflex. */
    private static final String[] STEP2A_SUFFIXES = { "\u00eemes", "\u00eetes",
            "iraIent", "irait", "irais", "irai", "iras", "ira", "irent",
            "iriez", "irez", "irions", "irons", "iront", "issaIent", "issais",
            "issantes", "issante", "issants", "issant", "issait", "issions",
            "issons", "issiez", "issez", "issent", "isses", "isse", "ir",
            "is", "\u00eet", "it", "ies", "ie", "i" };

    /** Verb suffixes in e; \u00e8 is e-grave, \u00e9 is e-acute. */
    private static final String[] STEP2B_DELETE = { "eraIent", "erais",
            "erait", "erai", "eras", "erions", "eriez", "erons", "eront",
            "erez", "\u00e8rent", "era", "\u00e9es", "iez", "\u00e9e",
            "\u00e9s", "er", "ez", "\u00e9" };

    /** Verb suffixes in a; \u00e2 is a-circumflex, \u00ee is i-circumflex. */
    private static final String[] STEP2B_A_SUFFIXES = { "assions", "assiez",
            "assent", "asses", "asse", "aIent", "antes", "Aient", "ante",
            "\u00e2mes", "\u00e2tes", "ants", "ant", "ait", "a\u00eet", "ais",
            "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a" };

    private static final String[] STEP2B_IONS = { "ions" };

    private static final String[] STEP4_ION = { "ion" };

    /** "iere" with e-grave (both marker cases of the i), and "ier". */
    private static final String[] STEP4_IER = { "I\u00e8re", "i\u00e8re",
            "Ier", "ier" };

    private static final String[] STEP4_E = { "e" };

    /** e-diaeresis. */
    private static final String[] STEP4_E_DIAERESIS = { "\u00eb" };

    /** Letters after which a final s is kept in step 4: a i o u e-grave s. */
    private static final String STEP4_KEEP_S_AFTER = "aiou\u00e8s";

    /**
     * Buffer for the terms while stemming them.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * A temporary buffer, used to reconstruct R2
     */
    private StringBuffer tb = new StringBuffer();

    /**
     * Region R0 is equal to the whole buffer
     */
    private String R0;

    /**
     * Region RV
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."
     * The empty region is represented by <tt>null</tt>.
     */
    private String RV;

    /**
     * Region R1
     * "R1 is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R1;

    /**
     * Region R2
     * "R2 is the region after the first non-vowel in R1 following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R2;

    /**
     * Set to true if we need to perform step 2
     */
    private boolean suite;

    /**
     * Set to true if the buffer was modified
     */
    private boolean modified;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     *
     * <p>Terms rejected by {@link #isStemmable(String)} and <tt>null</tt>
     * are returned unchanged. Note that the capital marker letters I, U and
     * Y introduced by {@link #treatVowels(StringBuffer)} are not converted
     * back, so a returned stem may contain them.</p>
     *
     * @param term  java.lang.String The term that should be stemmed
     * @return java.lang.String  Discriminator for <tt>term</tt>
     */
    protected String stem(String term) {
        if (term == null || !isStemmable(term)) {
            return term;
        }

        // Use lowercase for medium stemming. The locale is explicit so the
        // stem does not vary with the default locale of the JVM.
        term = term.toLowerCase(LOCALE);

        // Reset the working buffer and the flags left over from a previous call.
        sb.setLength(0);
        sb.append(term);
        modified = false;
        suite = false;

        treatVowels(sb);
        setStrings();

        step1();

        // Step 2 runs when step 1 changed nothing, or when it only handled
        // one of the amment / emment / ment(s) suffixes.
        if ((!modified || suite) && RV != null) {
            suite = step2a();
            if (!suite) {
                step2b();
            }
        }

        if (modified || suite) {
            step3();
        } else {
            step4();
        }

        step5();
        step6();

        return sb.toString();
    }

    /**
     * Sets the search region Strings<br>
     * it needs to be done each time the buffer was modified
     */
    private void setStrings() {
        R0 = sb.toString();
        RV = retrieveRV(sb);
        R1 = retrieveR(sb);
        if (R1 == null) {
            R2 = null;
        } else {
            // R2 is obtained by applying the R rule to R1 itself.
            tb.setLength(0);
            tb.append(R1);
            R2 = retrieveR(tb);
        }
    }

    /**
     * Removes the last <tt>count</tt> characters of the working buffer.
     *
     * @param count number of trailing characters to drop
     */
    private void truncate(int count) {
        sb.setLength(sb.length() - count);
    }

    /**
     * First step of the Porter algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * <p>Each call below reads the region fields afresh, so it sees the
     * regions as recomputed after any earlier modification.</p>
     */
    private void step1() {
        deleteFrom(R2, STEP1_DELETE);

        replaceFrom(R2, STEP1_LOGIE, "log");
        replaceFrom(R2, STEP1_USION, "u");
        replaceFrom(R2, STEP1_ENCE, "ent");

        deleteButSuffixFromElseReplace(R2, STEP1_ATION, "ic", true, R0, "iqU");

        deleteButSuffixFromElseReplace(R2, STEP1_EMENT, "eus", false, R0, "eux");
        deleteButSuffixFrom(R2, STEP1_EMENT, "ativ", false);
        deleteButSuffixFrom(R2, STEP1_EMENT, "iv", false);
        deleteButSuffixFrom(R2, STEP1_EMENT, "abl", false);
        deleteButSuffixFrom(R2, STEP1_EMENT, "iqU", false);

        deleteFromIfTestVowelBeforeIn(R1, STEP1_ISSEMENT, false, R0);
        deleteFrom(RV, STEP1_EMENT);

        deleteButSuffixFromElseReplace(R2, STEP1_ITE, "abil", false, R0, "abl");
        deleteButSuffixFromElseReplace(R2, STEP1_ITE, "ic", false, R0, "iqU");
        deleteButSuffixFrom(R2, STEP1_ITE, "iv", true);

        deleteButSuffixFromElseReplace(R2, STEP1_IF, "icat", false, R0, "iqU");
        deleteButSuffixFromElseReplace(R2, STEP1_IF, "at", true, R2, "iqU");

        replaceFrom(R0, STEP1_EAUX, "eau");

        replaceFrom(R1, STEP1_AUX, "al");

        deleteButSuffixFromElseReplace(R2, STEP1_EUSE, "", true, R1, "eux");

        deleteFrom(R2, STEP1_EUX);

        // if one of the next steps is performed, we will need to perform step2a
        if (replaceFrom(RV, STEP1_AMMENT, "ant")) {
            suite = true;
        }
        if (replaceFrom(RV, STEP1_EMMENT, "ent")) {
            suite = true;
        }
        if (deleteFromIfTestVowelBeforeIn(RV, STEP1_MENT, true, RV)) {
            suite = true;
        }
    }

    /**
     * Second step (A) of the Porter algorithm<br>
     * Will be performed if nothing changed from the first step
     * or changes were done in the amment, emment, ments or ment suffixes<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * @return boolean - true if something changed in the StringBuffer
     */
    private boolean step2a() {
        return deleteFromIfTestVowelBeforeIn(RV, STEP2A_SUFFIXES, false, RV);
    }

    /**
     * Second step (B) of the Porter algorithm<br>
     * Will be performed if step 2 A was performed unsuccessfully<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step2b() {
        deleteFrom(RV, STEP2B_DELETE);

        deleteButSuffixFrom(RV, STEP2B_A_SUFFIXES, "e", true);

        deleteFrom(R2, STEP2B_IONS);
    }

    /**
     * Third step of the Porter algorithm<br>
     * Replaces a final Y marker with i, or a final c-cedilla with c.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step3() {
        int last = sb.length() - 1;
        if (last < 0) {
            return;
        }
        char ch = sb.charAt(last);
        if (ch == 'Y') {
            sb.setCharAt(last, 'i');
            setStrings();
        } else if (ch == '\u00e7') { // c-cedilla
            sb.setCharAt(last, 'c');
            setStrings();
        }
    }

    /**
     * Fourth step of the Porter algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step4() {
        int len = sb.length();
        if (len > 1 && sb.charAt(len - 1) == 's') {
            // A final s is dropped unless it follows a, i, o, u, e-grave or s.
            char before = sb.charAt(len - 2);
            if (STEP4_KEEP_S_AFTER.indexOf(before) < 0) {
                sb.setLength(len - 1);
                setStrings();
            }
        }

        // "ion" in R2 is removed when RV shows it preceded by s, else by t.
        if (!deleteFromIfPrecededIn(R2, STEP4_ION, RV, "s")) {
            deleteFromIfPrecededIn(R2, STEP4_ION, RV, "t");
        }

        replaceFrom(RV, STEP4_IER, "i");
        deleteFrom(RV, STEP4_E);
        deleteFromIfPrecededIn(RV, STEP4_E_DIAERESIS, R0, "gu");
    }

    /**
     * Fifth step of the Porter algorithm<br>
     * Drops the last letter of a word ending in enn, onn, ett, ell or eill.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step5() {
        if (R0 == null) {
            return;
        }
        if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett")
                || R0.endsWith("ell") || R0.endsWith("eill")) {
            truncate(1);
            setStrings();
        }
    }

    /**
     * Sixth (and last!) step of the Porter algorithm<br>
     * If the last vowel of the word is e-acute or e-grave and at least one
     * non-vowel follows it, the accent is removed.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step6() {
        if (R0 == null || R0.length() == 0) {
            return;
        }
        // Walk back over the trailing non-vowels to the last vowel, if any.
        int i = R0.length() - 1;
        while (i >= 0 && !isVowel(R0.charAt(i))) {
            i--;
        }
        boolean followedByNonVowel = i < R0.length() - 1;
        if (i >= 0 && followedByNonVowel) {
            char ch = R0.charAt(i);
            if (ch == '\u00e9' || ch == '\u00e8') { // e-acute or e-grave
                sb.setCharAt(i, 'e');
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
     *
     * <p>Unlike the other helpers this one does not set the
     * <tt>modified</tt> flag.</p>
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param from java.lang.String - the secondary source zone for search
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @return boolean - true if modified
     */
    private boolean deleteFromIfPrecededIn(String source, String[] search,
            String from, String prefix) {
        if (source == null || from == null) {
            return false;
        }
        for (int i = 0; i < search.length; i++) {
            String candidate = search[i];
            if (source.endsWith(candidate)
                    && from.endsWith(prefix + candidate)) {
                truncate(candidate.length());
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param vowel boolean - true if we need a vowel before the search string
     * @param from java.lang.String - the secondary source zone for search (where vowel could be)
     * @return boolean - true if modified
     */
    private boolean deleteFromIfTestVowelBeforeIn(String source,
            String[] search, boolean vowel, String from) {
        if (source == null || from == null) {
            return false;
        }
        for (int i = 0; i < search.length; i++) {
            String candidate = search[i];
            if (!source.endsWith(candidate)) {
                continue;
            }
            // The letter in front of the suffix must itself lie inside "from".
            if (candidate.length() + 1 > from.length()) {
                continue;
            }
            char before = sb.charAt(sb.length() - (candidate.length() + 1));
            if (isVowel(before) == vowel) {
                truncate(candidate.length());
                modified = true;
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by the prefix
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     */
    private void deleteButSuffixFrom(String source, String[] search,
            String prefix, boolean without) {
        if (source == null) {
            return;
        }
        for (int i = 0; i < search.length; i++) {
            String candidate = search[i];
            if (source.endsWith(prefix + candidate)) {
                // prefix and suffix are removed together
                truncate(prefix.length() + candidate.length());
                modified = true;
                setStrings();
                return;
            }
            if (without && source.endsWith(candidate)) {
                truncate(candidate.length());
                modified = true;
                setStrings();
                return;
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by prefix<br>
     * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
     * or delete the suffix if specified
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     * @param from java.lang.String - the secondary zone in which prefix + suffix triggers the replacement
     * @param replace java.lang.String - the text that replaces prefix + suffix in that case
     */
    private void deleteButSuffixFromElseReplace(String source,
            String[] search, String prefix, boolean without, String from,
            String replace) {
        if (source == null) {
            return;
        }
        for (int i = 0; i < search.length; i++) {
            String candidate = search[i];
            String withPrefix = prefix + candidate;
            if (source.endsWith(withPrefix)) {
                truncate(withPrefix.length());
                modified = true;
                setStrings();
                return;
            }
            if (from != null && from.endsWith(withPrefix)) {
                truncate(withPrefix.length());
                sb.append(replace);
                modified = true;
                setStrings();
                return;
            }
            if (without && source.endsWith(candidate)) {
                truncate(candidate.length());
                modified = true;
                setStrings();
                return;
            }
        }
    }

    /**
     * Replace a search string with another within the source zone
     *
     * @param source java.lang.String - the source zone for search
     * @param search java.lang.String[] - the strings to search for replacement
     * @param replace java.lang.String - the replacement string
     * @return boolean - true if a replacement was made
     */
    private boolean replaceFrom(String source, String[] search,
            String replace) {
        if (source == null) {
            return false;
        }
        for (int i = 0; i < search.length; i++) {
            if (source.endsWith(search[i])) {
                truncate(search[i].length());
                sb.append(replace);
                modified = true;
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a search string within the source zone
     *
     * @param source the source zone for search
     * @param suffix the strings to search for suppression
     */
    private void deleteFrom(String source, String[] suffix) {
        if (source == null) {
            return;
        }
        for (int i = 0; i < suffix.length; i++) {
            if (source.endsWith(suffix[i])) {
                truncate(suffix[i].length());
                modified = true;
                setStrings();
                return;
            }
        }
    }

    /**
     * Test if a char is a french vowel, including accentuated ones.
     * The capital markers I, U and Y are not vowels.
     *
     * @param ch the char to test
     * @return boolean - true if the char is a vowel
     */
    private boolean isVowel(char ch) {
        return VOWELS.indexOf(ch) >= 0;
    }

    /**
     * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
     * "R is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"<br>
     * @param buffer java.lang.StringBuffer - the in buffer
     * @return java.lang.String - the resulting string, or null for the empty region
     */
    private String retrieveR(StringBuffer buffer) {
        int len = buffer.length();

        int firstVowel = -1;
        for (int c = 0; c < len; c++) {
            if (isVowel(buffer.charAt(c))) {
                firstVowel = c;
                break;
            }
        }
        if (firstVowel < 0) {
            return null;
        }

        int consonne = -1;
        for (int c = firstVowel; c < len; c++) {
            if (!isVowel(buffer.charAt(c))) {
                consonne = c;
                break;
            }
        }
        if (consonne > -1 && (consonne + 1) < len) {
            return buffer.substring(consonne + 1, len);
        }
        return null;
    }

    /**
     * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."<br>
     * Buffers of three characters or fewer always yield the empty region.
     *
     * @param buffer java.lang.StringBuffer - the in buffer
     * @return java.lang.String - the resulting string, or null for the empty region
     */
    private String retrieveRV(StringBuffer buffer) {
        int len = buffer.length();
        if (len <= 3) {
            return null;
        }
        if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) {
            return buffer.substring(3, len);
        }
        for (int c = 1; c < len; c++) {
            if (isVowel(buffer.charAt(c))) {
                return (c + 1 < len) ? buffer.substring(c + 1, len) : null;
            }
        }
        // No vowel after the first letter: the position cannot be found, so
        // RV is the empty region (and not "everything after the first letter").
        return null;
    }

    /**
     * Turns u and i preceded AND followed by a vowel to UpperCase<br>
     * Turns y preceded OR followed by a vowel to UpperCase<br>
     * Turns u preceded by q to UpperCase<br>
     * The buffer is modified in place; a letter already turned to upper case
     * no longer counts as a vowel for its neighbours.
     *
     * @param buffer java.lang.StringBuffer - the buffer to treat
     * @return java.lang.StringBuffer - the treated buffer (the same instance)
     */
    private StringBuffer treatVowels(StringBuffer buffer) {
        for (int c = 0; c < buffer.length(); c++) {
            char ch = buffer.charAt(c);

            if (c == 0) {
                // first char: only a y followed by a vowel is marked
                if (buffer.length() > 1 && ch == 'y'
                        && isVowel(buffer.charAt(c + 1))) {
                    buffer.setCharAt(c, 'Y');
                }
            } else if (c == buffer.length() - 1) {
                // last char: there is no following letter to look at
                if (ch == 'u' && buffer.charAt(c - 1) == 'q') {
                    buffer.setCharAt(c, 'U');
                } else if (ch == 'y' && isVowel(buffer.charAt(c - 1))) {
                    buffer.setCharAt(c, 'Y');
                }
            } else {
                // other cases
                char prev = buffer.charAt(c - 1);
                char next = buffer.charAt(c + 1);
                if (ch == 'u') {
                    if (prev == 'q' || (isVowel(prev) && isVowel(next))) {
                        buffer.setCharAt(c, 'U');
                    }
                } else if (ch == 'i') {
                    if (isVowel(prev) && isVowel(next)) {
                        buffer.setCharAt(c, 'I');
                    }
                } else if (ch == 'y') {
                    if (isVowel(prev) || isVowel(next)) {
                        buffer.setCharAt(c, 'Y');
                    }
                }
            }
        }

        return buffer;
    }

    /**
     * Checks a term if it can be processed correctly.
     *
     * @param term the term to check; must not be null
     * @return boolean - true if, and only if, the given term consists only
     *         of letters and has no upper-case letter other than its first
     *         character.
     */
    private boolean isStemmable(String term) {
        for (int c = 0; c < term.length(); c++) {
            char ch = term.charAt(c);
            // Discard terms that contain non-letter characters.
            if (!Character.isLetter(ch)) {
                return false;
            }
            // An upper-case letter is accepted only in first position; this
            // rejects both several capitals and a single capital inside the term.
            if (c > 0 && Character.isUpperCase(ch)) {
                return false;
            }
        }
        return true;
    }
}
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.