001: package org.apache.lucene.analysis.fr;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
/**
 * A stemmer for French words. The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details
 * <p>
 * The word being stemmed and its search regions are kept in mutable instance
 * fields that are rewritten by every call to {@link #stem(String)}, so one
 * instance must not be used by several threads at the same time.
 *
 * @author Patrick Talbot
 */
public class FrenchStemmer {

  // Suffix tables. The order inside each table matters: the first entry that
  // matches wins, so longer suffixes are listed before the shorter suffixes
  // they end with. The tables are never modified.

  private static final String[] STEP1_PLAIN = { "ances", "iqUes", "ismes", "ables",
      "istes", "ance", "iqUe", "isme", "able", "iste" };
  private static final String[] LOGIE = { "logies", "logie" };
  private static final String[] USION = { "usions", "utions", "usion", "ution" };
  private static final String[] ENCE = { "ences", "ence" };
  private static final String[] ATION = { "atrices", "ateurs", "ations", "atrice",
      "ateur", "ation" };
  private static final String[] EMENT = { "ements", "ement" };
  private static final String[] ISSEMENT = { "issements", "issement" };
  private static final String[] ITE = { "ités", "ité" };
  private static final String[] IF = { "ifs", "ives", "if", "ive" };
  private static final String[] EAUX = { "eaux" };
  private static final String[] AUX = { "aux" };
  private static final String[] EUSE = { "euses", "euse" };
  private static final String[] EUX = { "eux" };
  private static final String[] AMMENT = { "amment" };
  private static final String[] EMMENT = { "emment" };
  private static final String[] MENT = { "ments", "ment" };

  private static final String[] STEP2A = { "îmes", "îtes", "iraIent", "irait",
      "irais", "irai", "iras", "ira", "irent", "iriez",
      "irez", "irions", "irons", "iront", "issaIent",
      "issais", "issantes", "issante", "issants", "issant",
      "issait", "issions", "issons", "issiez",
      "issez", "issent", "isses", "isse", "ir", "is", "ît",
      "it", "ies", "ie", "i" };

  private static final String[] STEP2B_PLAIN = { "eraIent", "erais", "erait", "erai",
      "eras", "erions", "eriez", "erons", "eront", "erez",
      "èrent", "era", "ées", "iez", "ée", "és", "er",
      "ez", "é" };
  private static final String[] STEP2B_AFTER_E = { "assions", "assiez", "assent", "asses",
      "asse", "aIent", "antes", "Aient", "ante",
      "âmes", "âtes", "ants", "ant", "ait", "aît", "ais",
      "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
  private static final String[] IONS = { "ions" };

  private static final String[] ION = { "ion" };
  private static final String[] IER = { "Ière", "ière", "Ier", "ier" };
  private static final String[] FINAL_E = { "e" };
  private static final String[] E_DIAERESIS = { "ë" };

  /**
   * Buffer for the terms while stemming them.
   */
  private StringBuffer sb = new StringBuffer();

  /**
   * A temporary buffer, used to reconstruct R2
   */
  private StringBuffer tb = new StringBuffer();

  /**
   * Region R0 is equal to the whole buffer
   */
  private String R0;

  /**
   * Region RV
   * "If the word begins with two vowels, RV is the region after the third letter,
   * otherwise the region after the first vowel not at the beginning of the word,
   * or the end of the word if these positions cannot be found."
   * <code>null</code> stands for the empty region.
   */
  private String RV;

  /**
   * Region R1
   * "R1 is the region after the first non-vowel following a vowel
   * or is the null region at the end of the word if there is no such non-vowel"
   */
  private String R1;

  /**
   * Region R2
   * "R2 is the region after the first non-vowel in R1 following a vowel
   * or is the null region at the end of the word if there is no such non-vowel"
   */
  private String R2;

  /**
   * Set to true if we need to perform step 2
   */
  private boolean suite;

  /**
   * Set to true if the buffer was modified
   */
  private boolean modified;

  /**
   * Stems the given term to a unique <tt>discriminator</tt>.
   * <p>
   * Terms rejected by {@link #isStemmable(String)} are returned unchanged.
   * The u/i/y letters that {@link #treatVowels(StringBuffer)} marks as
   * upper case are not lower-cased again afterwards, so the returned stem may
   * contain <tt>U</tt>, <tt>I</tt> or <tt>Y</tt>.
   *
   * @param term java.lang.String The term that should be stemmed
   * @return java.lang.String Discriminator for <tt>term</tt>
   */
  protected String stem(String term) {
    if (!isStemmable(term)) {
      return term;
    }

    // Lower-case with a fixed locale: with the JVM default locale a Turkish
    // or Azeri setup maps 'I' to the dotless 'ı', which is not a vowel for
    // this stemmer and would change the resulting stem.
    term = term.toLowerCase(java.util.Locale.FRENCH);

    // Reset the StringBuffer.
    sb.setLength(0);
    sb.append(term);

    // reset the booleans
    modified = false;
    suite = false;

    sb = treatVowels(sb);

    setStrings();

    step1();

    // Step 2 runs when step 1 removed nothing, or removed one of the
    // amment/emment/ment(s) endings (flagged through "suite").
    if ((!modified || suite) && RV != null) {
      suite = step2a();
      if (!suite) {
        step2b();
      }
    }

    if (modified || suite) {
      step3();
    } else {
      step4();
    }

    step5();

    step6();

    return sb.toString();
  }

  /**
   * Sets the search region Strings<br>
   * it needs to be done each time the buffer was modified
   */
  private void setStrings() {
    R0 = sb.toString();
    RV = retrieveRV(sb);
    R1 = retrieveR(sb);
    if (R1 != null) {
      // R2 is obtained by applying the R1 rule to R1 itself.
      tb.setLength(0);
      tb.append(R1);
      R2 = retrieveR(tb);
    } else {
      R2 = null;
    }
  }

  /**
   * First step of the Porter Algorithm<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   * <p>
   * The rules are applied one after the other; each helper re-reads the
   * region fields, which are refreshed whenever the buffer changes.
   */
  private void step1() {
    deleteFrom(R2, STEP1_PLAIN);

    replaceFrom(R2, LOGIE, "log");
    replaceFrom(R2, USION, "u");
    replaceFrom(R2, ENCE, "ent");

    deleteButSuffixFromElseReplace(R2, ATION, "ic", true, R0, "iqU");

    deleteButSuffixFromElseReplace(R2, EMENT, "eus", false, R0, "eux");
    deleteButSuffixFrom(R2, EMENT, "ativ", false);
    deleteButSuffixFrom(R2, EMENT, "iv", false);
    deleteButSuffixFrom(R2, EMENT, "abl", false);
    deleteButSuffixFrom(R2, EMENT, "iqU", false);

    deleteFromIfTestVowelBeforeIn(R1, ISSEMENT, false, R0);
    deleteFrom(RV, EMENT);

    deleteButSuffixFromElseReplace(R2, ITE, "abil", false, R0, "abl");
    deleteButSuffixFromElseReplace(R2, ITE, "ic", false, R0, "iqU");
    deleteButSuffixFrom(R2, ITE, "iv", true);

    deleteButSuffixFromElseReplace(R2, IF, "icat", false, R0, "iqU");
    deleteButSuffixFromElseReplace(R2, IF, "at", true, R2, "iqU");

    replaceFrom(R0, EAUX, "eau");

    replaceFrom(R1, AUX, "al");

    deleteButSuffixFromElseReplace(R2, EUSE, "", true, R1, "eux");

    deleteFrom(R2, EUX);

    // if one of the next steps is performed, we will need to perform step2a
    if (replaceFrom(RV, AMMENT, "ant")) {
      suite = true;
    }
    if (replaceFrom(RV, EMMENT, "ent")) {
      suite = true;
    }
    if (deleteFromIfTestVowelBeforeIn(RV, MENT, true, RV)) {
      suite = true;
    }
  }

  /**
   * Second step (A) of the Porter Algorithm<br>
   * Will be performed if nothing changed from the first step
   * or changes were done in the amment, emment, ments or ment suffixes<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   *
   * @return boolean - true if something changed in the StringBuffer
   */
  private boolean step2a() {
    return deleteFromIfTestVowelBeforeIn(RV, STEP2A, false, RV);
  }

  /**
   * Second step (B) of the Porter Algorithm<br>
   * Will be performed if step 2 A was performed unsuccessfully<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   */
  private void step2b() {
    deleteFrom(RV, STEP2B_PLAIN);

    deleteButSuffixFrom(RV, STEP2B_AFTER_E, "e", true);

    deleteFrom(R2, IONS);
  }

  /**
   * Third step of the Porter Algorithm<br>
   * Replaces a final <tt>Y</tt> by <tt>i</tt> or a final <tt>ç</tt> by <tt>c</tt>.<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   */
  private void step3() {
    if (sb.length() > 0) {
      int last = sb.length() - 1;
      char ch = sb.charAt(last);
      if (ch == 'Y') {
        sb.setCharAt(last, 'i');
        setStrings();
      } else if (ch == 'ç') {
        sb.setCharAt(last, 'c');
        setStrings();
      }
    }
  }

  /**
   * Fourth step of the Porter Algorithm<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   */
  private void step4() {
    if (sb.length() > 1) {
      char ch = sb.charAt(sb.length() - 1);
      if (ch == 's') {
        // A final 's' is kept after a, i, o, u, è or another s.
        char b = sb.charAt(sb.length() - 2);
        if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's') {
          sb.delete(sb.length() - 1, sb.length());
          setStrings();
        }
      }
    }

    // "ion" is removed when it is in R2 and preceded by s or t in RV.
    if (!deleteFromIfPrecededIn(R2, ION, RV, "s")) {
      deleteFromIfPrecededIn(R2, ION, RV, "t");
    }

    replaceFrom(RV, IER, "i");
    deleteFrom(RV, FINAL_E);
    deleteFromIfPrecededIn(RV, E_DIAERESIS, R0, "gu");
  }

  /**
   * Fifth step of the Porter Algorithm<br>
   * Drops the last letter of words ending in enn, onn, ett, ell or eill.<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   */
  private void step5() {
    if (R0 != null) {
      if (R0.endsWith("enn") || R0.endsWith("onn")
          || R0.endsWith("ett") || R0.endsWith("ell")
          || R0.endsWith("eill")) {
        sb.delete(sb.length() - 1, sb.length());
        setStrings();
      }
    }
  }

  /**
   * Sixth (and last!) step of the Porter Algorithm<br>
   * Replaces an <tt>é</tt> or <tt>è</tt> that is followed only by non-vowels
   * up to the end of the word (at least one) by <tt>e</tt>.<br>
   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
   */
  private void step6() {
    if (R0 != null && R0.length() > 0) {
      boolean seenVowel = false;
      boolean seenConson = false;
      int pos = -1;
      // Scan backwards over the trailing non-vowels until the last vowel.
      for (int i = R0.length() - 1; i > -1; i--) {
        char ch = R0.charAt(i);
        if (isVowel(ch)) {
          if (!seenVowel) {
            if (ch == 'é' || ch == 'è') {
              pos = i;
              break;
            }
          }
          seenVowel = true;
        } else {
          if (seenVowel) {
            break;
          } else {
            seenConson = true;
          }
        }
      }
      if (pos > -1 && seenConson && !seenVowel) {
        sb.setCharAt(pos, 'e');
      }
    }
  }

  /**
   * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
   * <p>
   * Unlike the other helpers this one does not set the <tt>modified</tt> flag.
   *
   * @param source java.lang.String - the primary source zone for search
   * @param search java.lang.String[] - the strings to search for suppression
   * @param from java.lang.String - the secondary source zone for search
   * @param prefix java.lang.String - the prefix to add to the search string to test
   * @return boolean - true if modified
   */
  private boolean deleteFromIfPrecededIn(String source,
      String[] search, String from, String prefix) {
    boolean found = false;
    if (source != null) {
      for (int i = 0; i < search.length; i++) {
        if (source.endsWith(search[i])) {
          if (from != null && from.endsWith(prefix + search[i])) {
            sb.delete(sb.length() - search[i].length(), sb.length());
            found = true;
            setStrings();
            break;
          }
        }
      }
    }
    return found;
  }

  /**
   * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
   * <p>
   * The preceding letter is read from the buffer; zone "from" is only used to
   * check that it is long enough to hold the suffix plus that letter.
   *
   * @param source java.lang.String - the primary source zone for search
   * @param search java.lang.String[] - the strings to search for suppression
   * @param vowel boolean - true if we need a vowel before the search string
   * @param from java.lang.String - the secondary source zone for search (where vowel could be)
   * @return boolean - true if modified
   */
  private boolean deleteFromIfTestVowelBeforeIn(String source,
      String[] search, boolean vowel, String from) {
    boolean found = false;
    if (source != null && from != null) {
      for (int i = 0; i < search.length; i++) {
        if (source.endsWith(search[i])) {
          if ((search[i].length() + 1) <= from.length()) {
            boolean test = isVowel(sb.charAt(sb.length()
                - (search[i].length() + 1)));
            if (test == vowel) {
              sb.delete(sb.length() - search[i].length(), sb.length());
              modified = true;
              found = true;
              setStrings();
              break;
            }
          }
        }
      }
    }
    return found;
  }

  /**
   * Delete a suffix searched in zone "source" if preceded by the prefix
   * (the prefix is deleted together with the suffix)
   *
   * @param source java.lang.String - the primary source zone for search
   * @param search java.lang.String[] - the strings to search for suppression
   * @param prefix java.lang.String - the prefix to add to the search string to test
   * @param without boolean - true if it will be deleted even without prefix found
   */
  private void deleteButSuffixFrom(String source, String[] search,
      String prefix, boolean without) {
    if (source != null) {
      for (int i = 0; i < search.length; i++) {
        if (source.endsWith(prefix + search[i])) {
          sb.delete(sb.length()
              - (prefix.length() + search[i].length()),
              sb.length());
          modified = true;
          setStrings();
          break;
        } else if (without && source.endsWith(search[i])) {
          sb.delete(sb.length() - search[i].length(), sb.length());
          modified = true;
          setStrings();
          break;
        }
      }
    }
  }

  /**
   * Delete a suffix searched in zone "source" if preceded by prefix<br>
   * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
   * or delete the suffix if specified
   *
   * @param source java.lang.String - the primary source zone for search
   * @param search java.lang.String[] - the strings to search for suppression
   * @param prefix java.lang.String - the prefix to add to the search string to test
   * @param without boolean - true if it will be deleted even without prefix found
   * @param from java.lang.String - the secondary source zone for search
   * @param replace java.lang.String - the string that replaces prefix + suffix
   *        when they are only found in zone "from"
   */
  private void deleteButSuffixFromElseReplace(String source,
      String[] search, String prefix, boolean without,
      String from, String replace) {
    if (source != null) {
      for (int i = 0; i < search.length; i++) {
        if (source.endsWith(prefix + search[i])) {
          sb.delete(sb.length()
              - (prefix.length() + search[i].length()),
              sb.length());
          modified = true;
          setStrings();
          break;
        } else if (from != null
            && from.endsWith(prefix + search[i])) {
          sb.replace(sb.length()
              - (prefix.length() + search[i].length()),
              sb.length(), replace);
          modified = true;
          setStrings();
          break;
        } else if (without && source.endsWith(search[i])) {
          sb.delete(sb.length() - search[i].length(), sb.length());
          modified = true;
          setStrings();
          break;
        }
      }
    }
  }

  /**
   * Replace a search string with another within the source zone
   *
   * @param source java.lang.String - the source zone for search
   * @param search java.lang.String[] - the strings to search for replacement
   * @param replace java.lang.String - the replacement string
   * @return boolean - true if a replacement was made
   */
  private boolean replaceFrom(String source, String[] search,
      String replace) {
    boolean found = false;
    if (source != null) {
      for (int i = 0; i < search.length; i++) {
        if (source.endsWith(search[i])) {
          sb.replace(sb.length() - search[i].length(), sb.length(), replace);
          modified = true;
          found = true;
          setStrings();
          break;
        }
      }
    }
    return found;
  }

  /**
   * Delete a search string within the source zone
   *
   * @param source the source zone for search
   * @param suffix the strings to search for suppression
   */
  private void deleteFrom(String source, String[] suffix) {
    if (source != null) {
      for (int i = 0; i < suffix.length; i++) {
        if (source.endsWith(suffix[i])) {
          sb.delete(sb.length() - suffix[i].length(), sb.length());
          modified = true;
          setStrings();
          break;
        }
      }
    }
  }

  /**
   * Test if a char is a french vowel, including accentuated ones.
   * Upper-case letters (in particular the U, I and Y markers) are not vowels.
   *
   * @param ch the char to test
   * @return boolean - true if the char is a vowel
   */
  private boolean isVowel(char ch) {
    switch (ch) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
      case 'y':
      case 'â':
      case 'à':
      case 'ë':
      case 'é':
      case 'ê':
      case 'è':
      case 'ï':
      case 'î':
      case 'ô':
      case 'ü':
      case 'ù':
      case 'û':
        return true;
      default:
        return false;
    }
  }

  /**
   * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
   * "R is the region after the first non-vowel following a vowel
   * or is the null region at the end of the word if there is no such non-vowel"<br>
   * @param buffer java.lang.StringBuffer - the in buffer
   * @return java.lang.String - the resulting string, or null for the empty region
   */
  private String retrieveR(StringBuffer buffer) {
    int len = buffer.length();
    int pos = -1;
    for (int c = 0; c < len; c++) {
      if (isVowel(buffer.charAt(c))) {
        pos = c;
        break;
      }
    }
    if (pos < 0) {
      return null;
    }

    int consonne = -1;
    for (int c = pos; c < len; c++) {
      if (!isVowel(buffer.charAt(c))) {
        consonne = c;
        break;
      }
    }
    if (consonne > -1 && (consonne + 1) < len) {
      return buffer.substring(consonne + 1, len);
    }
    return null;
  }

  /**
   * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
   * "If the word begins with two vowels, RV is the region after the third letter,
   * otherwise the region after the first vowel not at the beginning of the word,
   * or the end of the word if these positions cannot be found."<br>
   * Words of three letters or less have no RV zone.
   *
   * @param buffer java.lang.StringBuffer - the in buffer
   * @return java.lang.String - the resulting string, or null for the empty region
   */
  private String retrieveRV(StringBuffer buffer) {
    int len = buffer.length();
    if (len <= 3) {
      return null;
    }
    if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) {
      return buffer.substring(3, len);
    }

    // -1 means "no vowel after the first letter": the region is then empty
    // instead of starting right after the first letter.
    int pos = -1;
    for (int c = 1; c < len; c++) {
      if (isVowel(buffer.charAt(c))) {
        pos = c;
        break;
      }
    }
    if (pos > 0 && pos + 1 < len) {
      return buffer.substring(pos + 1, len);
    }
    return null;
  }

  /**
   * Turns u and i preceded AND followed by a vowel to UpperCase<br>
   * Turns y preceded OR followed by a vowel to UpperCase<br>
   * Turns u preceded by q to UpperCase<br>
   * The buffer is modified in place, so a letter that was already marked
   * no longer counts as a vowel for its right-hand neighbour.
   *
   * @param buffer java.lang.StringBuffer - the buffer to treat
   * @return java.lang.StringBuffer - the treated buffer (the same instance)
   */
  private StringBuffer treatVowels(StringBuffer buffer) {
    for (int c = 0; c < buffer.length(); c++) {
      char ch = buffer.charAt(c);

      if (c == 0) { // first char
        if (buffer.length() > 1) {
          if (ch == 'y' && isVowel(buffer.charAt(c + 1))) {
            buffer.setCharAt(c, 'Y');
          }
        }
      } else if (c == buffer.length() - 1) { // last char
        if (ch == 'u' && buffer.charAt(c - 1) == 'q') {
          buffer.setCharAt(c, 'U');
        }
        if (ch == 'y' && isVowel(buffer.charAt(c - 1))) {
          buffer.setCharAt(c, 'Y');
        }
      } else { // other cases
        if (ch == 'u') {
          if (buffer.charAt(c - 1) == 'q') {
            buffer.setCharAt(c, 'U');
          } else if (isVowel(buffer.charAt(c - 1))
              && isVowel(buffer.charAt(c + 1))) {
            buffer.setCharAt(c, 'U');
          }
        }
        if (ch == 'i') {
          if (isVowel(buffer.charAt(c - 1))
              && isVowel(buffer.charAt(c + 1))) {
            buffer.setCharAt(c, 'I');
          }
        }
        if (ch == 'y') {
          if (isVowel(buffer.charAt(c - 1))
              || isVowel(buffer.charAt(c + 1))) {
            buffer.setCharAt(c, 'Y');
          }
        }
      }
    }

    return buffer;
  }

  /**
   * Checks a term if it can be processed correctly.
   *
   * @param term the term to check
   * @return boolean - true if, and only if, the given term consists only of
   *         letters and contains no upper-case letter other than, optionally,
   *         its first one.
   */
  private boolean isStemmable(String term) {
    boolean upper = false;
    int first = -1;
    for (int c = 0; c < term.length(); c++) {
      // Discard terms that contain non-letter characters.
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
      // Discard terms that contain multiple uppercase letters.
      if (Character.isUpperCase(term.charAt(c))) {
        if (upper) {
          return false;
        }
        // First encountered uppercase letter, set flag and save
        // position.
        first = c;
        upper = true;
      }
    }
    // Discard the term if it contains a single uppercase letter that
    // is not starting the term.
    return first <= 0;
  }
}
|