001: package org.apache.lucene.analysis.nl;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.util.Map;
021:
022: /**
023: *
024: * A stemmer for Dutch words. The algorithm is an implementation of
025: * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
026: * algorithm in Martin Porter's snowball project.
027: *
028: * @author Edwin de Jonge (ejne at cbs.nl)
029: */
030:
public class DutchStemmer {
  /** Endings handled by the "en" rule, tried in this order. */
  private static final String[] EN_ENDINGS = { "ene", "en" };

  /** Consonants whose doubled form at the end of a stem is reduced to a single letter. */
  private static final String UNDOUBLE_CONSONANTS = "ktdnmf";

  /**
   * Buffer for the terms while stemming them.
   */
  private StringBuffer sb = new StringBuffer();

  /** Set by {@link #step2(StringBuffer)}: true when that step removed a final "e". */
  private boolean _removedE;

  /** Optional lookup table consulted before the algorithm runs; may be null. */
  private Map _stemDict;

  /** Start index of region R1 in {@link #sb}; {@link #stem(String)} forces it to be at least 3. */
  private int _R1;

  /** Start index of region R2 in {@link #sb}. */
  private int _R2;

  /**
   * Stems the given term to a unique <tt>discriminator</tt>.
   *
   * <p>The term is lower-cased independently of the JVM default locale. Terms that are
   * empty or contain a non-letter character are returned (lower-cased) without stemming.
   * If a stem dictionary has been set and contains the lower-cased term, the dictionary
   * value is returned when it is a String, and <tt>null</tt> otherwise.</p>
   *
   * <p>This method mutates instance fields while it runs, so one instance must not be
   * used by several threads at the same time without external synchronization.</p>
   *
   * @param term The term that should be stemmed.
   * @return Discriminator for <tt>term</tt>, or <tt>null</tt> if the stem dictionary maps
   *         the term to something that is not a String
   */
  public String stem(String term) {
    // Locale.ROOT keeps 'I' -> 'i' even when the default locale is e.g. Turkish.
    term = term.toLowerCase(java.util.Locale.ROOT);
    // The empty string has nothing to stem; the steps below assume at least one char.
    if (term.length() == 0 || !isStemmable(term)) {
      return term;
    }
    if (_stemDict != null && _stemDict.containsKey(term)) {
      Object override = _stemDict.get(term);
      return (override instanceof String) ? (String) override : null;
    }

    // Reset the StringBuffer.
    sb.setLength(0);
    sb.append(term);
    // Stemming starts here...
    substitute(sb);
    storeYandI(sb);
    _R1 = Math.max(3, getRIndex(sb, 0));
    step1(sb);
    step2(sb);
    _R2 = getRIndex(sb, _R1);
    step3a(sb);
    step3b(sb);
    step4(sb);
    reStoreYandI(sb);
    return sb.toString();
  }

  /**
   * Removes a trailing "ene" or "en" when it starts inside R1 and is preceded by a valid
   * en-ending (see {@link #isValidEnEnding(StringBuffer, int)}), then undoubles the new
   * ending.
   *
   * @param sb String being stemmed
   * @return true if an ending was removed
   */
  private boolean enEnding(StringBuffer sb) {
    String s = sb.toString();
    for (int i = 0; i < EN_ENDINGS.length; i++) {
      String end = EN_ENDINGS[i];
      int index = s.length() - end.length();
      // index >= _R1 >= 3, so index - 1 is always a valid position.
      if (s.endsWith(end) && index >= _R1
          && isValidEnEnding(sb, index - 1)) {
        sb.delete(index, index + end.length());
        unDouble(sb, index);
        return true;
      }
    }
    return false;
  }

  /**
   * Step 1: replace a trailing "heden" by "heid", or remove "ene"/"en", or remove
   * "se"/"s" after a valid s-ending. Every suffix must start inside R1.
   *
   * @param sb String being stemmed
   */
  private void step1(StringBuffer sb) {
    if (_R1 >= sb.length()) {
      return;
    }

    String s = sb.toString();

    if (s.endsWith("heden")) {
      int hedenStart = s.length() - 5;
      // Only the suffix itself is rewritten; earlier "heden" occurrences are left alone.
      if (hedenStart >= _R1) {
        sb.replace(hedenStart, sb.length(), "heid");
      }
      return;
    }

    if (enEnding(sb)) {
      return;
    }

    // enEnding returned false, so sb is unchanged and s is still current.
    int index = s.length() - 2;
    if (s.endsWith("se") && index >= _R1
        && isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 2);
      return;
    }
    index = s.length() - 1;
    if (s.endsWith("s") && index >= _R1
        && isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 1);
    }
  }

  /**
   * Delete suffix e if in R1 and
   * preceded by a non-vowel, and then undouble the ending.
   * Records in {@link #_removedE} whether the e was removed.
   *
   * @param sb String being stemmed
   */
  private void step2(StringBuffer sb) {
    _removedE = false;
    if (_R1 >= sb.length()) {
      return;
    }
    int index = sb.length() - 1;
    if (index >= _R1 && sb.charAt(index) == 'e'
        && !isVowel(sb.charAt(index - 1))) {
      sb.deleteCharAt(index);
      unDouble(sb);
      _removedE = true;
    }
  }

  /**
   * Delete "heid" if in R2 and not preceded by "c", then treat a preceding
   * "en"/"ene" as in {@link #enEnding(StringBuffer)}.
   *
   * @param sb String being stemmed
   */
  private void step3a(StringBuffer sb) {
    if (_R2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index = s.length() - 4;
    if (s.endsWith("heid") && index >= _R2
        && sb.charAt(index - 1) != 'c') {
      sb.delete(index, index + 4); // remove heid
      enEnding(sb);
    }
  }

  /**
   * <p>A d-suffix, or derivational suffix, enables a new word,
   * often with a different grammatical category, or with a different
   * sense, to be built from another word. Whether a d-suffix can be
   * attached is discovered not from the rules of grammar, but by
   * referring to a dictionary. So in English, ness can be added to
   * certain adjectives to form corresponding nouns (littleness,
   * kindness, foolishness ...) but not to all adjectives
   * (not for example, to big, cruel, wise ...) d-suffixes can be
   * used to change meaning, often in rather exotic ways.</p>
   * Remove "ing", "end", "ig", "lijk", "baar" and "bar".
   * Each suffix is only removed when it starts inside R2.
   *
   * @param sb String being stemmed
   */
  private void step3b(StringBuffer sb) {
    if (_R2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index;

    if (s.endsWith("end") || s.endsWith("ing")) {
      index = s.length() - 3;
      if (index >= _R2) {
        sb.delete(index, index + 3);
        if (sb.charAt(index - 2) == 'i'
            && sb.charAt(index - 1) == 'g') {
          // A preceding "ig" is removed too when it lies in R2 and does not follow "e".
          if (index - 2 >= _R2 && sb.charAt(index - 3) != 'e') {
            index -= 2;
            sb.delete(index, index + 2);
          }
        } else {
          unDouble(sb, index);
        }
        return;
      }
    }
    if (s.endsWith("ig") && (index = s.length() - 2) >= _R2) {
      if (sb.charAt(index - 1) != 'e') {
        sb.delete(index, index + 2);
      }
      return;
    }
    if (s.endsWith("lijk") && (index = s.length() - 4) >= _R2) {
      sb.delete(index, index + 4);
      step2(sb);
      return;
    }
    if (s.endsWith("baar") && (index = s.length() - 4) >= _R2) {
      sb.delete(index, index + 4);
      return;
    }
    if (s.endsWith("bar") && (index = s.length() - 3) >= _R2) {
      // "bar" only goes when step 2 actually removed an e.
      if (_removedE) {
        sb.delete(index, index + 3);
      }
    }
  }

  /**
   * undouble vowel
   * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is
   * a doubled vowel other than i, remove one of the vowels from V
   * (for example, maan -> man, brood -> brod).
   *
   * @param sb String being stemmed
   */
  private void step4(StringBuffer sb) {
    int length = sb.length();
    if (length < 4) {
      return;
    }
    char c = sb.charAt(length - 4);
    char v1 = sb.charAt(length - 3);
    char v2 = sb.charAt(length - 2);
    char d = sb.charAt(length - 1);
    if (v1 == v2 && d != 'I' && v1 != 'i' && isVowel(v1)
        && !isVowel(d) && !isVowel(c)) {
      sb.deleteCharAt(length - 2);
    }
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, the given term consists only of letters
   *         (which includes the empty term).
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Replaces a, e, i, o and u carrying a diaeresis or an acute accent by the plain vowel.
   * The accented letters are written as Unicode escapes so the source does not depend on
   * the compiler's file encoding.
   *
   * @param buffer String being stemmed
   */
  private void substitute(StringBuffer buffer) {
    for (int i = 0; i < buffer.length(); i++) {
      switch (buffer.charAt(i)) {
        case '\u00e4': // a with diaeresis
        case '\u00e1': // a with acute
          buffer.setCharAt(i, 'a');
          break;
        case '\u00eb': // e with diaeresis
        case '\u00e9': // e with acute
          buffer.setCharAt(i, 'e');
          break;
        case '\u00fc': // u with diaeresis
        case '\u00fa': // u with acute
          buffer.setCharAt(i, 'u');
          break;
        case '\u00ef': // i with diaeresis
        case '\u00ed': // i with acute
          buffer.setCharAt(i, 'i');
          break;
        case '\u00f6': // o with diaeresis
        case '\u00f3': // o with acute
          buffer.setCharAt(i, 'o');
          break;
        default:
          break;
      }
    }
  }

  /**
   * Tells whether the character at <tt>index</tt> may precede a removable "s"/"se":
   * it must be a non-vowel other than "j".
   *
   * @param sb String being stemmed
   * @param index position of the character in front of the suffix
   */
  private boolean isValidSEnding(StringBuffer sb, int index) {
    char c = sb.charAt(index);
    return !(isVowel(c) || c == 'j');
  }

  /**
   * Tells whether the character at <tt>index</tt> may precede a removable "en"/"ene":
   * it must be a non-vowel, and the text up to it must not end in "gem".
   *
   * @param sb String being stemmed
   * @param index position of the character in front of the suffix
   */
  private boolean isValidEnEnding(StringBuffer sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c)) {
      return false;
    }
    // ends with "gem"?
    if (c == 'm' && index >= 2 && sb.charAt(index - 2) == 'g'
        && sb.charAt(index - 1) == 'e') {
      return false;
    }
    return true;
  }

  /**
   * Undoubles the ending of the whole buffer.
   *
   * @param sb String being stemmed
   */
  private void unDouble(StringBuffer sb) {
    unDouble(sb, sb.length());
  }

  /**
   * If the text before <tt>endIndex</tt> ends in kk, tt, dd, nn, mm or ff, removes the
   * second letter of that pair.
   *
   * @param sb String being stemmed
   * @param endIndex exclusive end of the part of the buffer to inspect
   */
  private void unDouble(StringBuffer sb, int endIndex) {
    if (endIndex < 2) {
      return;
    }
    char last = sb.charAt(endIndex - 1);
    if (last == sb.charAt(endIndex - 2)
        && UNDOUBLE_CONSONANTS.indexOf(last) >= 0) {
      sb.deleteCharAt(endIndex - 1);
    }
  }

  /**
   * Finds the start of the region that follows the first non-vowel preceded by a vowel,
   * scanning from <tt>start</tt> (a start of 0 is treated as 1).
   *
   * @param sb String being stemmed
   * @param start first position to examine
   * @return index just past that non-vowel, or one past the position where the scan
   *         stopped when there is no such non-vowel
   */
  private int getRIndex(StringBuffer sb, int start) {
    int i = (start == 0) ? 1 : start;
    for (; i < sb.length(); i++) {
      // first non-vowel preceded by a vowel
      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
        return i + 1;
      }
    }
    return i + 1;
  }

  /**
   * Marks letters that act as consonants by upper-casing them: an initial y, a y that
   * follows a vowel, and an i between two vowels. Upper-case letters are not vowels for
   * {@link #isVowel(char)}. Requires a non-empty buffer.
   *
   * @param sb String being stemmed
   */
  private void storeYandI(StringBuffer sb) {
    if (sb.charAt(0) == 'y') {
      sb.setCharAt(0, 'Y');
    }

    int last = sb.length() - 1;

    for (int i = 1; i < last; i++) {
      char current = sb.charAt(i);
      if (current == 'i') {
        if (isVowel(sb.charAt(i - 1)) && isVowel(sb.charAt(i + 1))) {
          sb.setCharAt(i, 'I');
        }
      } else if (current == 'y') {
        if (isVowel(sb.charAt(i - 1))) {
          sb.setCharAt(i, 'Y');
        }
      }
    }
    if (last > 0 && sb.charAt(last) == 'y'
        && isVowel(sb.charAt(last - 1))) {
      sb.setCharAt(last, 'Y');
    }
  }

  /**
   * Turns the markers written by {@link #storeYandI(StringBuffer)} back into lower case.
   *
   * @param sb String being stemmed
   */
  private void reStoreYandI(StringBuffer sb) {
    for (int i = 0; i < sb.length(); i++) {
      char current = sb.charAt(i);
      if (current == 'I') {
        sb.setCharAt(i, 'i');
      } else if (current == 'Y') {
        sb.setCharAt(i, 'y');
      }
    }
  }

  /**
   * Tells whether <tt>c</tt> is one of a, e, i, o, u, y or e with grave accent.
   */
  private boolean isVowel(char c) {
    switch (c) {
      case 'e':
      case 'a':
      case 'o':
      case 'i':
      case 'u':
      case 'y':
      case '\u00e8': // e with grave
        return true;
      default:
        return false;
    }
  }

  /**
   * Sets the lookup table that {@link #stem(String)} consults before running the
   * algorithm. Keys are compared against the lower-cased term.
   *
   * @param dict mapping from term to stem, or null to disable the lookup
   */
  void setStemDictionary(Map dict) {
    _stemDict = dict;
  }

}
|