001: package org.apache.lucene.analysis.de;
002:
003: // This file is encoded in UTF-8
004:
005: /**
006: * Licensed to the Apache Software Foundation (ASF) under one or more
007: * contributor license agreements. See the NOTICE file distributed with
008: * this work for additional information regarding copyright ownership.
009: * The ASF licenses this file to You under the Apache License, Version 2.0
010: * (the "License"); you may not use this file except in compliance with
011: * the License. You may obtain a copy of the License at
012: *
013: * http://www.apache.org/licenses/LICENSE-2.0
014: *
015: * Unless required by applicable law or agreed to in writing, software
016: * distributed under the License is distributed on an "AS IS" BASIS,
017: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018: * See the License for the specific language governing permissions and
019: * limitations under the License.
020: */
021:
022: /**
023: * A stemmer for German words. The algorithm is based on the report
024: * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
025: * Caumanns (joerg.caumanns at isst.fhg.de).
026: *
027: *
028: * @version $Id: GermanStemmer.java 564236 2007-08-09 15:21:19Z gsingers $
029: */
public class GermanStemmer {
    /*
     * Non-ASCII characters are spelled as Unicode escapes so that the class
     * compiles to the same constants whatever source encoding the compiler
     * is told to assume.
     */

    /** Lower-case a-umlaut; replaced by <code>a</code> in <code>substitute()</code>. */
    private static final char A_UMLAUT = '\u00e4';

    /** Lower-case o-umlaut; replaced by <code>o</code> in <code>substitute()</code>. */
    private static final char O_UMLAUT = '\u00f6';

    /** Lower-case u-umlaut; replaced by <code>u</code> in <code>substitute()</code>. */
    private static final char U_UMLAUT = '\u00fc';

    /** Sharp s; replaced by <code>ss</code> in <code>substitute()</code>. */
    private static final char SHARP_S = '\u00df';

    /** Section sign (<code>&sect;</code>), the placeholder token for "ch". */
    private static final char CH_TOKEN = '\u00a7';

    /**
     * Buffer for the term while it is being stemmed. It is reused by every
     * call to <code>stem()</code>, so one instance must not be used by
     * several threads at the same time.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Number of characters counted as removed by <code>substitute()</code>
     * for the current term. <code>strip()</code> adds it to the buffer length
     * when it applies its minimum-length restrictions.
     */
    private int substCount = 0;

    /**
     * Stems the given term to a unique <code>discriminator</code>.
     *
     * The term is lower-cased with German locale rules, independently of the
     * JVM default locale. A term that contains any non-letter character is
     * returned lower-cased but otherwise untouched. The empty string is
     * returned unchanged.
     *
     * @param term The term that should be stemmed.
     * @return Discriminator for <code>term</code>
     * @throws NullPointerException if <code>term</code> is <code>null</code>
     */
    protected String stem(String term) {
        // A fixed locale keeps e.g. "I" -> "i" even under a Turkish default locale.
        term = term.toLowerCase(java.util.Locale.GERMAN);
        if (!isStemmable(term)) {
            return term;
        }
        // Reset the shared buffer and load the new term.
        sb.setLength(0);
        sb.append(term);
        // The order matters: tokens introduced by substitute() must be in
        // place before stripping and are expanded again by resubstitute().
        substitute(sb);
        strip(sb);
        optimize(sb);
        resubstitute(sb);
        removeParticleDenotion(sb);
        return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, the given term consists only of letters
     *         (which includes the empty term).
     */
    private boolean isStemmable(String term) {
        for (int c = 0; c < term.length(); c++) {
            if (!Character.isLetter(term.charAt(c))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether the current content of the buffer ends with the given suffix.
     */
    private static boolean endsWith(StringBuffer buffer, String suffix) {
        int offset = buffer.length() - suffix.length();
        if (offset < 0) {
            return false;
        }
        for (int i = 0; i < suffix.length(); i++) {
            if (buffer.charAt(offset + i) != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Suffix stripping (stemming) on the current term. The stripping is
     * reduced to the seven "base" suffixes "e", "s", "n", "t", "em", "er"
     * and "nd", from which all regular suffixes are built. The simplification
     * causes some overstemming, and way more irregular stems, but still
     * provides unique discriminators in most of those cases.
     * The algorithm is context free, except for the length restrictions.
     */
    private void strip(StringBuffer buffer) {
        boolean doMore = true;
        // Never strip a buffer down below three characters.
        while (doMore && buffer.length() > 3) {
            int length = buffer.length();
            char last = buffer.charAt(length - 1);
            if ((length + substCount > 5) && endsWith(buffer, "nd")) {
                buffer.setLength(length - 2);
            } else if ((length + substCount > 4)
                    && (endsWith(buffer, "em") || endsWith(buffer, "er"))) {
                buffer.setLength(length - 2);
            } else if (last == 'e' || last == 's' || last == 'n' || last == 't') {
                // "t" occurs only as suffix of verbs.
                buffer.setLength(length - 1);
            } else {
                doMore = false;
            }
        }
    }

    /**
     * Does some optimizations on the term. These optimizations are
     * contextual.
     */
    private void optimize(StringBuffer buffer) {
        // Additional step for female plurals of professions and inhabitants.
        // The trailing '*' is the placeholder substitute() wrote for the
        // second "n" of a doubled "nn".
        if (buffer.length() > 5 && endsWith(buffer, "erin*")) {
            buffer.deleteCharAt(buffer.length() - 1);
            strip(buffer);
        }
        // Additional step for irregular plural nouns like "Matrizen -> Matrix".
        // The length guard keeps an empty term from failing on charAt().
        if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == 'z') {
            buffer.setCharAt(buffer.length() - 1, 'x');
        }
    }

    /**
     * Removes a participle marker ("ge") from a term: the first occurrence
     * of "gege" loses its leading "ge". Terms of four characters or fewer
     * are left alone.
     */
    private void removeParticleDenotion(StringBuffer buffer) {
        if (buffer.length() > 4) {
            int at = buffer.indexOf("gege");
            if (at >= 0) {
                buffer.delete(at, at + 2);
            }
        }
    }

    /**
     * Does some substitutions for the term to reduce overstemming:
     *
     * - Substitute umlauts with their corresponding vowel (a/o/u), and
     *   sharp s with "ss".
     * - Substitute the second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a one-char token:
     *   "sch" -> '$', "ch" -> section sign, "ei" -> '%', "ie" -> ampersand,
     *   "ig" -> '#', "st" -> '!'
     *
     * Resets and then updates <code>substCount</code>.
     */
    private void substitute(StringBuffer buffer) {
        substCount = 0;
        // buffer.length() is re-read on every pass because the loop body
        // inserts and deletes characters.
        for (int c = 0; c < buffer.length(); c++) {
            char current = buffer.charAt(c);
            if (c > 0 && current == buffer.charAt(c - 1)) {
                // Replace the second char of a pair of equal characters.
                buffer.setCharAt(c, '*');
            } else if (current == A_UMLAUT) {
                buffer.setCharAt(c, 'a');
            } else if (current == O_UMLAUT) {
                buffer.setCharAt(c, 'o');
            } else if (current == U_UMLAUT) {
                buffer.setCharAt(c, 'u');
            } else if (current == SHARP_S) {
                // Handled outside the pair check below so that a sharp s at
                // the very end of a word is replaced as well.
                buffer.setCharAt(c, 's');
                buffer.insert(c + 1, 's');
                // NOTE(review): the counter is incremented although the
                // buffer grows here; kept exactly as in the original code.
                substCount++;
            }
            // Combinations need at least one more character to the right.
            if (c < buffer.length() - 1) {
                // Re-read position c: it may just have been rewritten above.
                char first = buffer.charAt(c);
                char second = buffer.charAt(c + 1);
                if ((c < buffer.length() - 2) && first == 's' && second == 'c'
                        && buffer.charAt(c + 2) == 'h') {
                    buffer.setCharAt(c, '$');
                    buffer.delete(c + 1, c + 3);
                    // Two characters were removed; add to the running count.
                    substCount += 2;
                } else if (first == 'c' && second == 'h') {
                    replacePair(buffer, c, CH_TOKEN);
                } else if (first == 'e' && second == 'i') {
                    replacePair(buffer, c, '%');
                } else if (first == 'i' && second == 'e') {
                    replacePair(buffer, c, '&');
                } else if (first == 'i' && second == 'g') {
                    replacePair(buffer, c, '#');
                } else if (first == 's' && second == 't') {
                    replacePair(buffer, c, '!');
                }
            }
        }
    }

    /**
     * Replaces the two characters starting at <code>index</code> with the
     * single <code>token</code> and counts the removed character.
     */
    private void replacePair(StringBuffer buffer, int index, char token) {
        buffer.setCharAt(index, token);
        buffer.deleteCharAt(index + 1);
        substCount++;
    }

    /**
     * Undoes the changes made by substitute(), i.e. the character pairs and
     * character combinations. Umlauts remain as their corresponding vowel,
     * and sharp s remains as "ss".
     */
    private void resubstitute(StringBuffer buffer) {
        for (int c = 0; c < buffer.length(); c++) {
            switch (buffer.charAt(c)) {
                case '*':
                    // substitute() only writes '*' at an index greater than
                    // zero and strip() only removes from the end, so there is
                    // always a character to the left to copy.
                    buffer.setCharAt(c, buffer.charAt(c - 1));
                    break;
                case '$':
                    buffer.setCharAt(c, 's');
                    buffer.insert(c + 1, "ch");
                    break;
                case CH_TOKEN:
                    buffer.setCharAt(c, 'c');
                    buffer.insert(c + 1, 'h');
                    break;
                case '%':
                    buffer.setCharAt(c, 'e');
                    buffer.insert(c + 1, 'i');
                    break;
                case '&':
                    buffer.setCharAt(c, 'i');
                    buffer.insert(c + 1, 'e');
                    break;
                case '#':
                    buffer.setCharAt(c, 'i');
                    buffer.insert(c + 1, 'g');
                    break;
                case '!':
                    buffer.setCharAt(c, 's');
                    buffer.insert(c + 1, 't');
                    break;
                default:
                    break;
            }
        }
    }
}
|