001: /*
002: * Copyright 2001-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.commons.codec.language;
018:
019: import org.apache.commons.codec.EncoderException;
020: import org.apache.commons.codec.StringEncoder;
021:
022: /**
023: * Encodes a string into a metaphone value.
024: * <p>
025: * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
026: * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
027: * </p>
028: * <p>
029: * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
030: * 39.</CITE>
031: * </p>
032: *
033: * @author Apache Software Foundation
034: * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
035: */
036: public class Metaphone implements StringEncoder {
037:
038: /**
039: * Five values in the English language
040: */
041: private String vowels = "AEIOU";
042:
043: /**
044: * Variable used in Metaphone algorithm
045: */
046: private String frontv = "EIY";
047:
048: /**
049: * Variable used in Metaphone algorithm
050: */
051: private String varson = "CSPTG";
052:
053: /**
054: * The max code length for metaphone is 4
055: */
056: private int maxCodeLen = 4;
057:
058: /**
059: * Creates an instance of the Metaphone encoder
060: */
061: public Metaphone() {
062: super ();
063: }
064:
065: /**
066: * Find the metaphone value of a String. This is similar to the
067: * soundex algorithm, but better at finding similar sounding words.
068: * All input is converted to upper case.
069: * Limitations: Input format is expected to be a single ASCII word
070: * with only characters in the A - Z range, no punctuation or numbers.
071: *
072: * @param txt String to find the metaphone code for
073: * @return A metaphone code corresponding to the String supplied
074: */
075: public String metaphone(String txt) {
076: boolean hard = false;
077: if ((txt == null) || (txt.length() == 0)) {
078: return "";
079: }
080: // single character is itself
081: if (txt.length() == 1) {
082: return txt.toUpperCase();
083: }
084:
085: char[] inwd = txt.toUpperCase().toCharArray();
086:
087: StringBuffer local = new StringBuffer(40); // manipulate
088: StringBuffer code = new StringBuffer(10); // output
089: // handle initial 2 characters exceptions
090: switch (inwd[0]) {
091: case 'K':
092: case 'G':
093: case 'P': /* looking for KN, etc*/
094: if (inwd[1] == 'N') {
095: local.append(inwd, 1, inwd.length - 1);
096: } else {
097: local.append(inwd);
098: }
099: break;
100: case 'A': /* looking for AE */
101: if (inwd[1] == 'E') {
102: local.append(inwd, 1, inwd.length - 1);
103: } else {
104: local.append(inwd);
105: }
106: break;
107: case 'W': /* looking for WR or WH */
108: if (inwd[1] == 'R') { // WR -> R
109: local.append(inwd, 1, inwd.length - 1);
110: break;
111: }
112: if (inwd[1] == 'H') {
113: local.append(inwd, 1, inwd.length - 1);
114: local.setCharAt(0, 'W'); // WH -> W
115: } else {
116: local.append(inwd);
117: }
118: break;
119: case 'X': /* initial X becomes S */
120: inwd[0] = 'S';
121: local.append(inwd);
122: break;
123: default:
124: local.append(inwd);
125: } // now local has working string with initials fixed
126:
127: int wdsz = local.length();
128: int n = 0;
129:
130: while ((code.length() < this .getMaxCodeLen()) && (n < wdsz)) { // max code size of 4 works well
131: char symb = local.charAt(n);
132: // remove duplicate letters except C
133: if ((symb != 'C') && (isPreviousChar(local, n, symb))) {
134: n++;
135: } else { // not dup
136: switch (symb) {
137: case 'A':
138: case 'E':
139: case 'I':
140: case 'O':
141: case 'U':
142: if (n == 0) {
143: code.append(symb);
144: }
145: break; // only use vowel if leading char
146: case 'B':
147: if (isPreviousChar(local, n, 'M')
148: && isLastChar(wdsz, n)) { // B is silent if word ends in MB
149: break;
150: }
151: code.append(symb);
152: break;
153: case 'C': // lots of C special cases
154: /* discard if SCI, SCE or SCY */
155: if (isPreviousChar(local, n, 'S')
156: && !isLastChar(wdsz, n)
157: && (this .frontv
158: .indexOf(local.charAt(n + 1)) >= 0)) {
159: break;
160: }
161: if (regionMatch(local, n, "CIA")) { // "CIA" -> X
162: code.append('X');
163: break;
164: }
165: if (!isLastChar(wdsz, n)
166: && (this .frontv
167: .indexOf(local.charAt(n + 1)) >= 0)) {
168: code.append('S');
169: break; // CI,CE,CY -> S
170: }
171: if (isPreviousChar(local, n, 'S')
172: && isNextChar(local, n, 'H')) { // SCH->sk
173: code.append('K');
174: break;
175: }
176: if (isNextChar(local, n, 'H')) { // detect CH
177: if ((n == 0) && (wdsz >= 3)
178: && isVowel(local, 2)) { // CH consonant -> K consonant
179: code.append('K');
180: } else {
181: code.append('X'); // CHvowel -> X
182: }
183: } else {
184: code.append('K');
185: }
186: break;
187: case 'D':
188: if (!isLastChar(wdsz, n + 1)
189: && isNextChar(local, n, 'G')
190: && (this .frontv
191: .indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
192: code.append('J');
193: n += 2;
194: } else {
195: code.append('T');
196: }
197: break;
198: case 'G': // GH silent at end or before consonant
199: if (isLastChar(wdsz, n + 1)
200: && isNextChar(local, n, 'H')) {
201: break;
202: }
203: if (!isLastChar(wdsz, n + 1)
204: && isNextChar(local, n, 'H')
205: && !isVowel(local, n + 2)) {
206: break;
207: }
208: if ((n > 0)
209: && (regionMatch(local, n, "GN") || regionMatch(
210: local, n, "GNED"))) {
211: break; // silent G
212: }
213: if (isPreviousChar(local, n, 'G')) {
214: hard = true;
215: } else {
216: hard = false;
217: }
218: if (!isLastChar(wdsz, n)
219: && (this .frontv
220: .indexOf(local.charAt(n + 1)) >= 0)
221: && (!hard)) {
222: code.append('J');
223: } else {
224: code.append('K');
225: }
226: break;
227: case 'H':
228: if (isLastChar(wdsz, n)) {
229: break; // terminal H
230: }
231: if ((n > 0)
232: && (this .varson
233: .indexOf(local.charAt(n - 1)) >= 0)) {
234: break;
235: }
236: if (isVowel(local, n + 1)) {
237: code.append('H'); // Hvowel
238: }
239: break;
240: case 'F':
241: case 'J':
242: case 'L':
243: case 'M':
244: case 'N':
245: case 'R':
246: code.append(symb);
247: break;
248: case 'K':
249: if (n > 0) { // not initial
250: if (!isPreviousChar(local, n, 'C')) {
251: code.append(symb);
252: }
253: } else {
254: code.append(symb); // initial K
255: }
256: break;
257: case 'P':
258: if (isNextChar(local, n, 'H')) {
259: // PH -> F
260: code.append('F');
261: } else {
262: code.append(symb);
263: }
264: break;
265: case 'Q':
266: code.append('K');
267: break;
268: case 'S':
269: if (regionMatch(local, n, "SH")
270: || regionMatch(local, n, "SIO")
271: || regionMatch(local, n, "SIA")) {
272: code.append('X');
273: } else {
274: code.append('S');
275: }
276: break;
277: case 'T':
278: if (regionMatch(local, n, "TIA")
279: || regionMatch(local, n, "TIO")) {
280: code.append('X');
281: break;
282: }
283: if (regionMatch(local, n, "TCH")) {
284: // Silent if in "TCH"
285: break;
286: }
287: // substitute numeral 0 for TH (resembles theta after all)
288: if (regionMatch(local, n, "TH")) {
289: code.append('0');
290: } else {
291: code.append('T');
292: }
293: break;
294: case 'V':
295: code.append('F');
296: break;
297: case 'W':
298: case 'Y': // silent if not followed by vowel
299: if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
300: code.append(symb);
301: }
302: break;
303: case 'X':
304: code.append('K');
305: code.append('S');
306: break;
307: case 'Z':
308: code.append('S');
309: break;
310: } // end switch
311: n++;
312: } // end else from symb != 'C'
313: if (code.length() > this .getMaxCodeLen()) {
314: code.setLength(this .getMaxCodeLen());
315: }
316: }
317: return code.toString();
318: }
319:
320: private boolean isVowel(StringBuffer string, int index) {
321: return (this .vowels.indexOf(string.charAt(index)) >= 0);
322: }
323:
324: private boolean isPreviousChar(StringBuffer string, int index,
325: char c) {
326: boolean matches = false;
327: if (index > 0 && index < string.length()) {
328: matches = string.charAt(index - 1) == c;
329: }
330: return matches;
331: }
332:
333: private boolean isNextChar(StringBuffer string, int index, char c) {
334: boolean matches = false;
335: if (index >= 0 && index < string.length() - 1) {
336: matches = string.charAt(index + 1) == c;
337: }
338: return matches;
339: }
340:
341: private boolean regionMatch(StringBuffer string, int index,
342: String test) {
343: boolean matches = false;
344: if (index >= 0 && (index + test.length() - 1) < string.length()) {
345: String substring = string.substring(index, index
346: + test.length());
347: matches = substring.equals(test);
348: }
349: return matches;
350: }
351:
352: private boolean isLastChar(int wdsz, int n) {
353: return n + 1 == wdsz;
354: }
355:
356: /**
357: * Encodes an Object using the metaphone algorithm. This method
358: * is provided in order to satisfy the requirements of the
359: * Encoder interface, and will throw an EncoderException if the
360: * supplied object is not of type java.lang.String.
361: *
362: * @param pObject Object to encode
363: * @return An object (or type java.lang.String) containing the
364: * metaphone code which corresponds to the String supplied.
365: * @throws EncoderException if the parameter supplied is not
366: * of type java.lang.String
367: */
368: public Object encode(Object pObject) throws EncoderException {
369: if (!(pObject instanceof java.lang.String)) {
370: throw new EncoderException(
371: "Parameter supplied to Metaphone encode is not of type java.lang.String");
372: }
373: return metaphone((String) pObject);
374: }
375:
376: /**
377: * Encodes a String using the Metaphone algorithm.
378: *
379: * @param pString String object to encode
380: * @return The metaphone code corresponding to the String supplied
381: */
382: public String encode(String pString) {
383: return metaphone(pString);
384: }
385:
386: /**
387: * Tests is the metaphones of two strings are identical.
388: *
389: * @param str1 First of two strings to compare
390: * @param str2 Second of two strings to compare
391: * @return true if the metaphones of these strings are identical,
392: * false otherwise.
393: */
394: public boolean isMetaphoneEqual(String str1, String str2) {
395: return metaphone(str1).equals(metaphone(str2));
396: }
397:
398: /**
399: * Returns the maxCodeLen.
400: * @return int
401: */
402: public int getMaxCodeLen() {
403: return this .maxCodeLen;
404: }
405:
406: /**
407: * Sets the maxCodeLen.
408: * @param maxCodeLen The maxCodeLen to set
409: */
410: public void setMaxCodeLen(int maxCodeLen) {
411: this.maxCodeLen = maxCodeLen;
412: }
413:
414: }
|