0001: /*
0002: * Copyright 2001-2004 The Apache Software Foundation.
0003: *
0004: * Licensed under the Apache License, Version 2.0 (the "License");
0005: * you may not use this file except in compliance with the License.
0006: * You may obtain a copy of the License at
0007: *
0008: * http://www.apache.org/licenses/LICENSE-2.0
0009: *
0010: * Unless required by applicable law or agreed to in writing, software
0011: * distributed under the License is distributed on an "AS IS" BASIS,
0012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0013: * See the License for the specific language governing permissions and
0014: * limitations under the License.
0015: */
0016:
0017: package org.apache.commons.codec.language;
0018:
0019: import org.apache.commons.codec.EncoderException;
0020: import org.apache.commons.codec.StringEncoder;
0021:
0022: /**
0023: * Encodes a string into a double metaphone value.
0024: * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
0025: * <ul>
0026: * <li>Original Article: <a
0027: * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
0028: * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
0029: * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
0030: * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
0031: * </ul>
0032: *
0033: * @author Apache Software Foundation
0034: * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
0035: */
0036: public class DoubleMetaphone implements StringEncoder {
0037:
0038: /**
0039: * "Vowels" to test for
0040: */
0041: private static final String VOWELS = "AEIOUY";
0042:
0043: /**
0044: * Prefixes when present which are not pronounced
0045: */
0046: private static final String[] SILENT_START = { "GN", "KN", "PN",
0047: "WR", "PS" };
0048: private static final String[] L_R_N_M_B_H_F_V_W_SPACE = { "L", "R",
0049: "N", "M", "B", "H", "F", "V", "W", " " };
0050: private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = {
0051: "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI",
0052: "ER" };
0053: private static final String[] L_T_K_S_N_M_B_Z = { "L", "T", "K",
0054: "S", "N", "M", "B", "Z" };
0055:
0056: /**
0057: * Maximum length of an encoding, default is 4
0058: */
0059: protected int maxCodeLen = 4;
0060:
0061: /**
0062: * Creates an instance of this DoubleMetaphone encoder
0063: */
0064: public DoubleMetaphone() {
0065: super ();
0066: }
0067:
0068: /**
0069: * Encode a value with Double Metaphone
0070: *
0071: * @param value String to encode
0072: * @return an encoded string
0073: */
0074: public String doubleMetaphone(String value) {
0075: return doubleMetaphone(value, false);
0076: }
0077:
0078: /**
0079: * Encode a value with Double Metaphone, optionally using the alternate
0080: * encoding.
0081: *
0082: * @param value String to encode
0083: * @param alternate use alternate encode
0084: * @return an encoded string
0085: */
0086: public String doubleMetaphone(String value, boolean alternate) {
0087: value = cleanInput(value);
0088: if (value == null) {
0089: return null;
0090: }
0091:
0092: boolean slavoGermanic = isSlavoGermanic(value);
0093: int index = isSilentStart(value) ? 1 : 0;
0094:
0095: DoubleMetaphoneResult result = new DoubleMetaphoneResult(this
0096: .getMaxCodeLen());
0097:
0098: while (!result.isComplete() && index <= value.length() - 1) {
0099: switch (value.charAt(index)) {
0100: case 'A':
0101: case 'E':
0102: case 'I':
0103: case 'O':
0104: case 'U':
0105: case 'Y':
0106: index = handleAEIOUY(value, result, index);
0107: break;
0108: case 'B':
0109: result.append('P');
0110: index = charAt(value, index + 1) == 'B' ? index + 2
0111: : index + 1;
0112: break;
0113: case '\u00C7':
0114: // A C with a Cedilla
0115: result.append('S');
0116: index++;
0117: break;
0118: case 'C':
0119: index = handleC(value, result, index);
0120: break;
0121: case 'D':
0122: index = handleD(value, result, index);
0123: break;
0124: case 'F':
0125: result.append('F');
0126: index = charAt(value, index + 1) == 'F' ? index + 2
0127: : index + 1;
0128: break;
0129: case 'G':
0130: index = handleG(value, result, index, slavoGermanic);
0131: break;
0132: case 'H':
0133: index = handleH(value, result, index);
0134: break;
0135: case 'J':
0136: index = handleJ(value, result, index, slavoGermanic);
0137: break;
0138: case 'K':
0139: result.append('K');
0140: index = charAt(value, index + 1) == 'K' ? index + 2
0141: : index + 1;
0142: break;
0143: case 'L':
0144: index = handleL(value, result, index);
0145: break;
0146: case 'M':
0147: result.append('M');
0148: index = conditionM0(value, index) ? index + 2
0149: : index + 1;
0150: break;
0151: case 'N':
0152: result.append('N');
0153: index = charAt(value, index + 1) == 'N' ? index + 2
0154: : index + 1;
0155: break;
0156: case '\u00D1':
0157: // N with a tilde (spanish ene)
0158: result.append('N');
0159: index++;
0160: break;
0161: case 'P':
0162: index = handleP(value, result, index);
0163: break;
0164: case 'Q':
0165: result.append('K');
0166: index = charAt(value, index + 1) == 'Q' ? index + 2
0167: : index + 1;
0168: break;
0169: case 'R':
0170: index = handleR(value, result, index, slavoGermanic);
0171: break;
0172: case 'S':
0173: index = handleS(value, result, index, slavoGermanic);
0174: break;
0175: case 'T':
0176: index = handleT(value, result, index);
0177: break;
0178: case 'V':
0179: result.append('F');
0180: index = charAt(value, index + 1) == 'V' ? index + 2
0181: : index + 1;
0182: break;
0183: case 'W':
0184: index = handleW(value, result, index);
0185: break;
0186: case 'X':
0187: index = handleX(value, result, index);
0188: break;
0189: case 'Z':
0190: index = handleZ(value, result, index, slavoGermanic);
0191: break;
0192: default:
0193: index++;
0194: break;
0195: }
0196: }
0197:
0198: return alternate ? result.getAlternate() : result.getPrimary();
0199: }
0200:
0201: /**
0202: * Encode the value using DoubleMetaphone. It will only work if
0203: * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
0204: *
0205: * @param obj Object to encode (should be of type String)
0206: * @return An encoded Object (will be of type String)
0207: * @throws EncoderException encode parameter is not of type String
0208: */
0209: public Object encode(Object obj) throws EncoderException {
0210: if (!(obj instanceof String)) {
0211: throw new EncoderException(
0212: "DoubleMetaphone encode parameter is not of type String");
0213: }
0214: return doubleMetaphone((String) obj);
0215: }
0216:
0217: /**
0218: * Encode the value using DoubleMetaphone.
0219: *
0220: * @param value String to encode
0221: * @return An encoded String
0222: */
0223: public String encode(String value) {
0224: return doubleMetaphone(value);
0225: }
0226:
0227: /**
0228: * Check if the Double Metaphone values of two <code>String</code> values
0229: * are equal.
0230: *
0231: * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
0232: * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
0233: * @return <code>true</code> if the encoded <code>String</code>s are equal;
0234: * <code>false</code> otherwise.
0235: * @see #isDoubleMetaphoneEqual(String,String,boolean)
0236: */
0237: public boolean isDoubleMetaphoneEqual(String value1, String value2) {
0238: return isDoubleMetaphoneEqual(value1, value2, false);
0239: }
0240:
0241: /**
0242: * Check if the Double Metaphone values of two <code>String</code> values
0243: * are equal, optionally using the alternate value.
0244: *
0245: * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
0246: * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
0247: * @param alternate use the alternate value if <code>true</code>.
0248: * @return <code>true</code> if the encoded <code>String</code>s are equal;
0249: * <code>false</code> otherwise.
0250: */
0251: public boolean isDoubleMetaphoneEqual(String value1, String value2,
0252: boolean alternate) {
0253: return doubleMetaphone(value1, alternate).equals(
0254: doubleMetaphone(value2, alternate));
0255: }
0256:
0257: /**
0258: * Returns the maxCodeLen.
0259: * @return int
0260: */
0261: public int getMaxCodeLen() {
0262: return this .maxCodeLen;
0263: }
0264:
0265: /**
0266: * Sets the maxCodeLen.
0267: * @param maxCodeLen The maxCodeLen to set
0268: */
0269: public void setMaxCodeLen(int maxCodeLen) {
0270: this .maxCodeLen = maxCodeLen;
0271: }
0272:
0273: //-- BEGIN HANDLERS --//
0274:
0275: /**
0276: * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
0277: */
0278: private int handleAEIOUY(String value,
0279: DoubleMetaphoneResult result, int index) {
0280: if (index == 0) {
0281: result.append('A');
0282: }
0283: return index + 1;
0284: }
0285:
0286: /**
0287: * Handles 'C' cases
0288: */
0289: private int handleC(String value, DoubleMetaphoneResult result,
0290: int index) {
0291: if (conditionC0(value, index)) { // very confusing, moved out
0292: result.append('K');
0293: index += 2;
0294: } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
0295: result.append('S');
0296: index += 2;
0297: } else if (contains(value, index, 2, "CH")) {
0298: index = handleCH(value, result, index);
0299: } else if (contains(value, index, 2, "CZ")
0300: && !contains(value, index - 2, 4, "WICZ")) {
0301: //-- "Czerny" --//
0302: result.append('S', 'X');
0303: index += 2;
0304: } else if (contains(value, index + 1, 3, "CIA")) {
0305: //-- "focaccia" --//
0306: result.append('X');
0307: index += 3;
0308: } else if (contains(value, index, 2, "CC")
0309: && !(index == 1 && charAt(value, 0) == 'M')) {
0310: //-- double "cc" but not "McClelland" --//
0311: return handleCC(value, result, index);
0312: } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
0313: result.append('K');
0314: index += 2;
0315: } else if (contains(value, index, 2, "CI", "CE", "CY")) {
0316: //-- Italian vs. English --//
0317: if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
0318: result.append('S', 'X');
0319: } else {
0320: result.append('S');
0321: }
0322: index += 2;
0323: } else {
0324: result.append('K');
0325: if (contains(value, index + 1, 2, " C", " Q", " G")) {
0326: //-- Mac Caffrey, Mac Gregor --//
0327: index += 3;
0328: } else if (contains(value, index + 1, 1, "C", "K", "Q")
0329: && !contains(value, index + 1, 2, "CE", "CI")) {
0330: index += 2;
0331: } else {
0332: index++;
0333: }
0334: }
0335:
0336: return index;
0337: }
0338:
0339: /**
0340: * Handles 'CC' cases
0341: */
0342: private int handleCC(String value, DoubleMetaphoneResult result,
0343: int index) {
0344: if (contains(value, index + 2, 1, "I", "E", "H")
0345: && !contains(value, index + 2, 2, "HU")) {
0346: //-- "bellocchio" but not "bacchus" --//
0347: if ((index == 1 && charAt(value, index - 1) == 'A')
0348: || contains(value, index - 1, 5, "UCCEE", "UCCES")) {
0349: //-- "accident", "accede", "succeed" --//
0350: result.append("KS");
0351: } else {
0352: //-- "bacci", "bertucci", other Italian --//
0353: result.append('X');
0354: }
0355: index += 3;
0356: } else { // Pierce's rule
0357: result.append('K');
0358: index += 2;
0359: }
0360:
0361: return index;
0362: }
0363:
0364: /**
0365: * Handles 'CH' cases
0366: */
0367: private int handleCH(String value, DoubleMetaphoneResult result,
0368: int index) {
0369: if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
0370: result.append('K', 'X');
0371: return index + 2;
0372: } else if (conditionCH0(value, index)) {
0373: //-- Greek roots ("chemistry", "chorus", etc.) --//
0374: result.append('K');
0375: return index + 2;
0376: } else if (conditionCH1(value, index)) {
0377: //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
0378: result.append('K');
0379: return index + 2;
0380: } else {
0381: if (index > 0) {
0382: if (contains(value, 0, 2, "MC")) {
0383: result.append('K');
0384: } else {
0385: result.append('X', 'K');
0386: }
0387: } else {
0388: result.append('X');
0389: }
0390: return index + 2;
0391: }
0392: }
0393:
0394: /**
0395: * Handles 'D' cases
0396: */
0397: private int handleD(String value, DoubleMetaphoneResult result,
0398: int index) {
0399: if (contains(value, index, 2, "DG")) {
0400: //-- "Edge" --//
0401: if (contains(value, index + 2, 1, "I", "E", "Y")) {
0402: result.append('J');
0403: index += 3;
0404: //-- "Edgar" --//
0405: } else {
0406: result.append("TK");
0407: index += 2;
0408: }
0409: } else if (contains(value, index, 2, "DT", "DD")) {
0410: result.append('T');
0411: index += 2;
0412: } else {
0413: result.append('T');
0414: index++;
0415: }
0416: return index;
0417: }
0418:
0419: /**
0420: * Handles 'G' cases
0421: */
0422: private int handleG(String value, DoubleMetaphoneResult result,
0423: int index, boolean slavoGermanic) {
0424: if (charAt(value, index + 1) == 'H') {
0425: index = handleGH(value, result, index);
0426: } else if (charAt(value, index + 1) == 'N') {
0427: if (index == 1 && isVowel(charAt(value, 0))
0428: && !slavoGermanic) {
0429: result.append("KN", "N");
0430: } else if (!contains(value, index + 2, 2, "EY")
0431: && charAt(value, index + 1) != 'Y'
0432: && !slavoGermanic) {
0433: result.append("N", "KN");
0434: } else {
0435: result.append("KN");
0436: }
0437: index = index + 2;
0438: } else if (contains(value, index + 1, 2, "LI")
0439: && !slavoGermanic) {
0440: result.append("KL", "L");
0441: index += 2;
0442: } else if (index == 0
0443: && (charAt(value, index + 1) == 'Y' || contains(value,
0444: index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
0445: //-- -ges-, -gep-, -gel-, -gie- at beginning --//
0446: result.append('K', 'J');
0447: index += 2;
0448: } else if ((contains(value, index + 1, 2, "ER") || charAt(
0449: value, index + 1) == 'Y')
0450: && !contains(value, 0, 6, "DANGER", "RANGER", "MANGER")
0451: && !contains(value, index - 1, 1, "E", "I")
0452: && !contains(value, index - 1, 3, "RGY", "OGY")) {
0453: //-- -ger-, -gy- --//
0454: result.append('K', 'J');
0455: index += 2;
0456: } else if (contains(value, index + 1, 1, "E", "I", "Y")
0457: || contains(value, index - 1, 4, "AGGI", "OGGI")) {
0458: //-- Italian "biaggi" --//
0459: if ((contains(value, 0, 4, "VAN ", "VON ") || contains(
0460: value, 0, 3, "SCH"))
0461: || contains(value, index + 1, 2, "ET")) {
0462: //-- obvious germanic --//
0463: result.append('K');
0464: } else if (contains(value, index + 1, 4, "IER")) {
0465: result.append('J');
0466: } else {
0467: result.append('J', 'K');
0468: }
0469: index += 2;
0470: } else if (charAt(value, index + 1) == 'G') {
0471: index += 2;
0472: result.append('K');
0473: } else {
0474: index++;
0475: result.append('K');
0476: }
0477: return index;
0478: }
0479:
0480: /**
0481: * Handles 'GH' cases
0482: */
0483: private int handleGH(String value, DoubleMetaphoneResult result,
0484: int index) {
0485: if (index > 0 && !isVowel(charAt(value, index - 1))) {
0486: result.append('K');
0487: index += 2;
0488: } else if (index == 0) {
0489: if (charAt(value, index + 2) == 'I') {
0490: result.append('J');
0491: } else {
0492: result.append('K');
0493: }
0494: index += 2;
0495: } else if ((index > 1 && contains(value, index - 2, 1, "B",
0496: "H", "D"))
0497: || (index > 2 && contains(value, index - 3, 1, "B",
0498: "H", "D"))
0499: || (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
0500: //-- Parker's rule (with some further refinements) - "hugh"
0501: index += 2;
0502: } else {
0503: if (index > 2
0504: && charAt(value, index - 1) == 'U'
0505: && contains(value, index - 3, 1, "C", "G", "L",
0506: "R", "T")) {
0507: //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
0508: result.append('F');
0509: } else if (index > 0 && charAt(value, index - 1) != 'I') {
0510: result.append('K');
0511: }
0512: index += 2;
0513: }
0514: return index;
0515: }
0516:
0517: /**
0518: * Handles 'H' cases
0519: */
0520: private int handleH(String value, DoubleMetaphoneResult result,
0521: int index) {
0522: //-- only keep if first & before vowel or between 2 vowels --//
0523: if ((index == 0 || isVowel(charAt(value, index - 1)))
0524: && isVowel(charAt(value, index + 1))) {
0525: result.append('H');
0526: index += 2;
0527: //-- also takes car of "HH" --//
0528: } else {
0529: index++;
0530: }
0531: return index;
0532: }
0533:
0534: /**
0535: * Handles 'J' cases
0536: */
0537: private int handleJ(String value, DoubleMetaphoneResult result,
0538: int index, boolean slavoGermanic) {
0539: if (contains(value, index, 4, "JOSE")
0540: || contains(value, 0, 4, "SAN ")) {
0541: //-- obvious Spanish, "Jose", "San Jacinto" --//
0542: if ((index == 0 && (charAt(value, index + 4) == ' ') || value
0543: .length() == 4)
0544: || contains(value, 0, 4, "SAN ")) {
0545: result.append('H');
0546: } else {
0547: result.append('J', 'H');
0548: }
0549: index++;
0550: } else {
0551: if (index == 0 && !contains(value, index, 4, "JOSE")) {
0552: result.append('J', 'A');
0553: } else if (isVowel(charAt(value, index - 1))
0554: && !slavoGermanic
0555: && (charAt(value, index + 1) == 'A' || charAt(
0556: value, index + 1) == 'O')) {
0557: result.append('J', 'H');
0558: } else if (index == value.length() - 1) {
0559: result.append('J', ' ');
0560: } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z)
0561: && !contains(value, index - 1, 1, "S", "K", "L")) {
0562: result.append('J');
0563: }
0564:
0565: if (charAt(value, index + 1) == 'J') {
0566: index += 2;
0567: } else {
0568: index++;
0569: }
0570: }
0571: return index;
0572: }
0573:
0574: /**
0575: * Handles 'L' cases
0576: */
0577: private int handleL(String value, DoubleMetaphoneResult result,
0578: int index) {
0579: result.append('L');
0580: if (charAt(value, index + 1) == 'L') {
0581: if (conditionL0(value, index)) {
0582: result.appendAlternate(' ');
0583: }
0584: index += 2;
0585: } else {
0586: index++;
0587: }
0588: return index;
0589: }
0590:
0591: /**
0592: * Handles 'P' cases
0593: */
0594: private int handleP(String value, DoubleMetaphoneResult result,
0595: int index) {
0596: if (charAt(value, index + 1) == 'H') {
0597: result.append('F');
0598: index += 2;
0599: } else {
0600: result.append('P');
0601: index = contains(value, index + 1, 1, "P", "B") ? index + 2
0602: : index + 1;
0603: }
0604: return index;
0605: }
0606:
0607: /**
0608: * Handles 'R' cases
0609: */
0610: private int handleR(String value, DoubleMetaphoneResult result,
0611: int index, boolean slavoGermanic) {
0612: if (index == value.length() - 1 && !slavoGermanic
0613: && contains(value, index - 2, 2, "IE")
0614: && !contains(value, index - 4, 2, "ME", "MA")) {
0615: result.appendAlternate('R');
0616: } else {
0617: result.append('R');
0618: }
0619: return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
0620: }
0621:
0622: /**
0623: * Handles 'S' cases
0624: */
0625: private int handleS(String value, DoubleMetaphoneResult result,
0626: int index, boolean slavoGermanic) {
0627: if (contains(value, index - 1, 3, "ISL", "YSL")) {
0628: //-- special cases "island", "isle", "carlisle", "carlysle" --//
0629: index++;
0630: } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
0631: //-- special case "sugar-" --//
0632: result.append('X', 'S');
0633: index++;
0634: } else if (contains(value, index, 2, "SH")) {
0635: if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM",
0636: "HOLZ")) {
0637: //-- germanic --//
0638: result.append('S');
0639: } else {
0640: result.append('X');
0641: }
0642: index += 2;
0643: } else if (contains(value, index, 3, "SIO", "SIA")
0644: || contains(value, index, 4, "SIAN")) {
0645: //-- Italian and Armenian --//
0646: if (slavoGermanic) {
0647: result.append('S');
0648: } else {
0649: result.append('S', 'X');
0650: }
0651: index += 3;
0652: } else if ((index == 0 && contains(value, index + 1, 1, "M",
0653: "N", "L", "W"))
0654: || contains(value, index + 1, 1, "Z")) {
0655: //-- german & anglicisations, e.g. "smith" match "schmidt" //
0656: // "snider" match "schneider" --//
0657: //-- also, -sz- in slavic language altho in hungarian it //
0658: // is pronounced "s" --//
0659: result.append('S', 'X');
0660: index = contains(value, index + 1, 1, "Z") ? index + 2
0661: : index + 1;
0662: } else if (contains(value, index, 2, "SC")) {
0663: index = handleSC(value, result, index);
0664: } else {
0665: if (index == value.length() - 1
0666: && contains(value, index - 2, 2, "AI", "OI")) {
0667: //-- french e.g. "resnais", "artois" --//
0668: result.appendAlternate('S');
0669: } else {
0670: result.append('S');
0671: }
0672: index = contains(value, index + 1, 1, "S", "Z") ? index + 2
0673: : index + 1;
0674: }
0675: return index;
0676: }
0677:
0678: /**
0679: * Handles 'SC' cases
0680: */
0681: private int handleSC(String value, DoubleMetaphoneResult result,
0682: int index) {
0683: if (charAt(value, index + 2) == 'H') {
0684: //-- Schlesinger's rule --//
0685: if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY",
0686: "ED", "EM")) {
0687: //-- Dutch origin, e.g. "school", "schooner" --//
0688: if (contains(value, index + 3, 2, "ER", "EN")) {
0689: //-- "schermerhorn", "schenker" --//
0690: result.append("X", "SK");
0691: } else {
0692: result.append("SK");
0693: }
0694: } else {
0695: if (index == 0 && !isVowel(charAt(value, 3))
0696: && charAt(value, 3) != 'W') {
0697: result.append('X', 'S');
0698: } else {
0699: result.append('X');
0700: }
0701: }
0702: } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
0703: result.append('S');
0704: } else {
0705: result.append("SK");
0706: }
0707: return index + 3;
0708: }
0709:
0710: /**
0711: * Handles 'T' cases
0712: */
0713: private int handleT(String value, DoubleMetaphoneResult result,
0714: int index) {
0715: if (contains(value, index, 4, "TION")) {
0716: result.append('X');
0717: index += 3;
0718: } else if (contains(value, index, 3, "TIA", "TCH")) {
0719: result.append('X');
0720: index += 3;
0721: } else if (contains(value, index, 2, "TH")
0722: || contains(value, index, 3, "TTH")) {
0723: if (contains(value, index + 2, 2, "OM", "AM")
0724: ||
0725: //-- special case "thomas", "thames" or germanic --//
0726: contains(value, 0, 4, "VAN ", "VON ")
0727: || contains(value, 0, 3, "SCH")) {
0728: result.append('T');
0729: } else {
0730: result.append('0', 'T');
0731: }
0732: index += 2;
0733: } else {
0734: result.append('T');
0735: index = contains(value, index + 1, 1, "T", "D") ? index + 2
0736: : index + 1;
0737: }
0738: return index;
0739: }
0740:
0741: /**
0742: * Handles 'W' cases
0743: */
0744: private int handleW(String value, DoubleMetaphoneResult result,
0745: int index) {
0746: if (contains(value, index, 2, "WR")) {
0747: //-- can also be in middle of word --//
0748: result.append('R');
0749: index += 2;
0750: } else {
0751: if (index == 0
0752: && (isVowel(charAt(value, index + 1)) || contains(
0753: value, index, 2, "WH"))) {
0754: if (isVowel(charAt(value, index + 1))) {
0755: //-- Wasserman should match Vasserman --//
0756: result.append('A', 'F');
0757: } else {
0758: //-- need Uomo to match Womo --//
0759: result.append('A');
0760: }
0761: index++;
0762: } else if ((index == value.length() - 1 && isVowel(charAt(
0763: value, index - 1)))
0764: || contains(value, index - 1, 5, "EWSKI", "EWSKY",
0765: "OWSKI", "OWSKY")
0766: || contains(value, 0, 3, "SCH")) {
0767: //-- Arnow should match Arnoff --//
0768: result.appendAlternate('F');
0769: index++;
0770: } else if (contains(value, index, 4, "WICZ", "WITZ")) {
0771: //-- Polish e.g. "filipowicz" --//
0772: result.append("TS", "FX");
0773: index += 4;
0774: } else {
0775: index++;
0776: }
0777: }
0778: return index;
0779: }
0780:
0781: /**
0782: * Handles 'X' cases
0783: */
0784: private int handleX(String value, DoubleMetaphoneResult result,
0785: int index) {
0786: if (index == 0) {
0787: result.append('S');
0788: index++;
0789: } else {
0790: if (!((index == value.length() - 1) && (contains(value,
0791: index - 3, 3, "IAU", "EAU") || contains(value,
0792: index - 2, 2, "AU", "OU")))) {
0793: //-- French e.g. breaux --//
0794: result.append("KS");
0795: }
0796: index = contains(value, index + 1, 1, "C", "X") ? index + 2
0797: : index + 1;
0798: }
0799: return index;
0800: }
0801:
0802: /**
0803: * Handles 'Z' cases
0804: */
0805: private int handleZ(String value, DoubleMetaphoneResult result,
0806: int index, boolean slavoGermanic) {
0807: if (charAt(value, index + 1) == 'H') {
0808: //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
0809: result.append('J');
0810: index += 2;
0811: } else {
0812: if (contains(value, index + 1, 2, "ZO", "ZI", "ZA")
0813: || (slavoGermanic && (index > 0 && charAt(value,
0814: index - 1) != 'T'))) {
0815: result.append("S", "TS");
0816: } else {
0817: result.append('S');
0818: }
0819: index = charAt(value, index + 1) == 'Z' ? index + 2
0820: : index + 1;
0821: }
0822: return index;
0823: }
0824:
0825: //-- BEGIN CONDITIONS --//
0826:
0827: /**
0828: * Complex condition 0 for 'C'
0829: */
0830: private boolean conditionC0(String value, int index) {
0831: if (contains(value, index, 4, "CHIA")) {
0832: return true;
0833: } else if (index <= 1) {
0834: return false;
0835: } else if (isVowel(charAt(value, index - 2))) {
0836: return false;
0837: } else if (!contains(value, index - 1, 3, "ACH")) {
0838: return false;
0839: } else {
0840: char c = charAt(value, index + 2);
0841: return (c != 'I' && c != 'E')
0842: || contains(value, index - 2, 6, "BACHER", "MACHER");
0843: }
0844: }
0845:
0846: /**
0847: * Complex condition 0 for 'CH'
0848: */
0849: private boolean conditionCH0(String value, int index) {
0850: if (index != 0) {
0851: return false;
0852: } else if (!contains(value, index + 1, 5, "HARAC", "HARIS")
0853: && !contains(value, index + 1, 3, "HOR", "HYM", "HIA",
0854: "HEM")) {
0855: return false;
0856: } else if (contains(value, 0, 5, "CHORE")) {
0857: return false;
0858: } else {
0859: return true;
0860: }
0861: }
0862:
0863: /**
0864: * Complex condition 1 for 'CH'
0865: */
0866: private boolean conditionCH1(String value, int index) {
0867: return ((contains(value, 0, 4, "VAN ", "VON ") || contains(
0868: value, 0, 3, "SCH"))
0869: || contains(value, index - 2, 6, "ORCHES", "ARCHIT",
0870: "ORCHID")
0871: || contains(value, index + 2, 1, "T", "S") || ((contains(
0872: value, index - 1, 1, "A", "O", "U", "E") || index == 0) && (contains(
0873: value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value
0874: .length() - 1)));
0875: }
0876:
0877: /**
0878: * Complex condition 0 for 'L'
0879: */
0880: private boolean conditionL0(String value, int index) {
0881: if (index == value.length() - 3
0882: && contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
0883: return true;
0884: } else if ((contains(value, index - 1, 2, "AS", "OS") || contains(
0885: value, value.length() - 1, 1, "A", "O"))
0886: && contains(value, index - 1, 4, "ALLE")) {
0887: return true;
0888: } else {
0889: return false;
0890: }
0891: }
0892:
0893: /**
0894: * Complex condition 0 for 'M'
0895: */
0896: private boolean conditionM0(String value, int index) {
0897: if (charAt(value, index + 1) == 'M') {
0898: return true;
0899: }
0900: return contains(value, index - 1, 3, "UMB")
0901: && ((index + 1) == value.length() - 1 || contains(
0902: value, index + 2, 2, "ER"));
0903: }
0904:
0905: //-- BEGIN HELPER FUNCTIONS --//
0906:
0907: /**
0908: * Determines whether or not a value is of slavo-germanic orgin. A value is
0909: * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
0910: */
0911: private boolean isSlavoGermanic(String value) {
0912: return value.indexOf('W') > -1 || value.indexOf('K') > -1
0913: || value.indexOf("CZ") > -1
0914: || value.indexOf("WITZ") > -1;
0915: }
0916:
0917: /**
0918: * Determines whether or not a character is a vowel or not
0919: */
0920: private boolean isVowel(char ch) {
0921: return VOWELS.indexOf(ch) != -1;
0922: }
0923:
0924: /**
0925: * Determines whether or not the value starts with a silent letter. It will
0926: * return <code>true</code> if the value starts with any of 'GN', 'KN',
0927: * 'PN', 'WR' or 'PS'.
0928: */
0929: private boolean isSilentStart(String value) {
0930: boolean result = false;
0931: for (int i = 0; i < SILENT_START.length; i++) {
0932: if (value.startsWith(SILENT_START[i])) {
0933: result = true;
0934: break;
0935: }
0936: }
0937: return result;
0938: }
0939:
0940: /**
0941: * Cleans the input
0942: */
0943: private String cleanInput(String input) {
0944: if (input == null) {
0945: return null;
0946: }
0947: input = input.trim();
0948: if (input.length() == 0) {
0949: return null;
0950: }
0951: return input.toUpperCase();
0952: }
0953:
0954: /**
0955: * Gets the character at index <code>index</code> if available, otherwise
0956: * it returns <code>Character.MIN_VALUE</code> so that there is some sort
0957: * of a default
0958: */
0959: protected char charAt(String value, int index) {
0960: if (index < 0 || index >= value.length()) {
0961: return Character.MIN_VALUE;
0962: }
0963: return value.charAt(index);
0964: }
0965:
0966: /**
0967: * Shortcut method with 1 criteria
0968: */
0969: private static boolean contains(String value, int start,
0970: int length, String criteria) {
0971: return contains(value, start, length, new String[] { criteria });
0972: }
0973:
0974: /**
0975: * Shortcut method with 2 criteria
0976: */
0977: private static boolean contains(String value, int start,
0978: int length, String criteria1, String criteria2) {
0979: return contains(value, start, length, new String[] { criteria1,
0980: criteria2 });
0981: }
0982:
0983: /**
0984: * Shortcut method with 3 criteria
0985: */
0986: private static boolean contains(String value, int start,
0987: int length, String criteria1, String criteria2,
0988: String criteria3) {
0989: return contains(value, start, length, new String[] { criteria1,
0990: criteria2, criteria3 });
0991: }
0992:
0993: /**
0994: * Shortcut method with 4 criteria
0995: */
0996: private static boolean contains(String value, int start,
0997: int length, String criteria1, String criteria2,
0998: String criteria3, String criteria4) {
0999: return contains(value, start, length, new String[] { criteria1,
1000: criteria2, criteria3, criteria4 });
1001: }
1002:
1003: /**
1004: * Shortcut method with 5 criteria
1005: */
1006: private static boolean contains(String value, int start,
1007: int length, String criteria1, String criteria2,
1008: String criteria3, String criteria4, String criteria5) {
1009: return contains(value, start, length, new String[] { criteria1,
1010: criteria2, criteria3, criteria4, criteria5 });
1011: }
1012:
1013: /**
1014: * Shortcut method with 6 criteria
1015: */
1016: private static boolean contains(String value, int start,
1017: int length, String criteria1, String criteria2,
1018: String criteria3, String criteria4, String criteria5,
1019: String criteria6) {
1020: return contains(value, start, length, new String[] { criteria1,
1021: criteria2, criteria3, criteria4, criteria5, criteria6 });
1022: }
1023:
1024: /**
1025: * Determines whether <code>value</code> contains any of the criteria
1026: starting
1027: * at index <code>start</code> and matching up to length <code>length</code>
1028: */
1029: protected static boolean contains(String value, int start,
1030: int length, String[] criteria) {
1031: boolean result = false;
1032: if (start >= 0 && start + length <= value.length()) {
1033: String target = value.substring(start, start + length);
1034:
1035: for (int i = 0; i < criteria.length; i++) {
1036: if (target.equals(criteria[i])) {
1037: result = true;
1038: break;
1039: }
1040: }
1041: }
1042: return result;
1043: }
1044:
1045: //-- BEGIN INNER CLASSES --//
1046:
1047: /**
1048: * Inner class for storing results, since there is the optional alternate
1049: * encoding.
1050: */
1051: public class DoubleMetaphoneResult {
1052:
1053: private StringBuffer primary = new StringBuffer(getMaxCodeLen());
1054: private StringBuffer alternate = new StringBuffer(
1055: getMaxCodeLen());
1056: private int maxLength;
1057:
1058: public DoubleMetaphoneResult(int maxLength) {
1059: this .maxLength = maxLength;
1060: }
1061:
1062: public void append(char value) {
1063: appendPrimary(value);
1064: appendAlternate(value);
1065: }
1066:
1067: public void append(char primary, char alternate) {
1068: appendPrimary(primary);
1069: appendAlternate(alternate);
1070: }
1071:
1072: public void appendPrimary(char value) {
1073: if (this .primary.length() < this .maxLength) {
1074: this .primary.append(value);
1075: }
1076: }
1077:
1078: public void appendAlternate(char value) {
1079: if (this .alternate.length() < this .maxLength) {
1080: this .alternate.append(value);
1081: }
1082: }
1083:
1084: public void append(String value) {
1085: appendPrimary(value);
1086: appendAlternate(value);
1087: }
1088:
1089: public void append(String primary, String alternate) {
1090: appendPrimary(primary);
1091: appendAlternate(alternate);
1092: }
1093:
1094: public void appendPrimary(String value) {
1095: int addChars = this .maxLength - this .primary.length();
1096: if (value.length() <= addChars) {
1097: this .primary.append(value);
1098: } else {
1099: this .primary.append(value.substring(0, addChars));
1100: }
1101: }
1102:
1103: public void appendAlternate(String value) {
1104: int addChars = this .maxLength - this .alternate.length();
1105: if (value.length() <= addChars) {
1106: this .alternate.append(value);
1107: } else {
1108: this .alternate.append(value.substring(0, addChars));
1109: }
1110: }
1111:
1112: public String getPrimary() {
1113: return this .primary.toString();
1114: }
1115:
1116: public String getAlternate() {
1117: return this .alternate.toString();
1118: }
1119:
1120: public boolean isComplete() {
1121: return this.primary.length() >= this.maxLength
1122: && this.alternate.length() >= this.maxLength;
1123: }
1124: }
1125: }
|