001: /*
002: * Copyright 2001-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.commons.codec.language;
018:
019: import org.apache.commons.codec.EncoderException;
020: import org.apache.commons.codec.StringEncoder;
021:
022: /**
023: * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
024: *
025: * @author Apache Software Foundation
026: * @version $Id: SoundexUtils.java,v 1.5 2004/03/17 18:31:35 ggregory Exp $
027: * @since 1.3
028: */
029: final class SoundexUtils {
030:
031: /**
032: * Cleans up the input string before Soundex processing by only returning
033: * upper case letters.
034: *
035: * @param str
036: * The String to clean.
037: * @return A clean String.
038: */
039: static String clean(String str) {
040: if (str == null || str.length() == 0) {
041: return str;
042: }
043: int len = str.length();
044: char[] chars = new char[len];
045: int count = 0;
046: for (int i = 0; i < len; i++) {
047: if (Character.isLetter(str.charAt(i))) {
048: chars[count++] = str.charAt(i);
049: }
050: }
051: if (count == len) {
052: return str.toUpperCase();
053: }
054: return new String(chars, 0, count).toUpperCase();
055: }
056:
057: /**
058: * Encodes the Strings and returns the number of characters in the two
059: * encoded Strings that are the same.
060: * <ul>
061: * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
062: * little or no similarity, and 4 indicates strong similarity or identical
063: * values.</li>
064: * <li>For refined Soundex, the return value can be greater than 4.</li>
065: * </ul>
066: *
067: * @param encoder
068: * The encoder to use to encode the Strings.
069: * @param s1
070: * A String that will be encoded and compared.
071: * @param s2
072: * A String that will be encoded and compared.
073: * @return The number of characters in the two Soundex encoded Strings that
074: * are the same.
075: *
076: * @see #differenceEncoded(String,String)
077: * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
078: * MS T-SQL DIFFERENCE</a>
079: *
080: * @throws EncoderException
081: * if an error occurs encoding one of the strings
082: */
083: static int difference(StringEncoder encoder, String s1, String s2)
084: throws EncoderException {
085: return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
086: }
087:
088: /**
089: * Returns the number of characters in the two Soundex encoded Strings that
090: * are the same.
091: * <ul>
092: * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
093: * little or no similarity, and 4 indicates strong similarity or identical
094: * values.</li>
095: * <li>For refined Soundex, the return value can be greater than 4.</li>
096: * </ul>
097: *
098: * @param es1
099: * An encoded String.
100: * @param es2
101: * An encoded String.
102: * @return The number of characters in the two Soundex encoded Strings that
103: * are the same.
104: *
105: * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
106: * MS T-SQL DIFFERENCE</a>
107: */
108: static int differenceEncoded(String es1, String es2) {
109:
110: if (es1 == null || es2 == null) {
111: return 0;
112: }
113: int lengthToMatch = Math.min(es1.length(), es2.length());
114: int diff = 0;
115: for (int i = 0; i < lengthToMatch; i++) {
116: if (es1.charAt(i) == es2.charAt(i)) {
117: diff++;
118: }
119: }
120: return diff;
121: }
122:
123: }
|