001: package Schmortopf.Utility.Text;
002:
003: import java.util.*;
004:
005: /** Character edit-distance: find the edit distance (also called Levenstein distance)
006: between two strings.
007: contain the edit distance and the editex variant (combine edit and soundex phonix)
008: */
009: public class EditDistance {
010: private EditDistance() {
011: }
012:
013: /** zero if a and b are identical 1 otherwise
014: *
015: static int redit(int a, int b)
016: {
017: if(a==b) return 0;
018: return 1;
019: }*/
020:
021: /** same as r(a,b) zero if a and b are identical 1 if a and b in the same group, 2 otherwise
022: */
023: static int reditexEnglish(int a, int b) {
024: if (a == b)
025: return 0;
026: if ((editexGroupENGLISH((char) a) & editexGroupENGLISH((char) b)) != 0)
027: return 1;
028: return 2;
029: }
030:
031: static int d(int a, int b) {
032: if (a == 'H' && b == 'W' && a != b)
033: return 1;
034: if (a == 'W' && b == 'H' && a != b)
035: return 1;
036: return reditexEnglish(a, b);
037: }
038:
039: /** @return the editex group.
040: Caution: some letters appears twice.
041: use bitwise operator and to determine matchings
042: group1 & group2 is not zero if class match
043: */
044: private static int editexGroupENGLISH(char c) {
045: int rep = 0;
046: if ("AEIOUY".indexOf(c) != -1)
047: rep += 1;
048: if ("BP".indexOf(c) != -1)
049: rep += 2;
050: if ("CKQ".indexOf(c) != -1)
051: rep += 4;
052: if ("DT".indexOf(c) != -1)
053: rep += 8;
054: if ("LR".indexOf(c) != -1)
055: rep += 16;
056: if ("MN".indexOf(c) != -1)
057: rep += 32;
058: if ("GJ".indexOf(c) != -1)
059: rep += 64;
060: if ("FPV".indexOf(c) != -1)
061: rep += 128;
062: if ("SXZ".indexOf(c) != -1)
063: rep += 256;
064: if ("CSZ".indexOf(c) != -1)
065: rep += 512;
066: return rep;
067: }
068:
069: /** combine edit and soundex
070: w1 must be already uppercased
071: */
072: public static int EditexDistanceEnglish(String w1, String w2,
073: int tolerance) {
074: // boost??
075: if (Math.abs(w1.length() - w2.length()) > tolerance)
076: return tolerance + 1; // refused
077:
078: return editexEnglish(w1.length(), w2.length(), w1, w2
079: .toUpperCase());
080: }
081:
082: /** combine edit and soundex for English
083: */
084: static int editexEnglish(int i, int j, final String w1,
085: final String w2) {
086: if (i == 0 && j == 0)
087: return 0;
088: if (i == 0)
089: return editexEnglish(0, j - 1, w1, w2)
090: + (j < w2.length() ? d(w2.charAt(j - 1), w2
091: .charAt(j)) : 0);
092: if (j == 0)
093: return editexEnglish(i - 1, 0, w1, w2)
094: + (i < w1.length() ? d(w1.charAt(i - 1), w1
095: .charAt(i)) : 0);
096: int p1 = editexEnglish(i - 1, j - 1, w1, w2)
097: + reditexEnglish(w1.charAt(i - 1), w2.charAt(j - 1));
098: if (p1 == 0)
099: return 0;
100: int p2 = editexEnglish(i - 1, j, w1, w2)
101: + (i < w1.length() ? d(w1.charAt(i - 1), w1.charAt(i))
102: : 0);
103: int p3 = editexEnglish(i, j - 1, w1, w2)
104: + (j < w2.length() ? d(w2.charAt(j - 1), w2.charAt(j))
105: : 0);
106:
107: return Math.min(Math.min(p2, p3), p1);
108: }
109:
110: private static int min(int a, int b, int c) {
111: int mi = a;
112: if (b < mi) {
113: mi = b;
114: }
115: if (c < mi) {
116: mi = c;
117: }
118: return mi;
119: }
120:
121: /** @return the edit distance, also called Levenshtein distance.
122: is the number of inserts, deletes to transform string s into string t
123: */
124: public static int EditDistance(String s, String t, int maxDistance) {
125: int n = s.length();
126: int m = t.length();
127: if (n == 0)
128: return m;
129: if (m == 0)
130: return n;
131: if (n - m > maxDistance)
132: return n - m; // Boost for great unequal length
133: if (m - n > maxDistance)
134: return m - n;
135:
136: int d[][] = new int[n + 1][m + 1];
137:
138: for (int i = 0; i <= n; i++)
139: d[i][0] = i;
140: for (int j = 0; j <= m; j++)
141: d[0][j] = j;
142:
143: for (int i = 1; i <= n; i++) {
144: char s_i = s.charAt(i - 1);
145: for (int j = 1; j <= m; j++) {
146: d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1,
147: d[i - 1][j - 1]
148: + (s_i == t.charAt(j - 1) ? 0 : 1));
149: }
150: }
151:
152: return d[n][m];
153: }
154:
155: public static String getRandomString(int len) {
156: StringBuffer sb = new StringBuffer();
157: for (int i = 0; i < len; i++) {
158: sb.append((char) ('A' + ((int) (Math.random() * 25))));
159: }
160: return sb.toString();
161: }
162:
163: public static void test() {
164: System.out.println(EditDistance("JOERG", "JEORG", 6));
165: System.out.println(EditDistance("WATER", "WINE", 6));
166: System.out.println(EditDistance("WIINE", "WINE", 6));
167: System.out.println(getRandomString(12));
168: System.out.println(getRandomString(6));
169:
170: // 10 sec for strings gen and 10 sec for distance comp for 2 milion iterations on a P1000/NT4/Jdk1.3.1
171: long t0 = new Date().getTime();
172: for (int i = 0; i < 2000000; i++) {
173: int d;
174: String s1 = getRandomString((int) (2 + Math.random() * 8));
175: String s2 = getRandomString((int) (2 + Math.random() * 8));
176: //d = EditDistance(s1,s2,3);
177: }
178: long t1 = new Date().getTime();
179: System.out.println("t=" + (t1 - t0) / 1000.0);
180:
181: /* System.out.println( reditex('A','E'));
182: System.out.println( reditex('S','X'));
183: System.out.println( reditex('Z','S'));
184: System.out.println( reditex('S','A'));
185: System.out.println( reditex('A','A'));*/
186:
187: }
188: }
|