001: package snow.texteditor;
002:
003: import java.util.*;
004:
005: /** Character edit-distance: find the edit distance (also called Levenstein distance)
006: between two strings.
007: contain the edit distance and the editex variant (combine edit and soundex phonix)
008: */
009: public final class EditDistance {
010: private EditDistance() {
011: }
012:
013: /** same as r(a,b) zero if a and b are identical 1 if a and b in the same group, 2 otherwise
014: */
015: static int reditexEnglish(int a, int b) {
016: if (a == b)
017: return 0;
018: if ((editexGroupENGLISH((char) a) & editexGroupENGLISH((char) b)) != 0)
019: return 1;
020: return 2;
021: }
022:
023: static int d(int a, int b) {
024: if (a == 'H' && b == 'W' && a != b)
025: return 1;
026: if (a == 'W' && b == 'H' && a != b)
027: return 1;
028: return reditexEnglish(a, b);
029: }
030:
031: /** @return the editex group.
032: Caution: some letters appears twice.
033: use bitwise operator and to determine matchings
034: group1 & group2 is not zero if class match
035: */
036: private static int editexGroupENGLISH(char c) {
037: int rep = 0;
038: if ("AEIOUY".indexOf(c) != -1)
039: rep += 1;
040: if ("BP".indexOf(c) != -1)
041: rep += 2;
042: if ("CKQ".indexOf(c) != -1)
043: rep += 4;
044: if ("DT".indexOf(c) != -1)
045: rep += 8;
046: if ("LR".indexOf(c) != -1)
047: rep += 16;
048: if ("MN".indexOf(c) != -1)
049: rep += 32;
050: if ("GJ".indexOf(c) != -1)
051: rep += 64;
052: if ("FPV".indexOf(c) != -1)
053: rep += 128;
054: if ("SXZ".indexOf(c) != -1)
055: rep += 256;
056: if ("CSZ".indexOf(c) != -1)
057: rep += 512;
058: return rep;
059: }
060:
061: /** combine edit and soundex
062: w1 must be already uppercased
063: */
064: public static int editexDistanceEnglish(String w1, String w2,
065: int tolerance) {
066: // boost??
067: if (Math.abs(w1.length() - w2.length()) > tolerance)
068: return tolerance + 1; // refused
069:
070: return editexEnglish(w1.length(), w2.length(), w1, w2
071: .toUpperCase());
072: }
073:
074: /** combine edit and soundex for English
075: */
076: static int editexEnglish(int i, int j, final String w1,
077: final String w2) {
078: if (i == 0 && j == 0)
079: return 0;
080: if (i == 0)
081: return editexEnglish(0, j - 1, w1, w2)
082: + (j < w2.length() ? d(w2.charAt(j - 1), w2
083: .charAt(j)) : 0);
084: if (j == 0)
085: return editexEnglish(i - 1, 0, w1, w2)
086: + (i < w1.length() ? d(w1.charAt(i - 1), w1
087: .charAt(i)) : 0);
088: int p1 = editexEnglish(i - 1, j - 1, w1, w2)
089: + reditexEnglish(w1.charAt(i - 1), w2.charAt(j - 1));
090: if (p1 == 0)
091: return 0;
092: int p2 = editexEnglish(i - 1, j, w1, w2)
093: + (i < w1.length() ? d(w1.charAt(i - 1), w1.charAt(i))
094: : 0);
095: int p3 = editexEnglish(i, j - 1, w1, w2)
096: + (j < w2.length() ? d(w2.charAt(j - 1), w2.charAt(j))
097: : 0);
098:
099: return Math.min(Math.min(p2, p3), p1);
100: }
101:
102: private static int min(int a, int b, int c) {
103: int mi = a;
104: if (b < mi) {
105: mi = b;
106: }
107: if (c < mi) {
108: mi = c;
109: }
110: return mi;
111: }
112:
113: /** @return the edit distance, also called Levenshtein distance.
114: is the number of inserts, deletes to transform string s into string t
115: */
116: public static int editDistance(String s, String t, int maxDistance) {
117: int n = s.length();
118: int m = t.length();
119: if (n == 0)
120: return m;
121: if (m == 0)
122: return n;
123: if (n - m > maxDistance)
124: return n - m; // Boost for great unequal length
125: if (m - n > maxDistance)
126: return m - n;
127:
128: int d[][] = new int[n + 1][m + 1];
129:
130: for (int i = 0; i <= n; i++)
131: d[i][0] = i;
132: for (int j = 0; j <= m; j++)
133: d[0][j] = j;
134:
135: for (int i = 1; i <= n; i++) {
136: char s_i = s.charAt(i - 1);
137: for (int j = 1; j <= m; j++) {
138: d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1,
139: d[i - 1][j - 1]
140: + (s_i == t.charAt(j - 1) ? 0 : 1));
141: }
142: }
143:
144: return d[n][m];
145: }
146:
147: public static String getRandomString(int len) {
148: StringBuffer sb = new StringBuffer();
149: for (int i = 0; i < len; i++) {
150: sb.append((char) ('A' + ((int) (Math.random() * 25))));
151: }
152: return sb.toString();
153: }
154:
155: public static void main(String[] aaa) {
156: System.out.println(editDistance("JOERG", "JEORG", 6));
157: System.out.println(editDistance("WATER", "WINE", 6));
158: System.out.println(editDistance("WIINE", "WINE", 6));
159: System.out.println(getRandomString(12));
160: System.out.println(getRandomString(6));
161:
162: // 10 sec for strings gen and 10 sec for distance comp for 2 milion iterations on a P1000/NT4/Jdk1.3.1
163: long t0 = new Date().getTime();
164: int n = 2000000;
165: for (int i = 0; i < n; i++) {
166: int d;
167: String s1 = getRandomString((int) (2 + Math.random() * 8));
168: String s2 = getRandomString((int) (2 + Math.random() * 8));
169: d = editDistance(s1, s2, 2);
170: }
171: long t1 = new Date().getTime();
172: System.out.println("time for " + n + " edit distances = "
173: + (t1 - t0) / 1000.0 + " s");
174:
175: /* System.out.println( reditex('A','E'));
176: System.out.println( reditex('S','X'));
177: System.out.println( reditex('Z','S'));
178: System.out.println( reditex('S','A'));
179: System.out.println( reditex('A','A'));*/
180:
181: }
182: }
|