001: package org.incava.text;
002:
003: import java.io.*;
004: import java.util.*;
005: import org.incava.io.FileExt;
006:
007: /**
008: * Calculates the edit distance between two strings.
009: */
010: public class SpellChecker {
011: public static final int DEFAULT_MAX_DISTANCE = 4;
012:
013: protected static final int COMP_LEN = 20;
014:
015: protected static final int ARR_SIZE = COMP_LEN + 1;
016:
017: private Map _words = new HashMap();
018:
019: /**
020: * Computes the Levenstein edit distance between the two words, with a
021: * maximum of 3, at which point the distance is no longer computed.
022: */
023: public int editDistance(String str1, String str2) {
024: return editDistance(str1, str2, 3);
025: }
026:
027: /**
028: * Computes the Levenstein edit distance between the two words.
029: */
030: public int editDistance(String str1, String str2, int maximum) {
031: int len1 = Math.min(str1.length(), COMP_LEN);
032: int len2 = Math.min(str2.length(), COMP_LEN);
033:
034: // A minimum threshold of three is used for better results with short
035: // strings (A modification to the original C code.)
036:
037: int threshold = Math.max(maximum, (int) Math.floor((double) 1
038: + (len1 + 2) / 4.0));
039:
040: int diff = Math.abs(len1 - len2);
041: if (diff > threshold) {
042: return -1 * diff;
043: } else {
044: return compare(str1, len1, str2, len2);
045: }
046: }
047:
048: public boolean nearMatch(String str1, String str2) {
049: int edist = editDistance(str1, str2);
050:
051: // the edit distance is misleading for very short words, so it must be
052: // no more than the length of either word:
053: return edist >= 0 && edist <= DEFAULT_MAX_DISTANCE
054: && edist < str1.length() && edist < str2.length();
055: }
056:
057: /**
058: * Adds the given dictionary. Returns whether it could be read and had content.
059: */
060: public boolean addDictionary(String dictionary) {
061: tr.Ace.log("adding dictionary: " + dictionary);
062:
063: try {
064: BufferedReader br = new BufferedReader(new FileReader(
065: dictionary));
066:
067: while (true) {
068: String word = br.readLine();
069: if (word == null) {
070: break;
071: } else {
072: addWord(word);
073: }
074: }
075: return true;
076: } catch (IOException ioe) {
077: tr.Ace.log("ioe", ioe);
078: return false;
079: }
080: }
081:
082: public String getKey(String word) {
083: char[] chars = word.toCharArray();
084: if (chars.length > 1) {
085: char c0 = chars[0];
086: char c1 = chars[1];
087: if (c0 > c1) {
088: char t = c0;
089: c0 = c1;
090: c1 = t;
091: }
092: return "" + c0 + c1;
093: } else if (chars.length > 0) {
094: return "" + chars[0];
095: } else {
096: return null;
097: }
098: }
099:
100: public void addWord(String word) {
101: String key = getKey(word);
102: List atLtrs = (List) _words.get(key);
103: if (atLtrs == null) {
104: atLtrs = new ArrayList();
105: _words.put(key, atLtrs);
106: }
107: atLtrs.add(word);
108: }
109:
110: public boolean hasWord(String word) {
111: String key = getKey(word);
112: List atLtrs = (List) _words.get(key);
113: return atLtrs != null && atLtrs.contains(word);
114: }
115:
116: /**
117: * @param nearMatches a map from edit distances to matches.
118: */
119: public boolean isCorrect(String word, int maxEditDistance,
120: Map nearMatches) {
121: if (hasWord(word)) {
122: return true;
123: } else if (nearMatches != null) {
124: char[] wordChars = word.toCharArray();
125: int wordLen = wordChars.length;
126: String key = getKey(word);
127: List wds = (List) _words.get(key);
128:
129: if (wds != null) {
130: Iterator wit = wds.iterator();
131: while (wit.hasNext()) {
132: String w = (String) wit.next();
133:
134: int ed = editDistance(word, w, maxEditDistance);
135: if (ed >= 0 && ed <= maxEditDistance) {
136: Integer eDist = new Integer(ed);
137: List matches = (List) nearMatches.get(eDist);
138: if (matches == null) {
139: matches = new ArrayList();
140: nearMatches.put(eDist, matches);
141: }
142: matches.add(w);
143: }
144: }
145: }
146: }
147: return false;
148: }
149:
150: public boolean isCorrect(String word, Map nearMatches) {
151: return isCorrect(word, DEFAULT_MAX_DISTANCE, nearMatches);
152: }
153:
154: /**
155: * Compares the two characters. English words should probably be case
156: * insensitive; code should not.
157: */
158: protected int compare(String str1, int len1, String str2, int len2) {
159: final int ADDITION = 1;
160: final int CHANGE = 2;
161: final int DELETION = 1;
162:
163: int[][] distance = new int[ARR_SIZE][ARR_SIZE];
164: distance[0][0] = 0;
165:
166: for (int j = 1; j < ARR_SIZE; ++j) {
167: distance[0][j] = distance[0][j - 1] + ADDITION;
168: distance[j][0] = distance[j - 1][0] + DELETION;
169: }
170:
171: for (int i = 1; i <= len1; ++i) {
172: for (int j = 1; j <= len2; ++j) {
173: distance[i][j] = min3(distance[i - 1][j - 1]
174: + (str1.charAt(i - 1) == str2.charAt(j - 1) ? 0
175: : CHANGE), distance[i][j - 1]
176: + ADDITION, distance[i - 1][j] + DELETION);
177: }
178: }
179:
180: return distance[len1][len2];
181: }
182:
183: protected static int min3(int x, int y, int z) {
184: return (x < y) ? (x < z ? x : z) : (y < z ? y : z);
185: }
186:
187: }
|