001: /*
002: * Copyright 2001-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.commons.codec.language;
018:
019: import junit.framework.Test;
020: import junit.framework.TestSuite;
021: import org.apache.commons.codec.EncoderException;
022: import org.apache.commons.codec.StringEncoder;
023: import org.apache.commons.codec.StringEncoderAbstractTest;
024:
025: /**
026: * Tests {@link DoubleMetaphone}.
027: *
028: * @see "http://www.cuj.com/documents/s=8038/cuj0006philips/"
029: * @author Apache Software Foundation
030: * @version $Id: DoubleMetaphoneTest.java,v 1.9 2004/04/13 23:15:40 ggregory Exp $
031: */
032: public class DoubleMetaphoneTest extends StringEncoderAbstractTest {
033:
/**
 * Test data from http://aspell.sourceforge.net/test/batch0.tab.
 *
 * "Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org). Verbatim copying
 * and distribution of this entire article is permitted in any medium,
 * provided this notice is preserved."
 *
 * The data from that file was massaged into the array below: each entry
 * is a two-element pair of a (typically misspelled) word followed by its
 * intended spelling.
 */
043: private static final String[][] FIXTURE = {
044: { "Accosinly", "Occasionally" },
045: { "Ciculer", "Circler" },
046: { "Circue", "Circle" },
047: { "Maddness", "Madness" },
048: { "Occusionaly", "Occasionally" },
049: { "Steffen", "Stephen" },
050: { "Thw", "The" },
051: { "Unformanlly", "Unfortunately" },
052: { "Unfortally", "Unfortunately" },
053: { "abilitey", "ability" },
054: { "abouy", "about" },
055: { "absorbtion", "absorption" },
056: { "accidently", "accidentally" },
057: { "accomodate", "accommodate" },
058: { "acommadate", "accommodate" },
059: { "acord", "accord" },
060: { "adultry", "adultery" },
061: { "aggresive", "aggressive" },
062: { "alchohol", "alcohol" },
063: { "alchoholic", "alcoholic" },
064: { "allieve", "alive" },
065: { "alot", "a lot" },
066: { "alright", "all right" },
067: { "amature", "amateur" },
068: { "ambivilant", "ambivalent" },
069: { "amification", "amplification" },
070: { "amourfous", "amorphous" },
071: { "annoint", "anoint" },
072: { "annonsment", "announcement" },
073: { "annoyting", "anting" },
074: { "annuncio", "announce" },
075: { "anonomy", "anatomy" },
076: { "anotomy", "anatomy" },
077: { "antidesestablishmentarianism",
078: "antidisestablishmentarianism" },
079: { "antidisestablishmentarism",
080: "antidisestablishmentarianism" },
081: { "anynomous", "anonymous" }, { "appelet", "applet" },
082: { "appreceiated", "appreciated" },
083: { "appresteate", "appreciate" },
084: { "aquantance", "acquaintance" },
085: { "aratictature", "architecture" },
086: { "archeype", "archetype" },
087: { "aricticure", "architecture" }, { "artic", "arctic" },
088: { "asentote", "asymptote" }, { "ast", "at" },
089: { "asterick", "asterisk" }, { "asymetric", "asymmetric" },
090: { "atentively", "attentively" },
091: { "autoamlly", "automatically" },
092: { "bankrot", "bankrupt" }, { "basicly", "basically" },
093: { "batallion", "battalion" }, { "bbrose", "browse" },
094: { "beauro", "bureau" }, { "beaurocracy", "bureaucracy" },
095: { "beggining", "beginning" }, { "beging", "beginning" },
096: { "behaviour", "behavior" }, { "beleive", "believe" },
097: { "belive", "believe" }, { "benidifs", "benefits" },
098: { "bigginging", "beginning" }, { "blait", "bleat" },
099: { "bouyant", "buoyant" }, { "boygot", "boycott" },
100: { "brocolli", "broccoli" }, { "buch", "bush" },
101: { "buder", "butter" }, { "budr", "butter" },
102: { "budter", "butter" }, { "buracracy", "bureaucracy" },
103: { "burracracy", "bureaucracy" }, { "buton", "button" },
104: { "byby", "by by" }, { "cauler", "caller" },
105: { "ceasar", "caesar" }, { "cemetary", "cemetery" },
106: { "changeing", "changing" }, { "cheet", "cheat" },
107: { "cicle", "circle" }, { "cimplicity", "simplicity" },
108: { "circumstaces", "circumstances" }, { "clob", "club" },
109: { "coaln", "colon" }, { "cocamena", "cockamamie" },
110: { "colleaque", "colleague" },
111: { "colloquilism", "colloquialism" },
112: { "columne", "column" }, { "comiler", "compiler" },
113: { "comitmment", "commitment" }, { "comitte", "committee" },
114: { "comittmen", "commitment" },
115: { "comittmend", "commitment" },
116: { "commerciasl", "commercials" },
117: { "commited", "committed" }, { "commitee", "committee" },
118: { "companys", "companies" },
119: { "compicated", "complicated" },
120: { "comupter", "computer" }, { "concensus", "consensus" },
121: { "confusionism", "confucianism" },
122: { "congradulations", "congratulations" },
123: { "conibation", "contribution" },
124: { "consident", "consistent" },
125: { "consident", "consonant" }, { "contast", "constant" },
126: { "contastant", "constant" }, { "contunie", "continue" },
127: { "cooly", "coolly" }, { "copping", "coping" },
128: { "cosmoplyton", "cosmopolitan" }, { "courst", "court" },
129: { "crasy", "crazy" }, { "cravets", "caveats" },
130: { "credetability", "credibility" },
131: { "criqitue", "critique" }, { "croke", "croak" },
132: { "crucifiction", "crucifixion" },
133: { "crusifed", "crucified" }, { "ctitique", "critique" },
134: { "cumba", "combo" }, { "custamisation", "customization" },
135: { "dag", "dog" }, { "daly", "daily" },
136: { "danguages", "dangerous" }, { "deaft", "draft" },
137: { "defence", "defense" }, { "defenly", "defiantly" },
138: { "definate", "definite" }, { "definately", "definitely" },
139: { "dependeble", "dependable" },
140: { "descrption", "description" },
141: { "descrptn", "description" },
142: { "desparate", "desperate" }, { "dessicate", "desiccate" },
143: { "destint", "distant" },
144: { "develepment", "developments" },
145: { "developement", "development" },
146: { "develpond", "development" }, { "devulge", "divulge" },
147: { "diagree", "disagree" }, { "dieties", "deities" },
148: { "dinasaur", "dinosaur" }, { "dinasour", "dinosaur" },
149: { "direcyly", "directly" }, { "discuess", "discuss" },
150: { "disect", "dissect" }, { "disippate", "dissipate" },
151: { "disition", "decision" }, { "dispair", "despair" },
152: { "disssicion", "discussion" }, { "distarct", "distract" },
153: { "distart", "distort" }, { "distroy", "destroy" },
154: { "documtations", "documentation" },
155: { "doenload", "download" }, { "dongle", "dangle" },
156: { "doog", "dog" }, { "dramaticly", "dramatically" },
157: { "drunkeness", "drunkenness" },
158: { "ductioneery", "dictionary" }, { "dur", "due" },
159: { "duren", "during" }, { "dymatic", "dynamic" },
160: { "dynaic", "dynamic" }, { "ecstacy", "ecstasy" },
161: { "efficat", "efficient" }, { "efficity", "efficacy" },
162: { "effots", "efforts" }, { "egsistence", "existence" },
163: { "eitiology", "etiology" }, { "elagent", "elegant" },
164: { "elligit", "elegant" }, { "embarass", "embarrass" },
165: { "embarassment", "embarrassment" },
166: { "embaress", "embarrass" },
167: { "encapsualtion", "encapsulation" },
168: { "encyclapidia", "encyclopedia" },
169: { "encyclopia", "encyclopedia" }, { "engins", "engine" },
170: { "enhence", "enhance" },
171: { "enligtment", "Enlightenment" }, { "ennuui", "ennui" },
172: { "enought", "enough" }, { "enventions", "inventions" },
173: { "envireminakl", "environmental" },
174: { "enviroment", "environment" }, { "epitomy", "epitome" },
175: { "equire", "acquire" }, { "errara", "error" },
176: { "erro", "error" }, { "evaualtion", "evaluation" },
177: { "evething", "everything" }, { "evtually", "eventually" },
178: { "excede", "exceed" }, { "excercise", "exercise" },
179: { "excpt", "except" }, { "excution", "execution" },
180: { "exhileration", "exhilaration" },
181: { "existance", "existence" }, { "expleyly", "explicitly" },
182: { "explity", "explicitly" }, { "expresso", "espresso" },
183: { "exspidient", "expedient" }, { "extions", "extensions" },
184: { "factontion", "factorization" }, { "failer", "failure" },
185: { "famdasy", "fantasy" }, { "faver", "favor" },
186: { "faxe", "fax" }, { "febuary", "february" },
187: { "firey", "fiery" }, { "fistival", "festival" },
188: { "flatterring", "flattering" }, { "fluk", "flux" },
189: { "flukse", "flux" }, { "fone", "phone" },
190: { "forsee", "foresee" }, { "frustartaion", "frustrating" },
191: { "fuction", "function" }, { "funetik", "phonetic" },
192: { "futs", "guts" }, { "gamne", "came" },
193: { "gaurd", "guard" }, { "generly", "generally" },
194: { "ghandi", "gandhi" }, { "goberment", "government" },
195: { "gobernement", "government" },
196: { "gobernment", "government" }, { "gotton", "gotten" },
197: { "gracefull", "graceful" }, { "gradualy", "gradually" },
198: { "grammer", "grammar" }, { "hallo", "hello" },
199: { "hapily", "happily" }, { "harrass", "harass" },
200: { "havne", "have" }, { "heellp", "help" },
201: { "heighth", "height" }, { "hellp", "help" },
202: { "helo", "hello" }, { "herlo", "hello" },
203: { "hifin", "hyphen" }, { "hifine", "hyphen" },
204: { "higer", "higher" }, { "hiphine", "hyphen" },
205: { "hippie", "hippy" }, { "hippopotamous", "hippopotamus" },
206: { "hlp", "help" }, { "hourse", "horse" },
207: { "houssing", "housing" }, { "howaver", "however" },
208: { "howver", "however" }, { "humaniti", "humanity" },
209: { "hyfin", "hyphen" }, { "hypotathes", "hypothesis" },
210: { "hypotathese", "hypothesis" },
211: { "hystrical", "hysterical" }, { "ident", "indent" },
212: { "illegitament", "illegitimate" }, { "imbed", "embed" },
213: { "imediaetly", "immediately" }, { "imfamy", "infamy" },
214: { "immenant", "immanent" }, { "implemtes", "implements" },
215: { "inadvertant", "inadvertent" }, { "incase", "in case" },
216: { "incedious", "insidious" },
217: { "incompleet", "incomplete" },
218: { "incomplot", "incomplete" },
219: { "inconvenant", "inconvenient" },
220: { "inconvience", "inconvenience" },
221: { "independant", "independent" },
222: { "independenent", "independent" },
223: { "indepnends", "independent" }, { "indepth", "in depth" },
224: { "indispensible", "indispensable" },
225: { "inefficite", "inefficient" },
226: { "inerface", "interface" }, { "infact", "in fact" },
227: { "influencial", "influential" }, { "inital", "initial" },
228: { "initinized", "initialized" },
229: { "initized", "initialized" },
230: { "innoculate", "inoculate" },
231: { "insistant", "insistent" },
232: { "insistenet", "insistent" },
233: { "instulation", "installation" },
234: { "intealignt", "intelligent" },
235: { "intejilent", "intelligent" },
236: { "intelegent", "intelligent" },
237: { "intelegnent", "intelligent" },
238: { "intelejent", "intelligent" },
239: { "inteligent", "intelligent" },
240: { "intelignt", "intelligent" },
241: { "intellagant", "intelligent" },
242: { "intellegent", "intelligent" },
243: { "intellegint", "intelligent" },
244: { "intellgnt", "intelligent" },
245: { "intensionality", "intensionally" },
246: { "interate", "iterate" },
247: { "internation", "international" },
248: { "interpretate", "interpret" },
249: { "interpretter", "interpreter" },
250: { "intertes", "interested" },
251: { "intertesd", "interested" },
252: { "invermeantial", "environmental" },
253: { "irregardless", "regardless" },
254: { "irresistable", "irresistible" },
255: { "irritible", "irritable" }, { "islams", "muslims" },
256: { "isotrop", "isotope" }, { "isreal", "israel" },
257: { "johhn", "john" }, { "judgement", "judgment" },
258: { "kippur", "kipper" }, { "knawing", "knowing" },
259: { "latext", "latest" }, { "leasve", "leave" },
260: { "lesure", "leisure" }, { "liasion", "lesion" },
261: { "liason", "liaison" }, { "libary", "library" },
262: { "likly", "likely" }, { "lilometer", "kilometer" },
263: { "liquify", "liquefy" }, { "lloyer", "layer" },
264: { "lossing", "losing" }, { "luser", "laser" },
265: { "maintanence", "maintenance" },
266: { "majaerly", "majority" }, { "majoraly", "majority" },
267: { "maks", "masks" }, { "mandelbrot", "Mandelbrot" },
268: { "mant", "want" }, { "marshall", "marshal" },
269: { "maxium", "maximum" }, { "meory", "memory" },
270: { "metter", "better" }, { "mic", "mike" },
271: { "midia", "media" }, { "millenium", "millennium" },
272: { "miniscule", "minuscule" }, { "minkay", "monkey" },
273: { "minum", "minimum" }, { "mischievious", "mischievous" },
274: { "misilous", "miscellaneous" }, { "momento", "memento" },
275: { "monkay", "monkey" }, { "mosaik", "mosaic" },
276: { "mostlikely", "most likely" }, { "mousr", "mouser" },
277: { "mroe", "more" }, { "neccessary", "necessary" },
278: { "necesary", "necessary" }, { "necesser", "necessary" },
279: { "neice", "niece" }, { "neighbour", "neighbor" },
280: { "nemonic", "pneumonic" }, { "nevade", "Nevada" },
281: { "nickleodeon", "nickelodeon" }, { "nieve", "naive" },
282: { "noone", "no one" }, { "noticably", "noticeably" },
283: { "notin", "not in" }, { "nozled", "nuzzled" },
284: { "objectsion", "objects" }, { "obsfuscate", "obfuscate" },
285: { "ocassion", "occasion" }, { "occuppied", "occupied" },
286: { "occurence", "occurrence" },
287: { "octagenarian", "octogenarian" }, { "olf", "old" },
288: { "opposim", "opossum" }, { "organise", "organize" },
289: { "organiz", "organize" }, { "orientate", "orient" },
290: { "oscilascope", "oscilloscope" }, { "oving", "moving" },
291: { "paramers", "parameters" }, { "parametic", "parameter" },
292: { "paranets", "parameters" },
293: { "partrucal", "particular" },
294: { "pataphysical", "metaphysical" },
295: { "patten", "pattern" }, { "permissable", "permissible" },
296: { "permition", "permission" },
297: { "permmasivie", "permissive" },
298: { "perogative", "prerogative" }, { "persue", "pursue" },
299: { "phantasia", "fantasia" },
300: { "phenominal", "phenomenal" },
301: { "picaresque", "picturesque" },
302: { "playwrite", "playwright" }, { "poeses", "poesies" },
303: { "polation", "politician" }, { "poligamy", "polygamy" },
304: { "politict", "politic" }, { "pollice", "police" },
305: { "polypropalene", "polypropylene" },
306: { "pompom", "pompon" }, { "possable", "possible" },
307: { "practicle", "practical" },
308: { "pragmaticism", "pragmatism" },
309: { "preceeding", "preceding" }, { "precion", "precision" },
310: { "precios", "precision" }, { "preemptory", "peremptory" },
311: { "prefices", "prefixes" }, { "prefixt", "prefixed" },
312: { "presbyterian", "Presbyterian" }, { "presue", "pursue" },
313: { "presued", "pursued" }, { "privielage", "privilege" },
314: { "priviledge", "privilege" },
315: { "proceedures", "procedures" },
316: { "pronensiation", "pronunciation" },
317: { "pronisation", "pronunciation" },
318: { "pronounciation", "pronunciation" },
319: { "properally", "properly" },
320: { "proplematic", "problematic" }, { "protray", "portray" },
321: { "pscolgst", "psychologist" },
322: { "psicolagest", "psychologist" },
323: { "psycolagest", "psychologist" }, { "quoz", "quiz" },
324: { "radious", "radius" }, { "ramplily", "rampantly" },
325: { "reccomend", "recommend" }, { "reccona", "raccoon" },
326: { "recieve", "receive" }, { "reconise", "recognize" },
327: { "rectangeles", "rectangle" }, { "redign", "redesign" },
328: { "reoccurring", "recurring" },
329: { "repitition", "repetition" },
330: { "replasments", "replacement" },
331: { "reposable", "responsible" },
332: { "reseblence", "resemblance" }, { "respct", "respect" },
333: { "respecally", "respectfully" }, { "roon", "room" },
334: { "rought", "roughly" }, { "rsx", "RSX" },
335: { "rudemtry", "rudimentary" }, { "runnung", "running" },
336: { "sacreligious", "sacrilegious" }, { "saftly", "safely" },
337: { "salut", "salute" }, { "satifly", "satisfy" },
338: { "scrabdle", "scrabble" },
339: { "searcheable", "searchable" }, { "secion", "section" },
340: { "seferal", "several" }, { "segements", "segments" },
341: { "sence", "sense" }, { "seperate", "separate" },
342: { "sherbert", "sherbet" },
343: { "sicolagest", "psychologist" }, { "sieze", "seize" },
344: { "simpfilty", "simplicity" }, { "simplye", "simply" },
345: { "singal", "signal" }, { "sitte", "site" },
346: { "situration", "situation" }, { "slyph", "sylph" },
347: { "smil", "smile" }, { "snuck", "sneaked" },
348: { "sometmes", "sometimes" }, { "soonec", "sonic" },
349: { "specificialy", "specifically" }, { "spel", "spell" },
350: { "spoak", "spoke" }, { "sponsered", "sponsored" },
351: { "stering", "steering" },
352: { "straightjacket", "straitjacket" },
353: { "stumach", "stomach" }, { "stutent", "student" },
354: { "styleguide", "style guide" },
355: { "subisitions", "substitutions" },
356: { "subjecribed", "subscribed" }, { "subpena", "subpoena" },
357: { "substations", "substitutions" }, { "suger", "sugar" },
358: { "supercede", "supersede" },
359: { "superfulous", "superfluous" }, { "susan", "Susan" },
360: { "swimwear", "swim wear" },
361: { "syncorization", "synchronization" },
362: { "taff", "tough" }, { "taht", "that" },
363: { "tattos", "tattoos" }, { "techniquely", "technically" },
364: { "teh", "the" }, { "tem", "team" }, { "teo", "two" },
365: { "teridical", "theoretical" }, { "tesst", "test" },
366: { "tets", "tests" }, { "thanot", "than or" },
367: { "theirselves", "themselves" },
368: { "theridically", "theoretical" },
369: { "thredically", "theoretically" },
370: { "thruout", "throughout" }, { "ths", "this" },
371: { "titalate", "titillate" }, { "tobagan", "tobaggon" },
372: { "tommorrow", "tomorrow" }, { "tomorow", "tomorrow" },
373: { "tradegy", "tragedy" }, { "trubbel", "trouble" },
374: { "ttest", "test" }, { "tunnellike", "tunnel like" },
375: { "tured", "turned" }, { "tyrrany", "tyranny" },
376: { "unatourral", "unnatural" },
377: { "unaturral", "unnatural" },
378: { "unconisitional", "unconstitutional" },
379: { "unconscience", "unconscious" },
380: { "underladder", "under ladder" },
381: { "unentelegible", "unintelligible" },
382: { "unfortunently", "unfortunately" },
383: { "unnaturral", "unnatural" }, { "upcast", "up cast" },
384: { "upmost", "utmost" }, { "uranisium", "uranium" },
385: { "verison", "version" }, { "vinagarette", "vinaigrette" },
386: { "volumptuous", "voluptuous" },
387: { "volunteerism", "voluntarism" }, { "volye", "volley" },
388: { "wadting", "wasting" }, { "waite", "wait" },
389: { "wan't", "won't" }, { "warloord", "warlord" },
390: { "whaaat", "what" }, { "whard", "ward" },
391: { "whimp", "wimp" }, { "wicken", "weaken" },
392: { "wierd", "weird" }, { "wrank", "rank" },
393: { "writeen", "righten" }, { "writting", "writing" },
394: { "wundeews", "windows" }, { "yeild", "yield" },
395: { "youe", "your" } };
396:
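/**
 * A subset of the pairs in {@link #FIXTURE}. The list was produced by
 * running this test (presumably it holds the pairs whose Double Metaphone
 * encodings match; confirm against the tests that use it).
 */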
400: private static final String[][] MATCHES = {
401: { "Accosinly", "Occasionally" },
402: { "Maddness", "Madness" },
403: { "Occusionaly", "Occasionally" },
404: { "Steffen", "Stephen" },
405: { "Thw", "The" },
406: { "Unformanlly", "Unfortunately" },
407: { "Unfortally", "Unfortunately" },
408: { "abilitey", "ability" },
409: { "absorbtion", "absorption" },
410: { "accidently", "accidentally" },
411: { "accomodate", "accommodate" },
412: { "acommadate", "accommodate" },
413: { "acord", "accord" },
414: { "adultry", "adultery" },
415: { "aggresive", "aggressive" },
416: { "alchohol", "alcohol" },
417: { "alchoholic", "alcoholic" },
418: { "allieve", "alive" },
419: { "alot", "a lot" },
420: { "alright", "all right" },
421: { "amature", "amateur" },
422: { "ambivilant", "ambivalent" },
423: { "amourfous", "amorphous" },
424: { "annoint", "anoint" },
425: { "annonsment", "announcement" },
426: { "annoyting", "anting" },
427: { "annuncio", "announce" },
428: { "anotomy", "anatomy" },
429: { "antidesestablishmentarianism",
430: "antidisestablishmentarianism" },
431: { "antidisestablishmentarism",
432: "antidisestablishmentarianism" },
433: { "anynomous", "anonymous" }, { "appelet", "applet" },
434: { "appreceiated", "appreciated" },
435: { "appresteate", "appreciate" },
436: { "aquantance", "acquaintance" },
437: { "aricticure", "architecture" },
438: { "asterick", "asterisk" }, { "asymetric", "asymmetric" },
439: { "atentively", "attentively" }, { "bankrot", "bankrupt" },
440: { "basicly", "basically" }, { "batallion", "battalion" },
441: { "bbrose", "browse" }, { "beauro", "bureau" },
442: { "beaurocracy", "bureaucracy" },
443: { "beggining", "beginning" }, { "behaviour", "behavior" },
444: { "beleive", "believe" }, { "belive", "believe" },
445: { "blait", "bleat" }, { "bouyant", "buoyant" },
446: { "boygot", "boycott" }, { "brocolli", "broccoli" },
447: { "buder", "butter" }, { "budr", "butter" },
448: { "budter", "butter" }, { "buracracy", "bureaucracy" },
449: { "burracracy", "bureaucracy" }, { "buton", "button" },
450: { "byby", "by by" }, { "cauler", "caller" },
451: { "ceasar", "caesar" }, { "cemetary", "cemetery" },
452: { "changeing", "changing" }, { "cheet", "cheat" },
453: { "cimplicity", "simplicity" },
454: { "circumstaces", "circumstances" }, { "clob", "club" },
455: { "coaln", "colon" }, { "colleaque", "colleague" },
456: { "colloquilism", "colloquialism" },
457: { "columne", "column" }, { "comitmment", "commitment" },
458: { "comitte", "committee" }, { "comittmen", "commitment" },
459: { "comittmend", "commitment" },
460: { "commerciasl", "commercials" },
461: { "commited", "committed" }, { "commitee", "committee" },
462: { "companys", "companies" }, { "comupter", "computer" },
463: { "concensus", "consensus" },
464: { "confusionism", "confucianism" },
465: { "congradulations", "congratulations" },
466: { "contunie", "continue" }, { "cooly", "coolly" },
467: { "copping", "coping" }, { "cosmoplyton", "cosmopolitan" },
468: { "crasy", "crazy" }, { "croke", "croak" },
469: { "crucifiction", "crucifixion" },
470: { "crusifed", "crucified" }, { "cumba", "combo" },
471: { "custamisation", "customization" }, { "dag", "dog" },
472: { "daly", "daily" }, { "defence", "defense" },
473: { "definate", "definite" }, { "definately", "definitely" },
474: { "dependeble", "dependable" },
475: { "descrption", "description" },
476: { "descrptn", "description" },
477: { "desparate", "desperate" }, { "dessicate", "desiccate" },
478: { "destint", "distant" },
479: { "develepment", "developments" },
480: { "developement", "development" },
481: { "develpond", "development" }, { "devulge", "divulge" },
482: { "dieties", "deities" }, { "dinasaur", "dinosaur" },
483: { "dinasour", "dinosaur" }, { "discuess", "discuss" },
484: { "disect", "dissect" }, { "disippate", "dissipate" },
485: { "disition", "decision" }, { "dispair", "despair" },
486: { "distarct", "distract" }, { "distart", "distort" },
487: { "distroy", "destroy" }, { "doenload", "download" },
488: { "dongle", "dangle" }, { "doog", "dog" },
489: { "dramaticly", "dramatically" },
490: { "drunkeness", "drunkenness" },
491: { "ductioneery", "dictionary" }, { "ecstacy", "ecstasy" },
492: { "egsistence", "existence" }, { "eitiology", "etiology" },
493: { "elagent", "elegant" }, { "embarass", "embarrass" },
494: { "embarassment", "embarrassment" },
495: { "embaress", "embarrass" },
496: { "encapsualtion", "encapsulation" },
497: { "encyclapidia", "encyclopedia" },
498: { "encyclopia", "encyclopedia" }, { "engins", "engine" },
499: { "enhence", "enhance" }, { "ennuui", "ennui" },
500: { "enventions", "inventions" },
501: { "envireminakl", "environmental" },
502: { "enviroment", "environment" }, { "epitomy", "epitome" },
503: { "equire", "acquire" }, { "errara", "error" },
504: { "evaualtion", "evaluation" }, { "excede", "exceed" },
505: { "excercise", "exercise" }, { "excpt", "except" },
506: { "exhileration", "exhilaration" },
507: { "existance", "existence" }, { "expleyly", "explicitly" },
508: { "explity", "explicitly" }, { "failer", "failure" },
509: { "faver", "favor" }, { "faxe", "fax" },
510: { "firey", "fiery" }, { "fistival", "festival" },
511: { "flatterring", "flattering" }, { "flukse", "flux" },
512: { "fone", "phone" }, { "forsee", "foresee" },
513: { "frustartaion", "frustrating" },
514: { "funetik", "phonetic" }, { "gaurd", "guard" },
515: { "generly", "generally" }, { "ghandi", "gandhi" },
516: { "gotton", "gotten" }, { "gracefull", "graceful" },
517: { "gradualy", "gradually" }, { "grammer", "grammar" },
518: { "hallo", "hello" }, { "hapily", "happily" },
519: { "harrass", "harass" }, { "heellp", "help" },
520: { "heighth", "height" }, { "hellp", "help" },
521: { "helo", "hello" }, { "hifin", "hyphen" },
522: { "hifine", "hyphen" }, { "hiphine", "hyphen" },
523: { "hippie", "hippy" }, { "hippopotamous", "hippopotamus" },
524: { "hourse", "horse" }, { "houssing", "housing" },
525: { "howaver", "however" }, { "howver", "however" },
526: { "humaniti", "humanity" }, { "hyfin", "hyphen" },
527: { "hystrical", "hysterical" },
528: { "illegitament", "illegitimate" }, { "imbed", "embed" },
529: { "imediaetly", "immediately" },
530: { "immenant", "immanent" }, { "implemtes", "implements" },
531: { "inadvertant", "inadvertent" }, { "incase", "in case" },
532: { "incedious", "insidious" },
533: { "incompleet", "incomplete" },
534: { "incomplot", "incomplete" },
535: { "inconvenant", "inconvenient" },
536: { "inconvience", "inconvenience" },
537: { "independant", "independent" },
538: { "independenent", "independent" },
539: { "indepnends", "independent" }, { "indepth", "in depth" },
540: { "indispensible", "indispensable" },
541: { "inefficite", "inefficient" }, { "infact", "in fact" },
542: { "influencial", "influential" },
543: { "innoculate", "inoculate" },
544: { "insistant", "insistent" },
545: { "insistenet", "insistent" },
546: { "instulation", "installation" },
547: { "intealignt", "intelligent" },
548: { "intelegent", "intelligent" },
549: { "intelegnent", "intelligent" },
550: { "intelejent", "intelligent" },
551: { "inteligent", "intelligent" },
552: { "intelignt", "intelligent" },
553: { "intellagant", "intelligent" },
554: { "intellegent", "intelligent" },
555: { "intellegint", "intelligent" },
556: { "intellgnt", "intelligent" },
557: { "intensionality", "intensionally" },
558: { "internation", "international" },
559: { "interpretate", "interpret" },
560: { "interpretter", "interpreter" },
561: { "intertes", "interested" },
562: { "intertesd", "interested" },
563: { "invermeantial", "environmental" },
564: { "irresistable", "irresistible" },
565: { "irritible", "irritable" }, { "isreal", "israel" },
566: { "johhn", "john" }, { "kippur", "kipper" },
567: { "knawing", "knowing" }, { "lesure", "leisure" },
568: { "liasion", "lesion" }, { "liason", "liaison" },
569: { "likly", "likely" }, { "liquify", "liquefy" },
570: { "lloyer", "layer" }, { "lossing", "losing" },
571: { "luser", "laser" }, { "maintanence", "maintenance" },
572: { "mandelbrot", "Mandelbrot" }, { "marshall", "marshal" },
573: { "maxium", "maximum" }, { "mic", "mike" },
574: { "midia", "media" }, { "millenium", "millennium" },
575: { "miniscule", "minuscule" }, { "minkay", "monkey" },
576: { "mischievious", "mischievous" },
577: { "momento", "memento" }, { "monkay", "monkey" },
578: { "mosaik", "mosaic" }, { "mostlikely", "most likely" },
579: { "mousr", "mouser" }, { "mroe", "more" },
580: { "necesary", "necessary" }, { "necesser", "necessary" },
581: { "neice", "niece" }, { "neighbour", "neighbor" },
582: { "nemonic", "pneumonic" }, { "nevade", "Nevada" },
583: { "nickleodeon", "nickelodeon" }, { "nieve", "naive" },
584: { "noone", "no one" }, { "notin", "not in" },
585: { "nozled", "nuzzled" }, { "objectsion", "objects" },
586: { "ocassion", "occasion" }, { "occuppied", "occupied" },
587: { "occurence", "occurrence" },
588: { "octagenarian", "octogenarian" },
589: { "opposim", "opossum" }, { "organise", "organize" },
590: { "organiz", "organize" }, { "orientate", "orient" },
591: { "oscilascope", "oscilloscope" },
592: { "parametic", "parameter" },
593: { "permissable", "permissible" },
594: { "permmasivie", "permissive" }, { "persue", "pursue" },
595: { "phantasia", "fantasia" },
596: { "phenominal", "phenomenal" },
597: { "playwrite", "playwright" }, { "poeses", "poesies" },
598: { "poligamy", "polygamy" }, { "politict", "politic" },
599: { "pollice", "police" },
600: { "polypropalene", "polypropylene" },
601: { "possable", "possible" }, { "practicle", "practical" },
602: { "pragmaticism", "pragmatism" },
603: { "preceeding", "preceding" }, { "precios", "precision" },
604: { "preemptory", "peremptory" }, { "prefixt", "prefixed" },
605: { "presbyterian", "Presbyterian" }, { "presue", "pursue" },
606: { "presued", "pursued" }, { "privielage", "privilege" },
607: { "priviledge", "privilege" },
608: { "proceedures", "procedures" },
609: { "pronensiation", "pronunciation" },
610: { "pronounciation", "pronunciation" },
611: { "properally", "properly" },
612: { "proplematic", "problematic" }, { "protray", "portray" },
613: { "pscolgst", "psychologist" },
614: { "psicolagest", "psychologist" },
615: { "psycolagest", "psychologist" }, { "quoz", "quiz" },
616: { "radious", "radius" }, { "reccomend", "recommend" },
617: { "reccona", "raccoon" }, { "recieve", "receive" },
618: { "reconise", "recognize" },
619: { "rectangeles", "rectangle" },
620: { "reoccurring", "recurring" },
621: { "repitition", "repetition" },
622: { "replasments", "replacement" }, { "respct", "respect" },
623: { "respecally", "respectfully" }, { "rsx", "RSX" },
624: { "runnung", "running" },
625: { "sacreligious", "sacrilegious" }, { "salut", "salute" },
626: { "searcheable", "searchable" }, { "seferal", "several" },
627: { "segements", "segments" }, { "sence", "sense" },
628: { "seperate", "separate" },
629: { "sicolagest", "psychologist" }, { "sieze", "seize" },
630: { "simplye", "simply" }, { "sitte", "site" },
631: { "slyph", "sylph" }, { "smil", "smile" },
632: { "sometmes", "sometimes" }, { "soonec", "sonic" },
633: { "specificialy", "specifically" }, { "spel", "spell" },
634: { "spoak", "spoke" }, { "sponsered", "sponsored" },
635: { "stering", "steering" },
636: { "straightjacket", "straitjacket" },
637: { "stumach", "stomach" }, { "stutent", "student" },
638: { "styleguide", "style guide" }, { "subpena", "subpoena" },
639: { "substations", "substitutions" },
640: { "supercede", "supersede" },
641: { "superfulous", "superfluous" }, { "susan", "Susan" },
642: { "swimwear", "swim wear" },
643: { "syncorization", "synchronization" },
644: { "taff", "tough" }, { "taht", "that" },
645: { "tattos", "tattoos" }, { "techniquely", "technically" },
646: { "teh", "the" }, { "tem", "team" }, { "teo", "two" },
647: { "teridical", "theoretical" }, { "tesst", "test" },
648: { "theridically", "theoretical" },
649: { "thredically", "theoretically" },
650: { "thruout", "throughout" }, { "ths", "this" },
651: { "titalate", "titillate" }, { "tobagan", "tobaggon" },
652: { "tommorrow", "tomorrow" }, { "tomorow", "tomorrow" },
653: { "trubbel", "trouble" }, { "ttest", "test" },
654: { "tyrrany", "tyranny" }, { "unatourral", "unnatural" },
655: { "unaturral", "unnatural" },
656: { "unconisitional", "unconstitutional" },
657: { "unconscience", "unconscious" },
658: { "underladder", "under ladder" },
659: { "unentelegible", "unintelligible" },
660: { "unfortunently", "unfortunately" },
661: { "unnaturral", "unnatural" }, { "upcast", "up cast" },
662: { "verison", "version" }, { "vinagarette", "vinaigrette" },
663: { "volunteerism", "voluntarism" }, { "volye", "volley" },
664: { "waite", "wait" }, { "wan't", "won't" },
665: { "warloord", "warlord" }, { "whaaat", "what" },
666: { "whard", "ward" }, { "whimp", "wimp" },
667: { "wicken", "weaken" }, { "wierd", "weird" },
668: { "wrank", "rank" }, { "writeen", "righten" },
669: { "writting", "writing" }, { "wundeews", "windows" },
670: { "yeild", "yield" }, };
671:
672: public static Test suite() {
673: return (new TestSuite(DoubleMetaphoneTest.class));
674: }
675:
676: private DoubleMetaphone doubleMetaphone = null;
677:
/**
 * Creates a test case with the given name.
 *
 * @param name the test case name, handed to the superclass constructor
 */
public DoubleMetaphoneTest(String name) {
    super(name);
}
681:
682: /**
683: * Tests encoding APIs in one place.
684: */
685: private void assertDoubleMetaphone(String expected, String source) {
686: assertEquals(expected, this .getDoubleMetaphone().encode(source));
687: try {
688: assertEquals(expected, this .getDoubleMetaphone().encode(
689: (Object) source));
690: } catch (EncoderException e) {
691: fail("Unexpected expection: " + e);
692: }
693: assertEquals(expected, this .getDoubleMetaphone()
694: .doubleMetaphone(source));
695: assertEquals(expected, this .getDoubleMetaphone()
696: .doubleMetaphone(source, false));
697: }
698:
699: /**
700: * Tests encoding APIs in one place.
701: */
702: public void assertDoubleMetaphoneAlt(String expected, String source) {
703: assertEquals(expected, this .getDoubleMetaphone()
704: .doubleMetaphone(source, true));
705: }
706:
707: public void doubleMetaphoneEqualTest(String[][] pairs,
708: boolean useAlternate) {
709: this .validateFixture(pairs);
710: for (int i = 0; i < pairs.length; i++) {
711: String name0 = pairs[i][0];
712: String name1 = pairs[i][1];
713: String failMsg = "Expected match between " + name0
714: + " and " + name1 + " (use alternate: "
715: + useAlternate + ")";
716: assertTrue(failMsg, this .getDoubleMetaphone()
717: .isDoubleMetaphoneEqual(name0, name1, useAlternate));
718: assertTrue(failMsg, this .getDoubleMetaphone()
719: .isDoubleMetaphoneEqual(name1, name0, useAlternate));
720: if (!useAlternate) {
721: assertTrue(failMsg, this .getDoubleMetaphone()
722: .isDoubleMetaphoneEqual(name0, name1));
723: assertTrue(failMsg, this .getDoubleMetaphone()
724: .isDoubleMetaphoneEqual(name1, name0));
725: }
726: }
727: }
728:
729: public void doubleMetaphoneNotEqualTest(boolean alternate) {
730: assertFalse(this .getDoubleMetaphone().isDoubleMetaphoneEqual(
731: "Brain", "Band", alternate));
732: assertFalse(this .getDoubleMetaphone().isDoubleMetaphoneEqual(
733: "Band", "Brain", alternate));
734:
735: if (!alternate) {
736: assertFalse(this .getDoubleMetaphone()
737: .isDoubleMetaphoneEqual("Brain", "Band"));
738: assertFalse(this .getDoubleMetaphone()
739: .isDoubleMetaphoneEqual("Band", "Brain"));
740: }
741: }
742:
743: private DoubleMetaphone getDoubleMetaphone() {
744: return this .doubleMetaphone;
745: }
746:
747: protected StringEncoder makeEncoder() {
748: return new Metaphone();
749: }
750:
751: private void setDoubleMetaphone(DoubleMetaphone doubleMetaphone) {
752: this .doubleMetaphone = doubleMetaphone;
753: }
754:
755: public void setUp() throws Exception {
756: super .setUp();
757: this .setDoubleMetaphone(new DoubleMetaphone());
758: }
759:
760: public void tearDown() throws Exception {
761: super .tearDown();
762: this .setDoubleMetaphone(null);
763: }
764:
765: public void testDoubleMetaphone() {
766: assertDoubleMetaphone("TSTN", "testing");
767: assertDoubleMetaphone("0", "The");
768: assertDoubleMetaphone("KK", "quick");
769: assertDoubleMetaphone("PRN", "brown");
770: assertDoubleMetaphone("FKS", "fox");
771: assertDoubleMetaphone("JMPT", "jumped");
772: assertDoubleMetaphone("AFR", "over");
773: assertDoubleMetaphone("0", "the");
774: assertDoubleMetaphone("LS", "lazy");
775: assertDoubleMetaphone("TKS", "dogs");
776: assertDoubleMetaphone("MKFR", "MacCafferey");
777: assertDoubleMetaphone("STFN", "Stephan");
778: assertDoubleMetaphone("KSSK", "Kuczewski");
779:
780: assertDoubleMetaphoneAlt("TSTN", "testing");
781: assertDoubleMetaphoneAlt("T", "The");
782: assertDoubleMetaphoneAlt("KK", "quick");
783: assertDoubleMetaphoneAlt("PRN", "brown");
784: assertDoubleMetaphoneAlt("FKS", "fox");
785: assertDoubleMetaphoneAlt("AMPT", "jumped");
786: assertDoubleMetaphoneAlt("AFR", "over");
787: assertDoubleMetaphoneAlt("T", "the");
788: assertDoubleMetaphoneAlt("LS", "lazy");
789: assertDoubleMetaphoneAlt("TKS", "dogs");
790: assertDoubleMetaphoneAlt("MKFR", "MacCafferey");
791: assertDoubleMetaphoneAlt("STFN", "Stephan");
792: assertDoubleMetaphoneAlt("KXFS", "Kutchefski");
793: }
794:
795: public void testEmpty() {
796: assertEquals(null, this .getDoubleMetaphone().doubleMetaphone(
797: null));
798: assertEquals(null, this .getDoubleMetaphone()
799: .doubleMetaphone(""));
800: assertEquals(null, this .getDoubleMetaphone().doubleMetaphone(
801: " "));
802: assertEquals(null, this .getDoubleMetaphone().doubleMetaphone(
803: "\t\n\r "));
804: }
805:
806: public void testIsDoubleMetaphoneEqualBasic() {
807: String[][] testFixture = new String[][] { { "Case", "case" },
808: { "CASE", "Case" }, { "caSe", "cAsE" },
809: { "cookie", "quick" }, { "quick", "cookie" },
810: { "Brian", "Bryan" }, { "Auto", "Otto" },
811: { "Steven", "Stefan" }, { "Philipowitz", "Filipowicz" } };
812: doubleMetaphoneEqualTest(testFixture, false);
813: doubleMetaphoneEqualTest(testFixture, true);
814: }
815:
816: /**
817: * Example in the original article but failures in this Java impl:
818: */
819: public void testIsDoubleMetaphoneEqualExtended1() {
820: // String[][] testFixture = new String[][] { { "Smith", "Schmidt" }
821: // };
822: // doubleMetaphoneEqualTest(testFixture, false);
823: // doubleMetaphoneEqualTest(testFixture, true);
824: }
825:
826: public void testIsDoubleMetaphoneEqualExtended2() {
827: String[][] testFixture = new String[][] { { "Jablonski",
828: "Yablonsky" } };
829: //doubleMetaphoneEqualTest(testFixture, false);
830: doubleMetaphoneEqualTest(testFixture, true);
831: }
832:
833: /**
834: * Used to generate the MATCHES array and test possible matches from the
835: * FIXTURE arrary.
836: */
837: public void testIsDoubleMetaphoneEqualExtended3() {
838: this .validateFixture(FIXTURE);
839: StringBuffer failures = new StringBuffer();
840: StringBuffer matches = new StringBuffer();
841: String cr = System.getProperty("line.separator");
842: matches.append("private static final String[][] MATCHES = {"
843: + cr);
844: int failCount = 0;
845: for (int i = 0; i < FIXTURE.length; i++) {
846: String name0 = FIXTURE[i][0];
847: String name1 = FIXTURE[i][1];
848: boolean match1 = this .getDoubleMetaphone()
849: .isDoubleMetaphoneEqual(name0, name1, false);
850: boolean match2 = this .getDoubleMetaphone()
851: .isDoubleMetaphoneEqual(name0, name1, true);
852: if (match1 == false && match2 == false) {
853: String failMsg = "[" + i + "] " + name0 + " and "
854: + name1 + cr;
855: failures.append(failMsg);
856: failCount++;
857: } else {
858: matches.append("{\"" + name0 + "\", \"" + name1
859: + "\"}," + cr);
860: }
861: }
862: String msg = failures.toString();
863: matches.append("};");
864: // Turn on to print a new MATCH array
865: //System.out.println(matches.toString());
866: if (msg.length() > 0) {
867: // Turn on to see which pairs do NOT match.
868: //fail(failCount + " failures out of " + FIXTURE.length + ". The
869: // following could be made to match: " + cr + msg);
870: }
871: }
872:
873: public void testIsDoubleMetaphoneEqualWithMATCHES() {
874: this .validateFixture(MATCHES);
875: for (int i = 0; i < MATCHES.length; i++) {
876: String name0 = MATCHES[i][0];
877: String name1 = MATCHES[i][1];
878: boolean match1 = this .getDoubleMetaphone()
879: .isDoubleMetaphoneEqual(name0, name1, false);
880: boolean match2 = this .getDoubleMetaphone()
881: .isDoubleMetaphoneEqual(name0, name1, true);
882: if (match1 == false && match2 == false) {
883: fail("Expected match [" + i + "] " + name0 + " and "
884: + name1);
885: }
886: }
887: }
888:
889: public void testIsDoubleMetaphoneNotEqual() {
890: doubleMetaphoneNotEqualTest(false);
891: doubleMetaphoneNotEqualTest(true);
892: }
893:
894: public void testCCedilla() {
895: this .getDoubleMetaphone().isDoubleMetaphoneEqual("ç", "S");
896: }
897:
898: public void testNTilde() {
899: this .getDoubleMetaphone().isDoubleMetaphoneEqual("ñ", "N");
900: }
901:
902: public void validateFixture(String[][] pairs) {
903: if (pairs.length == 0) {
904: fail("Test fixture is empty");
905: }
906: for (int i = 0; i < pairs.length; i++) {
907: if (pairs[i].length != 2) {
908: fail("Error in test fixture in the data array at index "
909: + i);
910: }
911: }
912: }
913: }
|