001: /*
002: * Copyright 2001-2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: // (FYI: Formatted and sorted with Eclipse)
018: package org.apache.commons.codec.language;
019:
020: import junit.framework.Test;
021: import junit.framework.TestSuite;
022: import org.apache.commons.codec.EncoderException;
023: import org.apache.commons.codec.StringEncoder;
024: import org.apache.commons.codec.StringEncoderAbstractTest;
025:
026: /**
027: * Tests {@link Soundex}
028: *
029: * @version $Id: SoundexTest.java,v 1.18 2004/06/02 00:55:38 ggregory Exp $
030: * @author Apache Software Foundation
031: */
032: public class SoundexTest extends StringEncoderAbstractTest {
033:
034: public static Test suite() {
035: return (new TestSuite(SoundexTest.class));
036: }
037:
038: private Soundex encoder = null;
039:
040: public SoundexTest(String name) {
041: super (name);
042: }
043:
044: void encodeAll(String[] strings, String expectedEncoding) {
045: for (int i = 0; i < strings.length; i++) {
046: assertEquals(expectedEncoding, this .getEncoder().encode(
047: strings[i]));
048: }
049: }
050:
051: /**
052: * @return Returns the _encoder.
053: */
054: public Soundex getEncoder() {
055: return this .encoder;
056: }
057:
058: protected StringEncoder makeEncoder() {
059: return new Soundex();
060: }
061:
062: /**
063: * @param encoder
064: * The encoder to set.
065: */
066: public void setEncoder(Soundex encoder) {
067: this .encoder = encoder;
068: }
069:
070: public void setUp() throws Exception {
071: super .setUp();
072: this .setEncoder(new Soundex());
073: }
074:
075: public void tearDown() throws Exception {
076: super .tearDown();
077: this .setEncoder(null);
078: }
079:
080: public void testB650() {
081: this .encodeAll(new String[] { "BARHAM", "BARONE", "BARRON",
082: "BERNA", "BIRNEY", "BIRNIE", "BOOROM", "BOREN", "BORN",
083: "BOURN", "BOURNE", "BOWRON", "BRAIN", "BRAME", "BRANN",
084: "BRAUN", "BREEN", "BRIEN", "BRIM", "BRIMM", "BRINN",
085: "BRION", "BROOM", "BROOME", "BROWN", "BROWNE", "BRUEN",
086: "BRUHN", "BRUIN", "BRUMM", "BRUN", "BRUNO", "BRYAN",
087: "BURIAN", "BURN", "BURNEY", "BYRAM", "BYRNE", "BYRON",
088: "BYRUM" }, "B650");
089: }
090:
091: public void testDifference() throws EncoderException {
092: // Edge cases
093: assertEquals(0, this .getEncoder().difference(null, null));
094: assertEquals(0, this .getEncoder().difference("", ""));
095: assertEquals(0, this .getEncoder().difference(" ", " "));
096: // Normal cases
097: assertEquals(4, this .getEncoder().difference("Smith", "Smythe"));
098: assertEquals(2, this .getEncoder().difference("Ann", "Andrew"));
099: assertEquals(1, this .getEncoder().difference("Margaret",
100: "Andrew"));
101: assertEquals(0, this .getEncoder().difference("Janet",
102: "Margaret"));
103: // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
104: assertEquals(4, this .getEncoder().difference("Green", "Greene"));
105: assertEquals(0, this .getEncoder().difference("Blotchet-Halls",
106: "Greene"));
107: // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
108: assertEquals(4, this .getEncoder().difference("Smith", "Smythe"));
109: assertEquals(4, this .getEncoder().difference("Smithers",
110: "Smythers"));
111: assertEquals(2, this .getEncoder().difference("Anothers",
112: "Brothers"));
113: }
114:
115: public void testEncodeBasic() {
116: assertEquals("T235", this .getEncoder().encode("testing"));
117: assertEquals("T000", this .getEncoder().encode("The"));
118: assertEquals("Q200", this .getEncoder().encode("quick"));
119: assertEquals("B650", this .getEncoder().encode("brown"));
120: assertEquals("F200", this .getEncoder().encode("fox"));
121: assertEquals("J513", this .getEncoder().encode("jumped"));
122: assertEquals("O160", this .getEncoder().encode("over"));
123: assertEquals("T000", this .getEncoder().encode("the"));
124: assertEquals("L200", this .getEncoder().encode("lazy"));
125: assertEquals("D200", this .getEncoder().encode("dogs"));
126: }
127:
128: /**
129: * Examples from
130: * http://www.bradandkathy.com/genealogy/overviewofsoundex.html
131: */
132: public void testEncodeBatch2() {
133: assertEquals("A462", this .getEncoder().encode("Allricht"));
134: assertEquals("E166", this .getEncoder().encode("Eberhard"));
135: assertEquals("E521", this .getEncoder().encode("Engebrethson"));
136: assertEquals("H512", this .getEncoder().encode("Heimbach"));
137: assertEquals("H524", this .getEncoder().encode("Hanselmann"));
138: assertEquals("H431", this .getEncoder().encode("Hildebrand"));
139: assertEquals("K152", this .getEncoder().encode("Kavanagh"));
140: assertEquals("L530", this .getEncoder().encode("Lind"));
141: assertEquals("L222", this .getEncoder().encode("Lukaschowsky"));
142: assertEquals("M235", this .getEncoder().encode("McDonnell"));
143: assertEquals("M200", this .getEncoder().encode("McGee"));
144: assertEquals("O155", this .getEncoder().encode("Opnian"));
145: assertEquals("O155", this .getEncoder().encode("Oppenheimer"));
146: assertEquals("R355", this .getEncoder().encode("Riedemanas"));
147: assertEquals("Z300", this .getEncoder().encode("Zita"));
148: assertEquals("Z325", this .getEncoder().encode("Zitzmeinn"));
149: }
150:
151: /**
152: * Examples from
153: * http://www.archives.gov/research_room/genealogy/census/soundex.html
154: */
155: public void testEncodeBatch3() {
156: assertEquals("W252", this .getEncoder().encode("Washington"));
157: assertEquals("L000", this .getEncoder().encode("Lee"));
158: assertEquals("G362", this .getEncoder().encode("Gutierrez"));
159: assertEquals("P236", this .getEncoder().encode("Pfister"));
160: assertEquals("J250", this .getEncoder().encode("Jackson"));
161: assertEquals("T522", this .getEncoder().encode("Tymczak"));
162: // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
163: // possible.
164: assertEquals("V532", this .getEncoder().encode("VanDeusen"));
165: }
166:
167: /**
168: * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
169: */
170: public void testEncodeBatch4() {
171: assertEquals("H452", this .getEncoder().encode("HOLMES"));
172: assertEquals("A355", this .getEncoder().encode("ADOMOMI"));
173: assertEquals("V536", this .getEncoder().encode("VONDERLEHR"));
174: assertEquals("B400", this .getEncoder().encode("BALL"));
175: assertEquals("S000", this .getEncoder().encode("SHAW"));
176: assertEquals("J250", this .getEncoder().encode("JACKSON"));
177: assertEquals("S545", this .getEncoder().encode("SCANLON"));
178: assertEquals("S532", this .getEncoder().encode("SAINTJOHN"));
179:
180: }
181:
182: public void testBadCharacters() {
183: assertEquals("H452", this .getEncoder().encode("HOL>MES"));
184:
185: }
186:
187: public void testEncodeIgnoreApostrophes() {
188: this .encodeAll(
189: new String[] { "OBrien", "'OBrien", "O'Brien",
190: "OB'rien", "OBr'ien", "OBri'en", "OBrie'n",
191: "OBrien'" }, "O165");
192: }
193:
194: /**
195: * Test data from http://www.myatt.demon.co.uk/sxalg.htm
196: */
197: public void testEncodeIgnoreHyphens() {
198: this .encodeAll(new String[] { "KINGSMITH", "-KINGSMITH",
199: "K-INGSMITH", "KI-NGSMITH", "KIN-GSMITH", "KING-SMITH",
200: "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H",
201: "KINGSMITH-" }, "K525");
202: }
203:
204: public void testEncodeIgnoreTrimmable() {
205: assertEquals("W252", this .getEncoder().encode(
206: " \t\n\r Washington \t\n\r "));
207: }
208:
209: /**
210: * Consonants from the same code group separated by W or H are treated as
211: * one.
212: */
213: public void testHWRuleEx1() {
214: // From
215: // http://www.archives.gov/research_room/genealogy/census/soundex.html:
216: // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
217: // for the F). It is not coded A-226.
218: assertEquals("A261", this .getEncoder().encode("Ashcraft"));
219: }
220:
221: /**
222: * Consonants from the same code group separated by W or H are treated as
223: * one.
224: *
225: * Test data from http://www.myatt.demon.co.uk/sxalg.htm
226: */
227: public void testHWRuleEx2() {
228: assertEquals("B312", this .getEncoder().encode("BOOTHDAVIS"));
229: assertEquals("B312", this .getEncoder().encode("BOOTH-DAVIS"));
230: }
231:
232: /**
233: * Consonants from the same code group separated by W or H are treated as
234: * one.
235: */
236: public void testHWRuleEx3() {
237: assertEquals("S460", this .getEncoder().encode("Sgler"));
238: assertEquals("S460", this .getEncoder().encode("Swhgler"));
239: // Also S460:
240: this .encodeAll(new String[] { "SAILOR", "SALYER", "SAYLOR",
241: "SCHALLER", "SCHELLER", "SCHILLER", "SCHOOLER",
242: "SCHULER", "SCHUYLER", "SEILER", "SEYLER", "SHOLAR",
243: "SHULER", "SILAR", "SILER", "SILLER" }, "S460");
244: }
245:
246: public void testMaxLength() throws Exception {
247: Soundex soundex = new Soundex();
248: soundex.setMaxLength(soundex.getMaxLength());
249: assertEquals("S460", this .getEncoder().encode("Sgler"));
250: }
251:
252: public void testMaxLengthLessThan3Fix() throws Exception {
253: Soundex soundex = new Soundex();
254: soundex.setMaxLength(2);
255: assertEquals("S460", soundex.encode("SCHELLER"));
256: }
257:
258: /**
259: * Examples for MS SQLServer from
260: * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
261: */
262: public void testMsSqlServer1() {
263: assertEquals("S530", this .getEncoder().encode("Smith"));
264: assertEquals("S530", this .getEncoder().encode("Smythe"));
265: }
266:
267: /**
268: * Examples for MS SQLServer from
269: * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support/kb/articles/Q100/3/65.asp&NoWebContent=1
270: */
271: public void testMsSqlServer2() {
272: this .encodeAll(new String[] { "Erickson", "Erickson",
273: "Erikson", "Ericson", "Ericksen", "Ericsen" }, "E625");
274: }
275:
276: /**
277: * Examples for MS SQLServer from
278: * http://databases.about.com/library/weekly/aa042901a.htm
279: */
280: public void testMsSqlServer3() {
281: assertEquals("A500", this .getEncoder().encode("Ann"));
282: assertEquals("A536", this .getEncoder().encode("Andrew"));
283: assertEquals("J530", this .getEncoder().encode("Janet"));
284: assertEquals("M626", this .getEncoder().encode("Margaret"));
285: assertEquals("S315", this .getEncoder().encode("Steven"));
286: assertEquals("M240", this .getEncoder().encode("Michael"));
287: assertEquals("R163", this .getEncoder().encode("Robert"));
288: assertEquals("L600", this .getEncoder().encode("Laura"));
289: assertEquals("A500", this .getEncoder().encode("Anne"));
290: }
291:
292: /**
293: * Fancy characters are not mapped by the default US mapping.
294: *
295: * http://nagoya.apache.org/bugzilla/show_bug.cgi?id=29080
296: */
297: public void testUsMappingOWithDiaeresis() {
298: assertEquals("O000", this .getEncoder().encode("o"));
299: try {
300: assertEquals("Ö000", this .getEncoder().encode("ö"));
301: fail("Expected IllegalArgumentException not thrown");
302: } catch (IllegalArgumentException e) {
303: // expected
304: }
305: }
306:
307: /**
308: * Fancy characters are not mapped by the default US mapping.
309: *
310: * http://nagoya.apache.org/bugzilla/show_bug.cgi?id=29080
311: */
312: public void testUsMappingEWithAcute() {
313: assertEquals("E000", this .getEncoder().encode("e"));
314: try {
315: assertEquals("É000", this .getEncoder().encode("é"));
316: fail("Expected IllegalArgumentException not thrown");
317: } catch (IllegalArgumentException e) {
318: // expected
319: }
320: }
321: }
|