001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 // (FYI: Formatted and sorted with Eclipse)
019
020 package org.apache.commons.codec.language;
021
022 import junit.framework.Assert;
023
024 import org.apache.commons.codec.EncoderException;
025 import org.apache.commons.codec.StringEncoder;
026 import org.apache.commons.codec.StringEncoderAbstractTest;
027 import org.junit.Test;
028
029 /**
030 * Tests {@link Soundex}.
031 *
032 * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
033 *
034 * @version $Id: SoundexTest.html 889935 2013-12-11 05:05:13Z ggregory $
035 */
036 public class SoundexTest extends StringEncoderAbstractTest {
037
038 @Override
039 protected StringEncoder createStringEncoder() {
040 return new Soundex();
041 }
042
043 /**
044 * @return Returns the encoder.
045 */
046 public Soundex getSoundexEncoder() {
047 return (Soundex)this.getStringEncoder();
048 }
049
050 @Test
051 public void testB650() throws EncoderException {
052 this.checkEncodingVariations("B650", new String[]{
053 "BARHAM",
054 "BARONE",
055 "BARRON",
056 "BERNA",
057 "BIRNEY",
058 "BIRNIE",
059 "BOOROM",
060 "BOREN",
061 "BORN",
062 "BOURN",
063 "BOURNE",
064 "BOWRON",
065 "BRAIN",
066 "BRAME",
067 "BRANN",
068 "BRAUN",
069 "BREEN",
070 "BRIEN",
071 "BRIM",
072 "BRIMM",
073 "BRINN",
074 "BRION",
075 "BROOM",
076 "BROOME",
077 "BROWN",
078 "BROWNE",
079 "BRUEN",
080 "BRUHN",
081 "BRUIN",
082 "BRUMM",
083 "BRUN",
084 "BRUNO",
085 "BRYAN",
086 "BURIAN",
087 "BURN",
088 "BURNEY",
089 "BYRAM",
090 "BYRNE",
091 "BYRON",
092 "BYRUM"});
093 }
094
095 @Test
096 public void testBadCharacters() {
097 Assert.assertEquals("H452", this.getSoundexEncoder().encode("HOL>MES"));
098
099 }
100
101 @Test
102 public void testDifference() throws EncoderException {
103 // Edge cases
104 Assert.assertEquals(0, this.getSoundexEncoder().difference(null, null));
105 Assert.assertEquals(0, this.getSoundexEncoder().difference("", ""));
106 Assert.assertEquals(0, this.getSoundexEncoder().difference(" ", " "));
107 // Normal cases
108 Assert.assertEquals(4, this.getSoundexEncoder().difference("Smith", "Smythe"));
109 Assert.assertEquals(2, this.getSoundexEncoder().difference("Ann", "Andrew"));
110 Assert.assertEquals(1, this.getSoundexEncoder().difference("Margaret", "Andrew"));
111 Assert.assertEquals(0, this.getSoundexEncoder().difference("Janet", "Margaret"));
112 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
113 Assert.assertEquals(4, this.getSoundexEncoder().difference("Green", "Greene"));
114 Assert.assertEquals(0, this.getSoundexEncoder().difference("Blotchet-Halls", "Greene"));
115 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
116 Assert.assertEquals(4, this.getSoundexEncoder().difference("Smith", "Smythe"));
117 Assert.assertEquals(4, this.getSoundexEncoder().difference("Smithers", "Smythers"));
118 Assert.assertEquals(2, this.getSoundexEncoder().difference("Anothers", "Brothers"));
119 }
120
121 @Test
122 public void testEncodeBasic() {
123 Assert.assertEquals("T235", this.getSoundexEncoder().encode("testing"));
124 Assert.assertEquals("T000", this.getSoundexEncoder().encode("The"));
125 Assert.assertEquals("Q200", this.getSoundexEncoder().encode("quick"));
126 Assert.assertEquals("B650", this.getSoundexEncoder().encode("brown"));
127 Assert.assertEquals("F200", this.getSoundexEncoder().encode("fox"));
128 Assert.assertEquals("J513", this.getSoundexEncoder().encode("jumped"));
129 Assert.assertEquals("O160", this.getSoundexEncoder().encode("over"));
130 Assert.assertEquals("T000", this.getSoundexEncoder().encode("the"));
131 Assert.assertEquals("L200", this.getSoundexEncoder().encode("lazy"));
132 Assert.assertEquals("D200", this.getSoundexEncoder().encode("dogs"));
133 }
134
135 /**
136 * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
137 */
138 @Test
139 public void testEncodeBatch2() {
140 Assert.assertEquals("A462", this.getSoundexEncoder().encode("Allricht"));
141 Assert.assertEquals("E166", this.getSoundexEncoder().encode("Eberhard"));
142 Assert.assertEquals("E521", this.getSoundexEncoder().encode("Engebrethson"));
143 Assert.assertEquals("H512", this.getSoundexEncoder().encode("Heimbach"));
144 Assert.assertEquals("H524", this.getSoundexEncoder().encode("Hanselmann"));
145 Assert.assertEquals("H431", this.getSoundexEncoder().encode("Hildebrand"));
146 Assert.assertEquals("K152", this.getSoundexEncoder().encode("Kavanagh"));
147 Assert.assertEquals("L530", this.getSoundexEncoder().encode("Lind"));
148 Assert.assertEquals("L222", this.getSoundexEncoder().encode("Lukaschowsky"));
149 Assert.assertEquals("M235", this.getSoundexEncoder().encode("McDonnell"));
150 Assert.assertEquals("M200", this.getSoundexEncoder().encode("McGee"));
151 Assert.assertEquals("O155", this.getSoundexEncoder().encode("Opnian"));
152 Assert.assertEquals("O155", this.getSoundexEncoder().encode("Oppenheimer"));
153 Assert.assertEquals("R355", this.getSoundexEncoder().encode("Riedemanas"));
154 Assert.assertEquals("Z300", this.getSoundexEncoder().encode("Zita"));
155 Assert.assertEquals("Z325", this.getSoundexEncoder().encode("Zitzmeinn"));
156 }
157
158 /**
159 * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
160 */
161 @Test
162 public void testEncodeBatch3() {
163 Assert.assertEquals("W252", this.getSoundexEncoder().encode("Washington"));
164 Assert.assertEquals("L000", this.getSoundexEncoder().encode("Lee"));
165 Assert.assertEquals("G362", this.getSoundexEncoder().encode("Gutierrez"));
166 Assert.assertEquals("P236", this.getSoundexEncoder().encode("Pfister"));
167 Assert.assertEquals("J250", this.getSoundexEncoder().encode("Jackson"));
168 Assert.assertEquals("T522", this.getSoundexEncoder().encode("Tymczak"));
169 // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
170 // possible.
171 Assert.assertEquals("V532", this.getSoundexEncoder().encode("VanDeusen"));
172 }
173
174 /**
175 * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
176 */
177 @Test
178 public void testEncodeBatch4() {
179 Assert.assertEquals("H452", this.getSoundexEncoder().encode("HOLMES"));
180 Assert.assertEquals("A355", this.getSoundexEncoder().encode("ADOMOMI"));
181 Assert.assertEquals("V536", this.getSoundexEncoder().encode("VONDERLEHR"));
182 Assert.assertEquals("B400", this.getSoundexEncoder().encode("BALL"));
183 Assert.assertEquals("S000", this.getSoundexEncoder().encode("SHAW"));
184 Assert.assertEquals("J250", this.getSoundexEncoder().encode("JACKSON"));
185 Assert.assertEquals("S545", this.getSoundexEncoder().encode("SCANLON"));
186 Assert.assertEquals("S532", this.getSoundexEncoder().encode("SAINTJOHN"));
187
188 }
189
190 @Test
191 public void testEncodeIgnoreApostrophes() throws EncoderException {
192 this.checkEncodingVariations("O165", new String[]{
193 "OBrien",
194 "'OBrien",
195 "O'Brien",
196 "OB'rien",
197 "OBr'ien",
198 "OBri'en",
199 "OBrie'n",
200 "OBrien'"});
201 }
202
203 /**
204 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
205 *
206 * @throws EncoderException
207 */
208 @Test
209 public void testEncodeIgnoreHyphens() throws EncoderException {
210 this.checkEncodingVariations("K525", new String[]{
211 "KINGSMITH",
212 "-KINGSMITH",
213 "K-INGSMITH",
214 "KI-NGSMITH",
215 "KIN-GSMITH",
216 "KING-SMITH",
217 "KINGS-MITH",
218 "KINGSM-ITH",
219 "KINGSMI-TH",
220 "KINGSMIT-H",
221 "KINGSMITH-"});
222 }
223
224 @Test
225 public void testEncodeIgnoreTrimmable() {
226 Assert.assertEquals("W252", this.getSoundexEncoder().encode(" \t\n\r Washington \t\n\r "));
227 }
228
229 /**
230 * Consonants from the same code group separated by W or H are treated as one.
231 */
232 @Test
233 public void testHWRuleEx1() {
234 // From
235 // http://www.archives.gov/research_room/genealogy/census/soundex.html:
236 // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
237 // for the F). It is not coded A-226.
238 Assert.assertEquals("A261", this.getSoundexEncoder().encode("Ashcraft"));
239 }
240
241 /**
242 * Consonants from the same code group separated by W or H are treated as one.
243 *
244 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
245 */
246 @Test
247 public void testHWRuleEx2() {
248 Assert.assertEquals("B312", this.getSoundexEncoder().encode("BOOTHDAVIS"));
249 Assert.assertEquals("B312", this.getSoundexEncoder().encode("BOOTH-DAVIS"));
250 }
251
252 /**
253 * Consonants from the same code group separated by W or H are treated as one.
254 *
255 * @throws EncoderException
256 */
257 @Test
258 public void testHWRuleEx3() throws EncoderException {
259 Assert.assertEquals("S460", this.getSoundexEncoder().encode("Sgler"));
260 Assert.assertEquals("S460", this.getSoundexEncoder().encode("Swhgler"));
261 // Also S460:
262 this.checkEncodingVariations("S460", new String[]{
263 "SAILOR",
264 "SALYER",
265 "SAYLOR",
266 "SCHALLER",
267 "SCHELLER",
268 "SCHILLER",
269 "SCHOOLER",
270 "SCHULER",
271 "SCHUYLER",
272 "SEILER",
273 "SEYLER",
274 "SHOLAR",
275 "SHULER",
276 "SILAR",
277 "SILER",
278 "SILLER"});
279 }
280
281 /**
282 * Examples for MS SQLServer from
283 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
284 */
285 @Test
286 public void testMsSqlServer1() {
287 Assert.assertEquals("S530", this.getSoundexEncoder().encode("Smith"));
288 Assert.assertEquals("S530", this.getSoundexEncoder().encode("Smythe"));
289 }
290
291 /**
292 * Examples for MS SQLServer from
293 * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support
294 * /kb/articles/Q100/3/65.asp&NoWebContent=1
295 *
296 * @throws EncoderException
297 */
298 @Test
299 public void testMsSqlServer2() throws EncoderException {
300 this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"});
301 }
302
303 /**
304 * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm
305 */
306 @Test
307 public void testMsSqlServer3() {
308 Assert.assertEquals("A500", this.getSoundexEncoder().encode("Ann"));
309 Assert.assertEquals("A536", this.getSoundexEncoder().encode("Andrew"));
310 Assert.assertEquals("J530", this.getSoundexEncoder().encode("Janet"));
311 Assert.assertEquals("M626", this.getSoundexEncoder().encode("Margaret"));
312 Assert.assertEquals("S315", this.getSoundexEncoder().encode("Steven"));
313 Assert.assertEquals("M240", this.getSoundexEncoder().encode("Michael"));
314 Assert.assertEquals("R163", this.getSoundexEncoder().encode("Robert"));
315 Assert.assertEquals("L600", this.getSoundexEncoder().encode("Laura"));
316 Assert.assertEquals("A500", this.getSoundexEncoder().encode("Anne"));
317 }
318
319 /**
320 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
321 */
322 @Test
323 public void testNewInstance() {
324 Assert.assertEquals("W452", new Soundex().soundex("Williams"));
325 }
326
327 @Test
328 public void testNewInstance2() {
329 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
330 }
331
332 @Test
333 public void testNewInstance3() {
334 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
335 }
336
337 @Test
338 public void testSoundexUtilsConstructable() {
339 new SoundexUtils();
340 }
341
342 @Test
343 public void testSoundexUtilsNullBehaviour() {
344 Assert.assertEquals(null, SoundexUtils.clean(null));
345 Assert.assertEquals("", SoundexUtils.clean(""));
346 Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
347 Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null));
348 }
349
350 /**
351 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
352 */
353 @Test
354 public void testUsEnglishStatic() {
355 Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
356 }
357
358 /**
359 * Fancy characters are not mapped by the default US mapping.
360 *
361 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
362 */
363 @Test
364 public void testUsMappingEWithAcute() {
365 Assert.assertEquals("E000", this.getSoundexEncoder().encode("e"));
366 if (Character.isLetter('\u00e9')) { // e-acute
367 try {
368 // uppercase E-acute
369 Assert.assertEquals("\u00c9000", this.getSoundexEncoder().encode("\u00e9"));
370 Assert.fail("Expected IllegalArgumentException not thrown");
371 } catch (IllegalArgumentException e) {
372 // expected
373 }
374 } else {
375 Assert.assertEquals("", this.getSoundexEncoder().encode("\u00e9"));
376 }
377 }
378
379 /**
380 * Fancy characters are not mapped by the default US mapping.
381 *
382 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
383 */
384 @Test
385 public void testUsMappingOWithDiaeresis() {
386 Assert.assertEquals("O000", this.getSoundexEncoder().encode("o"));
387 if (Character.isLetter('\u00f6')) { // o-umlaut
388 try {
389 // uppercase O-umlaut
390 Assert.assertEquals("\u00d6000", this.getSoundexEncoder().encode("\u00f6"));
391 Assert.fail("Expected IllegalArgumentException not thrown");
392 } catch (IllegalArgumentException e) {
393 // expected
394 }
395 } else {
396 Assert.assertEquals("", this.getSoundexEncoder().encode("\u00f6"));
397 }
398 }
399 }