001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 // (FYI: Formatted and sorted with Eclipse) 019 020 package org.apache.commons.codec.language; 021 022 import junit.framework.Assert; 023 024 import org.apache.commons.codec.EncoderException; 025 import org.apache.commons.codec.StringEncoder; 026 import org.apache.commons.codec.StringEncoderAbstractTest; 027 import org.junit.Test; 028 029 /** 030 * Tests {@link Soundex}. 031 * 032 * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p> 033 * 034 * @version $Id: SoundexTest.html 889935 2013-12-11 05:05:13Z ggregory $ 035 */ 036 public class SoundexTest extends StringEncoderAbstractTest { 037 038 @Override 039 protected StringEncoder createStringEncoder() { 040 return new Soundex(); 041 } 042 043 /** 044 * @return Returns the encoder. 045 */ 046 public Soundex getSoundexEncoder() { 047 return (Soundex)this.getStringEncoder(); 048 } 049 050 @Test 051 public void testB650() throws EncoderException { 052 this.checkEncodingVariations("B650", new String[]{ 053 "BARHAM", 054 "BARONE", 055 "BARRON", 056 "BERNA", 057 "BIRNEY", 058 "BIRNIE", 059 "BOOROM", 060 "BOREN", 061 "BORN", 062 "BOURN", 063 "BOURNE", 064 "BOWRON", 065 "BRAIN", 066 "BRAME", 067 "BRANN", 068 "BRAUN", 069 "BREEN", 070 "BRIEN", 071 "BRIM", 072 "BRIMM", 073 "BRINN", 074 "BRION", 075 "BROOM", 076 "BROOME", 077 "BROWN", 078 "BROWNE", 079 "BRUEN", 080 "BRUHN", 081 "BRUIN", 082 "BRUMM", 083 "BRUN", 084 "BRUNO", 085 "BRYAN", 086 "BURIAN", 087 "BURN", 088 "BURNEY", 089 "BYRAM", 090 "BYRNE", 091 "BYRON", 092 "BYRUM"}); 093 } 094 095 @Test 096 public void testBadCharacters() { 097 Assert.assertEquals("H452", this.getSoundexEncoder().encode("HOL>MES")); 098 099 } 100 101 @Test 102 public void testDifference() throws EncoderException { 103 // Edge cases 104 Assert.assertEquals(0, this.getSoundexEncoder().difference(null, null)); 105 Assert.assertEquals(0, this.getSoundexEncoder().difference("", "")); 106 Assert.assertEquals(0, this.getSoundexEncoder().difference(" ", " ")); 107 // Normal cases 108 Assert.assertEquals(4, this.getSoundexEncoder().difference("Smith", "Smythe")); 109 Assert.assertEquals(2, this.getSoundexEncoder().difference("Ann", "Andrew")); 110 Assert.assertEquals(1, this.getSoundexEncoder().difference("Margaret", "Andrew")); 111 Assert.assertEquals(0, this.getSoundexEncoder().difference("Janet", "Margaret")); 112 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp 113 Assert.assertEquals(4, this.getSoundexEncoder().difference("Green", "Greene")); 114 Assert.assertEquals(0, this.getSoundexEncoder().difference("Blotchet-Halls", "Greene")); 115 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp 116 Assert.assertEquals(4, this.getSoundexEncoder().difference("Smith", "Smythe")); 117 Assert.assertEquals(4, this.getSoundexEncoder().difference("Smithers", "Smythers")); 118 Assert.assertEquals(2, this.getSoundexEncoder().difference("Anothers", "Brothers")); 119 } 120 121 @Test 122 public void testEncodeBasic() { 123 Assert.assertEquals("T235", this.getSoundexEncoder().encode("testing")); 124 Assert.assertEquals("T000", this.getSoundexEncoder().encode("The")); 125 Assert.assertEquals("Q200", this.getSoundexEncoder().encode("quick")); 126 Assert.assertEquals("B650", this.getSoundexEncoder().encode("brown")); 127 Assert.assertEquals("F200", this.getSoundexEncoder().encode("fox")); 128 Assert.assertEquals("J513", this.getSoundexEncoder().encode("jumped")); 129 Assert.assertEquals("O160", this.getSoundexEncoder().encode("over")); 130 Assert.assertEquals("T000", this.getSoundexEncoder().encode("the")); 131 Assert.assertEquals("L200", this.getSoundexEncoder().encode("lazy")); 132 Assert.assertEquals("D200", this.getSoundexEncoder().encode("dogs")); 133 } 134 135 /** 136 * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html 137 */ 138 @Test 139 public void testEncodeBatch2() { 140 Assert.assertEquals("A462", this.getSoundexEncoder().encode("Allricht")); 141 Assert.assertEquals("E166", this.getSoundexEncoder().encode("Eberhard")); 142 Assert.assertEquals("E521", this.getSoundexEncoder().encode("Engebrethson")); 143 Assert.assertEquals("H512", this.getSoundexEncoder().encode("Heimbach")); 144 Assert.assertEquals("H524", this.getSoundexEncoder().encode("Hanselmann")); 145 Assert.assertEquals("H431", this.getSoundexEncoder().encode("Hildebrand")); 146 Assert.assertEquals("K152", this.getSoundexEncoder().encode("Kavanagh")); 147 Assert.assertEquals("L530", this.getSoundexEncoder().encode("Lind")); 148 Assert.assertEquals("L222", this.getSoundexEncoder().encode("Lukaschowsky")); 149 Assert.assertEquals("M235", this.getSoundexEncoder().encode("McDonnell")); 150 Assert.assertEquals("M200", this.getSoundexEncoder().encode("McGee")); 151 Assert.assertEquals("O155", this.getSoundexEncoder().encode("Opnian")); 152 Assert.assertEquals("O155", this.getSoundexEncoder().encode("Oppenheimer")); 153 Assert.assertEquals("R355", this.getSoundexEncoder().encode("Riedemanas")); 154 Assert.assertEquals("Z300", this.getSoundexEncoder().encode("Zita")); 155 Assert.assertEquals("Z325", this.getSoundexEncoder().encode("Zitzmeinn")); 156 } 157 158 /** 159 * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html 160 */ 161 @Test 162 public void testEncodeBatch3() { 163 Assert.assertEquals("W252", this.getSoundexEncoder().encode("Washington")); 164 Assert.assertEquals("L000", this.getSoundexEncoder().encode("Lee")); 165 Assert.assertEquals("G362", this.getSoundexEncoder().encode("Gutierrez")); 166 Assert.assertEquals("P236", this.getSoundexEncoder().encode("Pfister")); 167 Assert.assertEquals("J250", this.getSoundexEncoder().encode("Jackson")); 168 Assert.assertEquals("T522", this.getSoundexEncoder().encode("Tymczak")); 169 // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also 170 // possible. 171 Assert.assertEquals("V532", this.getSoundexEncoder().encode("VanDeusen")); 172 } 173 174 /** 175 * Examples from: http://www.myatt.demon.co.uk/sxalg.htm 176 */ 177 @Test 178 public void testEncodeBatch4() { 179 Assert.assertEquals("H452", this.getSoundexEncoder().encode("HOLMES")); 180 Assert.assertEquals("A355", this.getSoundexEncoder().encode("ADOMOMI")); 181 Assert.assertEquals("V536", this.getSoundexEncoder().encode("VONDERLEHR")); 182 Assert.assertEquals("B400", this.getSoundexEncoder().encode("BALL")); 183 Assert.assertEquals("S000", this.getSoundexEncoder().encode("SHAW")); 184 Assert.assertEquals("J250", this.getSoundexEncoder().encode("JACKSON")); 185 Assert.assertEquals("S545", this.getSoundexEncoder().encode("SCANLON")); 186 Assert.assertEquals("S532", this.getSoundexEncoder().encode("SAINTJOHN")); 187 188 } 189 190 @Test 191 public void testEncodeIgnoreApostrophes() throws EncoderException { 192 this.checkEncodingVariations("O165", new String[]{ 193 "OBrien", 194 "'OBrien", 195 "O'Brien", 196 "OB'rien", 197 "OBr'ien", 198 "OBri'en", 199 "OBrie'n", 200 "OBrien'"}); 201 } 202 203 /** 204 * Test data from http://www.myatt.demon.co.uk/sxalg.htm 205 * 206 * @throws EncoderException 207 */ 208 @Test 209 public void testEncodeIgnoreHyphens() throws EncoderException { 210 this.checkEncodingVariations("K525", new String[]{ 211 "KINGSMITH", 212 "-KINGSMITH", 213 "K-INGSMITH", 214 "KI-NGSMITH", 215 "KIN-GSMITH", 216 "KING-SMITH", 217 "KINGS-MITH", 218 "KINGSM-ITH", 219 "KINGSMI-TH", 220 "KINGSMIT-H", 221 "KINGSMITH-"}); 222 } 223 224 @Test 225 public void testEncodeIgnoreTrimmable() { 226 Assert.assertEquals("W252", this.getSoundexEncoder().encode(" \t\n\r Washington \t\n\r ")); 227 } 228 229 /** 230 * Consonants from the same code group separated by W or H are treated as one. 231 */ 232 @Test 233 public void testHWRuleEx1() { 234 // From 235 // http://www.archives.gov/research_room/genealogy/census/soundex.html: 236 // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 237 // for the F). It is not coded A-226. 238 Assert.assertEquals("A261", this.getSoundexEncoder().encode("Ashcraft")); 239 } 240 241 /** 242 * Consonants from the same code group separated by W or H are treated as one. 243 * 244 * Test data from http://www.myatt.demon.co.uk/sxalg.htm 245 */ 246 @Test 247 public void testHWRuleEx2() { 248 Assert.assertEquals("B312", this.getSoundexEncoder().encode("BOOTHDAVIS")); 249 Assert.assertEquals("B312", this.getSoundexEncoder().encode("BOOTH-DAVIS")); 250 } 251 252 /** 253 * Consonants from the same code group separated by W or H are treated as one. 254 * 255 * @throws EncoderException 256 */ 257 @Test 258 public void testHWRuleEx3() throws EncoderException { 259 Assert.assertEquals("S460", this.getSoundexEncoder().encode("Sgler")); 260 Assert.assertEquals("S460", this.getSoundexEncoder().encode("Swhgler")); 261 // Also S460: 262 this.checkEncodingVariations("S460", new String[]{ 263 "SAILOR", 264 "SALYER", 265 "SAYLOR", 266 "SCHALLER", 267 "SCHELLER", 268 "SCHILLER", 269 "SCHOOLER", 270 "SCHULER", 271 "SCHUYLER", 272 "SEILER", 273 "SEYLER", 274 "SHOLAR", 275 "SHULER", 276 "SILAR", 277 "SILER", 278 "SILLER"}); 279 } 280 281 /** 282 * Examples for MS SQLServer from 283 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp 284 */ 285 @Test 286 public void testMsSqlServer1() { 287 Assert.assertEquals("S530", this.getSoundexEncoder().encode("Smith")); 288 Assert.assertEquals("S530", this.getSoundexEncoder().encode("Smythe")); 289 } 290 291 /** 292 * Examples for MS SQLServer from 293 * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support 294 * /kb/articles/Q100/3/65.asp&NoWebContent=1 295 * 296 * @throws EncoderException 297 */ 298 @Test 299 public void testMsSqlServer2() throws EncoderException { 300 this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"}); 301 } 302 303 /** 304 * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm 305 */ 306 @Test 307 public void testMsSqlServer3() { 308 Assert.assertEquals("A500", this.getSoundexEncoder().encode("Ann")); 309 Assert.assertEquals("A536", this.getSoundexEncoder().encode("Andrew")); 310 Assert.assertEquals("J530", this.getSoundexEncoder().encode("Janet")); 311 Assert.assertEquals("M626", this.getSoundexEncoder().encode("Margaret")); 312 Assert.assertEquals("S315", this.getSoundexEncoder().encode("Steven")); 313 Assert.assertEquals("M240", this.getSoundexEncoder().encode("Michael")); 314 Assert.assertEquals("R163", this.getSoundexEncoder().encode("Robert")); 315 Assert.assertEquals("L600", this.getSoundexEncoder().encode("Laura")); 316 Assert.assertEquals("A500", this.getSoundexEncoder().encode("Anne")); 317 } 318 319 /** 320 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 321 */ 322 @Test 323 public void testNewInstance() { 324 Assert.assertEquals("W452", new Soundex().soundex("Williams")); 325 } 326 327 @Test 328 public void testNewInstance2() { 329 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams")); 330 } 331 332 @Test 333 public void testNewInstance3() { 334 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams")); 335 } 336 337 @Test 338 public void testSoundexUtilsConstructable() { 339 new SoundexUtils(); 340 } 341 342 @Test 343 public void testSoundexUtilsNullBehaviour() { 344 Assert.assertEquals(null, SoundexUtils.clean(null)); 345 Assert.assertEquals("", SoundexUtils.clean("")); 346 Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, "")); 347 Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null)); 348 } 349 350 /** 351 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 352 */ 353 @Test 354 public void testUsEnglishStatic() { 355 Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams")); 356 } 357 358 /** 359 * Fancy characters are not mapped by the default US mapping. 360 * 361 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 362 */ 363 @Test 364 public void testUsMappingEWithAcute() { 365 Assert.assertEquals("E000", this.getSoundexEncoder().encode("e")); 366 if (Character.isLetter('\u00e9')) { // e-acute 367 try { 368 // uppercase E-acute 369 Assert.assertEquals("\u00c9000", this.getSoundexEncoder().encode("\u00e9")); 370 Assert.fail("Expected IllegalArgumentException not thrown"); 371 } catch (IllegalArgumentException e) { 372 // expected 373 } 374 } else { 375 Assert.assertEquals("", this.getSoundexEncoder().encode("\u00e9")); 376 } 377 } 378 379 /** 380 * Fancy characters are not mapped by the default US mapping. 381 * 382 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 383 */ 384 @Test 385 public void testUsMappingOWithDiaeresis() { 386 Assert.assertEquals("O000", this.getSoundexEncoder().encode("o")); 387 if (Character.isLetter('\u00f6')) { // o-umlaut 388 try { 389 // uppercase O-umlaut 390 Assert.assertEquals("\u00d6000", this.getSoundexEncoder().encode("\u00f6")); 391 Assert.fail("Expected IllegalArgumentException not thrown"); 392 } catch (IllegalArgumentException e) { 393 // expected 394 } 395 } else { 396 Assert.assertEquals("", this.getSoundexEncoder().encode("\u00f6")); 397 } 398 } 399 }