001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 // (FYI: Formatted and sorted with Eclipse) 019 020 package org.apache.commons.codec.language; 021 022 import org.junit.Assert; 023 024 import org.apache.commons.codec.EncoderException; 025 import org.apache.commons.codec.StringEncoderAbstractTest; 026 import org.junit.Test; 027 028 /** 029 * Tests {@link Soundex}. 030 * 031 * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p> 032 * 033 * @version $Id: SoundexTest.html 889935 2013-12-11 05:05:13Z ggregory $ 034 */ 035 public class SoundexTest extends StringEncoderAbstractTest<Soundex> { 036 037 @Override 038 protected Soundex createStringEncoder() { 039 return new Soundex(); 040 } 041 042 @Test 043 public void testB650() throws EncoderException { 044 this.checkEncodingVariations("B650", new String[]{ 045 "BARHAM", 046 "BARONE", 047 "BARRON", 048 "BERNA", 049 "BIRNEY", 050 "BIRNIE", 051 "BOOROM", 052 "BOREN", 053 "BORN", 054 "BOURN", 055 "BOURNE", 056 "BOWRON", 057 "BRAIN", 058 "BRAME", 059 "BRANN", 060 "BRAUN", 061 "BREEN", 062 "BRIEN", 063 "BRIM", 064 "BRIMM", 065 "BRINN", 066 "BRION", 067 "BROOM", 068 "BROOME", 069 "BROWN", 070 "BROWNE", 071 "BRUEN", 072 "BRUHN", 073 "BRUIN", 074 "BRUMM", 075 "BRUN", 076 "BRUNO", 077 "BRYAN", 078 "BURIAN", 079 "BURN", 080 "BURNEY", 081 "BYRAM", 082 "BYRNE", 083 "BYRON", 084 "BYRUM"}); 085 } 086 087 @Test 088 public void testBadCharacters() { 089 Assert.assertEquals("H452", this.getStringEncoder().encode("HOL>MES")); 090 091 } 092 093 @Test 094 public void testDifference() throws EncoderException { 095 // Edge cases 096 Assert.assertEquals(0, this.getStringEncoder().difference(null, null)); 097 Assert.assertEquals(0, this.getStringEncoder().difference("", "")); 098 Assert.assertEquals(0, this.getStringEncoder().difference(" ", " ")); 099 // Normal cases 100 Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe")); 101 Assert.assertEquals(2, this.getStringEncoder().difference("Ann", "Andrew")); 102 Assert.assertEquals(1, this.getStringEncoder().difference("Margaret", "Andrew")); 103 Assert.assertEquals(0, this.getStringEncoder().difference("Janet", "Margaret")); 104 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp 105 Assert.assertEquals(4, this.getStringEncoder().difference("Green", "Greene")); 106 Assert.assertEquals(0, this.getStringEncoder().difference("Blotchet-Halls", "Greene")); 107 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp 108 Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe")); 109 Assert.assertEquals(4, this.getStringEncoder().difference("Smithers", "Smythers")); 110 Assert.assertEquals(2, this.getStringEncoder().difference("Anothers", "Brothers")); 111 } 112 113 @Test 114 public void testEncodeBasic() { 115 Assert.assertEquals("T235", this.getStringEncoder().encode("testing")); 116 Assert.assertEquals("T000", this.getStringEncoder().encode("The")); 117 Assert.assertEquals("Q200", this.getStringEncoder().encode("quick")); 118 Assert.assertEquals("B650", this.getStringEncoder().encode("brown")); 119 Assert.assertEquals("F200", this.getStringEncoder().encode("fox")); 120 Assert.assertEquals("J513", this.getStringEncoder().encode("jumped")); 121 Assert.assertEquals("O160", this.getStringEncoder().encode("over")); 122 Assert.assertEquals("T000", this.getStringEncoder().encode("the")); 123 Assert.assertEquals("L200", this.getStringEncoder().encode("lazy")); 124 Assert.assertEquals("D200", this.getStringEncoder().encode("dogs")); 125 } 126 127 /** 128 * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html 129 */ 130 @Test 131 public void testEncodeBatch2() { 132 Assert.assertEquals("A462", this.getStringEncoder().encode("Allricht")); 133 Assert.assertEquals("E166", this.getStringEncoder().encode("Eberhard")); 134 Assert.assertEquals("E521", this.getStringEncoder().encode("Engebrethson")); 135 Assert.assertEquals("H512", this.getStringEncoder().encode("Heimbach")); 136 Assert.assertEquals("H524", this.getStringEncoder().encode("Hanselmann")); 137 Assert.assertEquals("H431", this.getStringEncoder().encode("Hildebrand")); 138 Assert.assertEquals("K152", this.getStringEncoder().encode("Kavanagh")); 139 Assert.assertEquals("L530", this.getStringEncoder().encode("Lind")); 140 Assert.assertEquals("L222", this.getStringEncoder().encode("Lukaschowsky")); 141 Assert.assertEquals("M235", this.getStringEncoder().encode("McDonnell")); 142 Assert.assertEquals("M200", this.getStringEncoder().encode("McGee")); 143 Assert.assertEquals("O155", this.getStringEncoder().encode("Opnian")); 144 Assert.assertEquals("O155", this.getStringEncoder().encode("Oppenheimer")); 145 Assert.assertEquals("R355", this.getStringEncoder().encode("Riedemanas")); 146 Assert.assertEquals("Z300", this.getStringEncoder().encode("Zita")); 147 Assert.assertEquals("Z325", this.getStringEncoder().encode("Zitzmeinn")); 148 } 149 150 /** 151 * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html 152 */ 153 @Test 154 public void testEncodeBatch3() { 155 Assert.assertEquals("W252", this.getStringEncoder().encode("Washington")); 156 Assert.assertEquals("L000", this.getStringEncoder().encode("Lee")); 157 Assert.assertEquals("G362", this.getStringEncoder().encode("Gutierrez")); 158 Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister")); 159 Assert.assertEquals("J250", this.getStringEncoder().encode("Jackson")); 160 Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak")); 161 // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also 162 // possible. 163 Assert.assertEquals("V532", this.getStringEncoder().encode("VanDeusen")); 164 } 165 166 /** 167 * Examples from: http://www.myatt.demon.co.uk/sxalg.htm 168 */ 169 @Test 170 public void testEncodeBatch4() { 171 Assert.assertEquals("H452", this.getStringEncoder().encode("HOLMES")); 172 Assert.assertEquals("A355", this.getStringEncoder().encode("ADOMOMI")); 173 Assert.assertEquals("V536", this.getStringEncoder().encode("VONDERLEHR")); 174 Assert.assertEquals("B400", this.getStringEncoder().encode("BALL")); 175 Assert.assertEquals("S000", this.getStringEncoder().encode("SHAW")); 176 Assert.assertEquals("J250", this.getStringEncoder().encode("JACKSON")); 177 Assert.assertEquals("S545", this.getStringEncoder().encode("SCANLON")); 178 Assert.assertEquals("S532", this.getStringEncoder().encode("SAINTJOHN")); 179 180 } 181 182 @Test 183 public void testEncodeIgnoreApostrophes() throws EncoderException { 184 this.checkEncodingVariations("O165", new String[]{ 185 "OBrien", 186 "'OBrien", 187 "O'Brien", 188 "OB'rien", 189 "OBr'ien", 190 "OBri'en", 191 "OBrie'n", 192 "OBrien'"}); 193 } 194 195 /** 196 * Test data from http://www.myatt.demon.co.uk/sxalg.htm 197 * 198 * @throws EncoderException 199 */ 200 @Test 201 public void testEncodeIgnoreHyphens() throws EncoderException { 202 this.checkEncodingVariations("K525", new String[]{ 203 "KINGSMITH", 204 "-KINGSMITH", 205 "K-INGSMITH", 206 "KI-NGSMITH", 207 "KIN-GSMITH", 208 "KING-SMITH", 209 "KINGS-MITH", 210 "KINGSM-ITH", 211 "KINGSMI-TH", 212 "KINGSMIT-H", 213 "KINGSMITH-"}); 214 } 215 216 @Test 217 public void testEncodeIgnoreTrimmable() { 218 Assert.assertEquals("W252", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r ")); 219 } 220 221 /** 222 * Consonants from the same code group separated by W or H are treated as one. 223 */ 224 @Test 225 public void testHWRuleEx1() { 226 // From 227 // http://www.archives.gov/research_room/genealogy/census/soundex.html: 228 // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 229 // for the F). It is not coded A-226. 230 Assert.assertEquals("A261", this.getStringEncoder().encode("Ashcraft")); 231 } 232 233 /** 234 * Consonants from the same code group separated by W or H are treated as one. 235 * 236 * Test data from http://www.myatt.demon.co.uk/sxalg.htm 237 */ 238 @Test 239 public void testHWRuleEx2() { 240 Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTHDAVIS")); 241 Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTH-DAVIS")); 242 } 243 244 /** 245 * Consonants from the same code group separated by W or H are treated as one. 246 * 247 * @throws EncoderException 248 */ 249 @Test 250 public void testHWRuleEx3() throws EncoderException { 251 Assert.assertEquals("S460", this.getStringEncoder().encode("Sgler")); 252 Assert.assertEquals("S460", this.getStringEncoder().encode("Swhgler")); 253 // Also S460: 254 this.checkEncodingVariations("S460", new String[]{ 255 "SAILOR", 256 "SALYER", 257 "SAYLOR", 258 "SCHALLER", 259 "SCHELLER", 260 "SCHILLER", 261 "SCHOOLER", 262 "SCHULER", 263 "SCHUYLER", 264 "SEILER", 265 "SEYLER", 266 "SHOLAR", 267 "SHULER", 268 "SILAR", 269 "SILER", 270 "SILLER"}); 271 } 272 273 /** 274 * Examples for MS SQLServer from 275 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp 276 */ 277 @Test 278 public void testMsSqlServer1() { 279 Assert.assertEquals("S530", this.getStringEncoder().encode("Smith")); 280 Assert.assertEquals("S530", this.getStringEncoder().encode("Smythe")); 281 } 282 283 /** 284 * Examples for MS SQLServer from 285 * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support 286 * /kb/articles/Q100/3/65.asp&NoWebContent=1 287 * 288 * @throws EncoderException 289 */ 290 @Test 291 public void testMsSqlServer2() throws EncoderException { 292 this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"}); 293 } 294 295 /** 296 * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm 297 */ 298 @Test 299 public void testMsSqlServer3() { 300 Assert.assertEquals("A500", this.getStringEncoder().encode("Ann")); 301 Assert.assertEquals("A536", this.getStringEncoder().encode("Andrew")); 302 Assert.assertEquals("J530", this.getStringEncoder().encode("Janet")); 303 Assert.assertEquals("M626", this.getStringEncoder().encode("Margaret")); 304 Assert.assertEquals("S315", this.getStringEncoder().encode("Steven")); 305 Assert.assertEquals("M240", this.getStringEncoder().encode("Michael")); 306 Assert.assertEquals("R163", this.getStringEncoder().encode("Robert")); 307 Assert.assertEquals("L600", this.getStringEncoder().encode("Laura")); 308 Assert.assertEquals("A500", this.getStringEncoder().encode("Anne")); 309 } 310 311 /** 312 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 313 */ 314 @Test 315 public void testNewInstance() { 316 Assert.assertEquals("W452", new Soundex().soundex("Williams")); 317 } 318 319 @Test 320 public void testNewInstance2() { 321 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams")); 322 } 323 324 @Test 325 public void testNewInstance3() { 326 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams")); 327 } 328 329 @Test 330 public void testSoundexUtilsConstructable() { 331 new SoundexUtils(); 332 } 333 334 @Test 335 public void testSoundexUtilsNullBehaviour() { 336 Assert.assertEquals(null, SoundexUtils.clean(null)); 337 Assert.assertEquals("", SoundexUtils.clean("")); 338 Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, "")); 339 Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null)); 340 } 341 342 /** 343 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 344 */ 345 @Test 346 public void testUsEnglishStatic() { 347 Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams")); 348 } 349 350 /** 351 * Fancy characters are not mapped by the default US mapping. 352 * 353 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 354 */ 355 @Test 356 public void testUsMappingEWithAcute() { 357 Assert.assertEquals("E000", this.getStringEncoder().encode("e")); 358 if (Character.isLetter('\u00e9')) { // e-acute 359 try { 360 // uppercase E-acute 361 Assert.assertEquals("\u00c9000", this.getStringEncoder().encode("\u00e9")); 362 Assert.fail("Expected IllegalArgumentException not thrown"); 363 } catch (final IllegalArgumentException e) { 364 // expected 365 } 366 } else { 367 Assert.assertEquals("", this.getStringEncoder().encode("\u00e9")); 368 } 369 } 370 371 /** 372 * Fancy characters are not mapped by the default US mapping. 373 * 374 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 375 */ 376 @Test 377 public void testUsMappingOWithDiaeresis() { 378 Assert.assertEquals("O000", this.getStringEncoder().encode("o")); 379 if (Character.isLetter('\u00f6')) { // o-umlaut 380 try { 381 // uppercase O-umlaut 382 Assert.assertEquals("\u00d6000", this.getStringEncoder().encode("\u00f6")); 383 Assert.fail("Expected IllegalArgumentException not thrown"); 384 } catch (final IllegalArgumentException e) { 385 // expected 386 } 387 } else { 388 Assert.assertEquals("", this.getStringEncoder().encode("\u00f6")); 389 } 390 } 391 }