View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  // (FYI: Formatted and sorted with Eclipse)
19  
20  package org.apache.commons.codec.language;
21  
22  import static org.junit.jupiter.api.Assertions.assertEquals;
23  import static org.junit.jupiter.api.Assertions.assertNull;
24  import static org.junit.jupiter.api.Assertions.assertThrows;
25  
26  import org.apache.commons.codec.AbstractStringEncoderTest;
27  import org.apache.commons.codec.EncoderException;
28  import org.junit.jupiter.api.Test;
29  
30  /**
31   * Tests {@link Soundex}.
32   *
33   * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
34   */
35  public class SoundexTest extends AbstractStringEncoderTest<Soundex> {
36  
37      @Override
38      protected Soundex createStringEncoder() {
39          return new Soundex();
40      }
41  
42      @Test
43      public void testB650() throws EncoderException {
44          this.checkEncodingVariations("B650", new String[]{
45              "BARHAM",
46              "BARONE",
47              "BARRON",
48              "BERNA",
49              "BIRNEY",
50              "BIRNIE",
51              "BOOROM",
52              "BOREN",
53              "BORN",
54              "BOURN",
55              "BOURNE",
56              "BOWRON",
57              "BRAIN",
58              "BRAME",
59              "BRANN",
60              "BRAUN",
61              "BREEN",
62              "BRIEN",
63              "BRIM",
64              "BRIMM",
65              "BRINN",
66              "BRION",
67              "BROOM",
68              "BROOME",
69              "BROWN",
70              "BROWNE",
71              "BRUEN",
72              "BRUHN",
73              "BRUIN",
74              "BRUMM",
75              "BRUN",
76              "BRUNO",
77              "BRYAN",
78              "BURIAN",
79              "BURN",
80              "BURNEY",
81              "BYRAM",
82              "BYRNE",
83              "BYRON",
84              "BYRUM"});
85      }
86  
87      @Test
88      public void testBadCharacters() {
89          assertEquals("H452", this.getStringEncoder().encode("HOL>MES"));
90  
91      }
92  
93      @Test
94      public void testDifference() throws EncoderException {
95          // Edge cases
96          assertEquals(0, this.getStringEncoder().difference(null, null));
97          assertEquals(0, this.getStringEncoder().difference("", ""));
98          assertEquals(0, this.getStringEncoder().difference(" ", " "));
99          // Normal cases
100         assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
101         assertEquals(2, this.getStringEncoder().difference("Ann", "Andrew"));
102         assertEquals(1, this.getStringEncoder().difference("Margaret", "Andrew"));
103         assertEquals(0, this.getStringEncoder().difference("Janet", "Margaret"));
104         // Examples from https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
105         assertEquals(4, this.getStringEncoder().difference("Green", "Greene"));
106         assertEquals(0, this.getStringEncoder().difference("Blotchet-Halls", "Greene"));
107         // Examples from https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
108         assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
109         assertEquals(4, this.getStringEncoder().difference("Smithers", "Smythers"));
110         assertEquals(2, this.getStringEncoder().difference("Anothers", "Brothers"));
111     }
112 
113     @Test
114     public void testEncodeBasic() {
115         assertEquals("T235", this.getStringEncoder().encode("testing"));
116         assertEquals("T000", this.getStringEncoder().encode("The"));
117         assertEquals("Q200", this.getStringEncoder().encode("quick"));
118         assertEquals("B650", this.getStringEncoder().encode("brown"));
119         assertEquals("F200", this.getStringEncoder().encode("fox"));
120         assertEquals("J513", this.getStringEncoder().encode("jumped"));
121         assertEquals("O160", this.getStringEncoder().encode("over"));
122         assertEquals("T000", this.getStringEncoder().encode("the"));
123         assertEquals("L200", this.getStringEncoder().encode("lazy"));
124         assertEquals("D200", this.getStringEncoder().encode("dogs"));
125     }
126 
127     /**
128      * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
129      */
130     @Test
131     public void testEncodeBatch2() {
132         assertEquals("A462", this.getStringEncoder().encode("Allricht"));
133         assertEquals("E166", this.getStringEncoder().encode("Eberhard"));
134         assertEquals("E521", this.getStringEncoder().encode("Engebrethson"));
135         assertEquals("H512", this.getStringEncoder().encode("Heimbach"));
136         assertEquals("H524", this.getStringEncoder().encode("Hanselmann"));
137         assertEquals("H431", this.getStringEncoder().encode("Hildebrand"));
138         assertEquals("K152", this.getStringEncoder().encode("Kavanagh"));
139         assertEquals("L530", this.getStringEncoder().encode("Lind"));
140         assertEquals("L222", this.getStringEncoder().encode("Lukaschowsky"));
141         assertEquals("M235", this.getStringEncoder().encode("McDonnell"));
142         assertEquals("M200", this.getStringEncoder().encode("McGee"));
143         assertEquals("O155", this.getStringEncoder().encode("Opnian"));
144         assertEquals("O155", this.getStringEncoder().encode("Oppenheimer"));
145         assertEquals("R355", this.getStringEncoder().encode("Riedemanas"));
146         assertEquals("Z300", this.getStringEncoder().encode("Zita"));
147         assertEquals("Z325", this.getStringEncoder().encode("Zitzmeinn"));
148     }
149 
150     /**
151      * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
152      */
153     @Test
154     public void testEncodeBatch3() {
155         assertEquals("W252", this.getStringEncoder().encode("Washington"));
156         assertEquals("L000", this.getStringEncoder().encode("Lee"));
157         assertEquals("G362", this.getStringEncoder().encode("Gutierrez"));
158         assertEquals("P236", this.getStringEncoder().encode("Pfister"));
159         assertEquals("J250", this.getStringEncoder().encode("Jackson"));
160         assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
161         // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
162         // possible.
163         assertEquals("V532", this.getStringEncoder().encode("VanDeusen"));
164     }
165 
166     /**
167      * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
168      */
169     @Test
170     public void testEncodeBatch4() {
171         assertEquals("H452", this.getStringEncoder().encode("HOLMES"));
172         assertEquals("A355", this.getStringEncoder().encode("ADOMOMI"));
173         assertEquals("V536", this.getStringEncoder().encode("VONDERLEHR"));
174         assertEquals("B400", this.getStringEncoder().encode("BALL"));
175         assertEquals("S000", this.getStringEncoder().encode("SHAW"));
176         assertEquals("J250", this.getStringEncoder().encode("JACKSON"));
177         assertEquals("S545", this.getStringEncoder().encode("SCANLON"));
178         assertEquals("S532", this.getStringEncoder().encode("SAINTJOHN"));
179 
180     }
181 
182     @Test
183     public void testEncodeIgnoreApostrophes() throws EncoderException {
184         this.checkEncodingVariations("O165", new String[]{
185             "OBrien",
186             "'OBrien",
187             "O'Brien",
188             "OB'rien",
189             "OBr'ien",
190             "OBri'en",
191             "OBrie'n",
192             "OBrien'"});
193     }
194 
195     /**
196      * Test data from http://www.myatt.demon.co.uk/sxalg.htm
197      *
198      * @throws EncoderException for some failure scenarios     */
199     @Test
200     public void testEncodeIgnoreHyphens() throws EncoderException {
201         this.checkEncodingVariations("K525", new String[]{
202             "KINGSMITH",
203             "-KINGSMITH",
204             "K-INGSMITH",
205             "KI-NGSMITH",
206             "KIN-GSMITH",
207             "KING-SMITH",
208             "KINGS-MITH",
209             "KINGSM-ITH",
210             "KINGSMI-TH",
211             "KINGSMIT-H",
212             "KINGSMITH-"});
213     }
214 
215     @Test
216     public void testEncodeIgnoreTrimmable() {
217         assertEquals("W252", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
218     }
219 
220     @Test
221 // examples and algorithm rules from:  http://www.genealogy.com/articles/research/00000060.html
222     public void testGenealogy() { // treat vowels and HW as silent
223         final Soundex s = Soundex.US_ENGLISH_GENEALOGY;
224         assertEquals("H251", s.encode("Heggenburger"));
225         assertEquals("B425", s.encode("Blackman"));
226         assertEquals("S530", s.encode("Schmidt"));
227         assertEquals("L150", s.encode("Lippmann"));
228         // Additional local example
229         assertEquals("D200", s.encode("Dodds")); // 'o' is not a separator here - it is silent
230         assertEquals("D200", s.encode("Dhdds")); // 'h' is silent
231         assertEquals("D200", s.encode("Dwdds")); // 'w' is silent
232     }
233 
234     /**
235      * Consonants from the same code group separated by W or H are treated as one.
236      */
237     @Test
238     public void testHWRuleEx1() {
239         // From
240         // http://www.archives.gov/research_room/genealogy/census/soundex.html:
241         // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
242         // for the F). It is not coded A-226.
243         assertEquals("A261", this.getStringEncoder().encode("Ashcraft"));
244         assertEquals("A261", this.getStringEncoder().encode("Ashcroft"));
245         assertEquals("Y330", this.getStringEncoder().encode("yehudit"));
246         assertEquals("Y330", this.getStringEncoder().encode("yhwdyt"));
247     }
248 
249     /**
250      * Consonants from the same code group separated by W or H are treated as one.
251      *
252      * Test data from http://www.myatt.demon.co.uk/sxalg.htm
253      */
254     @Test
255     public void testHWRuleEx2() {
256         assertEquals("B312", this.getStringEncoder().encode("BOOTHDAVIS"));
257         assertEquals("B312", this.getStringEncoder().encode("BOOTH-DAVIS"));
258     }
259 
260     /**
261      * Consonants from the same code group separated by W or H are treated as one.
262      *
263      * @throws EncoderException for some failure scenarios     */
264     @Test
265     public void testHWRuleEx3() throws EncoderException {
266         assertEquals("S460", this.getStringEncoder().encode("Sgler"));
267         assertEquals("S460", this.getStringEncoder().encode("Swhgler"));
268         // Also S460:
269         this.checkEncodingVariations("S460", new String[]{
270             "SAILOR",
271             "SALYER",
272             "SAYLOR",
273             "SCHALLER",
274             "SCHELLER",
275             "SCHILLER",
276             "SCHOOLER",
277             "SCHULER",
278             "SCHUYLER",
279             "SEILER",
280             "SEYLER",
281             "SHOLAR",
282             "SHULER",
283             "SILAR",
284             "SILER",
285             "SILLER"});
286     }
287 
288     /**
289      * Examples for MS SQLServer from
290      * https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
291      */
292     @Test
293     public void testMsSqlServer1() {
294         assertEquals("S530", this.getStringEncoder().encode("Smith"));
295         assertEquals("S530", this.getStringEncoder().encode("Smythe"));
296     }
297 
298     /**
299      * Examples for MS SQLServer from
300      * https://support.microsoft.com/default.aspx?scid=https://support.microsoft.com:80/support
301      * /kb/articles/Q100/3/65.asp&NoWebContent=1
302      *
303      * @throws EncoderException for some failure scenarios     */
304     @Test
305     public void testMsSqlServer2() throws EncoderException {
306         this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"});
307     }
308 
309     /**
310      * Examples for MS SQLServer from https://databases.about.com/library/weekly/aa042901a.htm
311      */
312     @Test
313     public void testMsSqlServer3() {
314         assertEquals("A500", this.getStringEncoder().encode("Ann"));
315         assertEquals("A536", this.getStringEncoder().encode("Andrew"));
316         assertEquals("J530", this.getStringEncoder().encode("Janet"));
317         assertEquals("M626", this.getStringEncoder().encode("Margaret"));
318         assertEquals("S315", this.getStringEncoder().encode("Steven"));
319         assertEquals("M240", this.getStringEncoder().encode("Michael"));
320         assertEquals("R163", this.getStringEncoder().encode("Robert"));
321         assertEquals("L600", this.getStringEncoder().encode("Laura"));
322         assertEquals("A500", this.getStringEncoder().encode("Anne"));
323     }
324 
325     /**
326      * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
327      */
328     @Test
329     public void testNewInstance() {
330         assertEquals("W452", new Soundex().soundex("Williams"));
331     }
332 
333     @Test
334     public void testNewInstance2() {
335         assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
336     }
337 
338     @Test
339     public void testNewInstance3() {
340         assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
341     }
342 
343     @Test
344 // examples and algorithm rules from:  http://west-penwith.org.uk/misc/soundex.htm
345     public void testSimplifiedSoundex() { // treat vowels and HW as separators
346         final Soundex s = Soundex.US_ENGLISH_SIMPLIFIED;
347         assertEquals("W452", s.encode("WILLIAMS"));
348         assertEquals("B625", s.encode("BARAGWANATH"));
349         assertEquals("D540", s.encode("DONNELL"));
350         assertEquals("L300", s.encode("LLOYD"));
351         assertEquals("W422", s.encode("WOOLCOCK"));
352         // Additional local examples
353         assertEquals("D320", s.encode("Dodds"));
354         assertEquals("D320", s.encode("Dwdds")); // w is a separator
355         assertEquals("D320", s.encode("Dhdds")); // h is a separator
356     }
357 
358     @Test
359     public void testSoundexUtilsConstructable() {
360         new SoundexUtils();
361     }
362 
363     @Test
364     public void testSoundexUtilsNullBehaviour() {
365         assertNull(SoundexUtils.clean(null));
366         assertEquals("", SoundexUtils.clean(""));
367         assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
368         assertEquals(0, SoundexUtils.differenceEncoded("", null));
369     }
370 
371     /**
372      * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
373      */
374     @Test
375     public void testUsEnglishStatic() {
376         assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
377     }
378 
379     /**
380      * Fancy characters are not mapped by the default US mapping.
381      *
382      * https://issues.apache.org/jira/browse/CODEC-30
383      */
384     @Test
385     public void testUsMappingEWithAcute() {
386         assertEquals("E000", this.getStringEncoder().encode("e"));
387         if (Character.isLetter('\u00e9')) { // e-acute
388             //         uppercase E-acute
389             assertThrows(IllegalArgumentException.class, () -> getStringEncoder().encode("\u00e9"));
390         } else {
391             assertEquals("", this.getStringEncoder().encode("\u00e9"));
392         }
393     }
394 
395     /**
396      * Fancy characters are not mapped by the default US mapping.
397      *
398      * https://issues.apache.org/jira/browse/CODEC-30
399      */
400     @Test
401     public void testUsMappingOWithDiaeresis() {
402         assertEquals("O000", this.getStringEncoder().encode("o"));
403         if (Character.isLetter('\u00f6')) { // o-umlaut
404             //         uppercase O-umlaut
405             assertThrows(IllegalArgumentException.class, () -> getStringEncoder().encode("\u00f6"));
406         } else {
407             assertEquals("", this.getStringEncoder().encode("\u00f6"));
408         }
409     }
410 
411     /**
412      * Tests example from https://en.wikipedia.org/wiki/Soundex#American_Soundex as of 2015-03-22.
413      */
414     @Test
415     public void testWikipediaAmericanSoundex() {
416         assertEquals("R163", this.getStringEncoder().encode("Robert"));
417         assertEquals("R163", this.getStringEncoder().encode("Rupert"));
418         assertEquals("A261", this.getStringEncoder().encode("Ashcraft"));
419         assertEquals("A261", this.getStringEncoder().encode("Ashcroft"));
420         assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
421         assertEquals("P236", this.getStringEncoder().encode("Pfister"));
422     }
423 }