/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// (FYI: Formatted and sorted with Eclipse)
19  
20  package org.apache.commons.codec.language;
21  
22  import static org.junit.jupiter.api.Assertions.assertEquals;
23  import static org.junit.jupiter.api.Assertions.assertNull;
24  import static org.junit.jupiter.api.Assertions.assertThrows;
25  
26  import org.apache.commons.codec.AbstractStringEncoderTest;
27  import org.apache.commons.codec.EncoderException;
28  import org.junit.jupiter.api.Test;
29  
30  /**
31   * Tests {@link Soundex}.
32   *
33   * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
34   */
35  class SoundexTest extends AbstractStringEncoderTest<Soundex> {
36  
37      @Override
38      protected Soundex createStringEncoder() {
39          return new Soundex();
40      }
41  
42      @Test
43      void testB650() throws EncoderException {
44          // @formatter:off
45          checkEncodingVariations("B650",
46              "BARHAM",
47              "BARONE",
48              "BARRON",
49              "BERNA",
50              "BIRNEY",
51              "BIRNIE",
52              "BOOROM",
53              "BOREN",
54              "BORN",
55              "BOURN",
56              "BOURNE",
57              "BOWRON",
58              "BRAIN",
59              "BRAME",
60              "BRANN",
61              "BRAUN",
62              "BREEN",
63              "BRIEN",
64              "BRIM",
65              "BRIMM",
66              "BRINN",
67              "BRION",
68              "BROOM",
69              "BROOME",
70              "BROWN",
71              "BROWNE",
72              "BRUEN",
73              "BRUHN",
74              "BRUIN",
75              "BRUMM",
76              "BRUN",
77              "BRUNO",
78              "BRYAN",
79              "BURIAN",
80              "BURN",
81              "BURNEY",
82              "BYRAM",
83              "BYRNE",
84              "BYRON",
85              "BYRUM");
86          // @formatter:on
87      }
88  
89      @Test
90      void testBadCharacters() {
91          assertEquals("H452", getStringEncoder().encode("HOL>MES"));
92      }
93  
94      @Test
95      void testDifference() throws EncoderException {
96          // Edge cases
97          assertEquals(0, getStringEncoder().difference(null, null));
98          assertEquals(0, getStringEncoder().difference("", ""));
99          assertEquals(0, getStringEncoder().difference(" ", " "));
100         // Normal cases
101         assertEquals(4, getStringEncoder().difference("Smith", "Smythe"));
102         assertEquals(2, getStringEncoder().difference("Ann", "Andrew"));
103         assertEquals(1, getStringEncoder().difference("Margaret", "Andrew"));
104         assertEquals(0, getStringEncoder().difference("Janet", "Margaret"));
105         // Examples from https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
106         assertEquals(4, getStringEncoder().difference("Green", "Greene"));
107         assertEquals(0, getStringEncoder().difference("Blotchet-Halls", "Greene"));
108         // Examples from https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
109         assertEquals(4, getStringEncoder().difference("Smith", "Smythe"));
110         assertEquals(4, getStringEncoder().difference("Smithers", "Smythers"));
111         assertEquals(2, getStringEncoder().difference("Anothers", "Brothers"));
112     }
113 
114     @Test
115     void testEncodeBasic() {
116         assertEquals("T235", getStringEncoder().encode("testing"));
117         assertEquals("T000", getStringEncoder().encode("The"));
118         assertEquals("Q200", getStringEncoder().encode("quick"));
119         assertEquals("B650", getStringEncoder().encode("brown"));
120         assertEquals("F200", getStringEncoder().encode("fox"));
121         assertEquals("J513", getStringEncoder().encode("jumped"));
122         assertEquals("O160", getStringEncoder().encode("over"));
123         assertEquals("T000", getStringEncoder().encode("the"));
124         assertEquals("L200", getStringEncoder().encode("lazy"));
125         assertEquals("D200", getStringEncoder().encode("dogs"));
126     }
127 
128     /**
129      * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
130      */
131     @Test
132     void testEncodeBatch2() {
133         assertEquals("A462", getStringEncoder().encode("Allricht"));
134         assertEquals("E166", getStringEncoder().encode("Eberhard"));
135         assertEquals("E521", getStringEncoder().encode("Engebrethson"));
136         assertEquals("H512", getStringEncoder().encode("Heimbach"));
137         assertEquals("H524", getStringEncoder().encode("Hanselmann"));
138         assertEquals("H431", getStringEncoder().encode("Hildebrand"));
139         assertEquals("K152", getStringEncoder().encode("Kavanagh"));
140         assertEquals("L530", getStringEncoder().encode("Lind"));
141         assertEquals("L222", getStringEncoder().encode("Lukaschowsky"));
142         assertEquals("M235", getStringEncoder().encode("McDonnell"));
143         assertEquals("M200", getStringEncoder().encode("McGee"));
144         assertEquals("O155", getStringEncoder().encode("Opnian"));
145         assertEquals("O155", getStringEncoder().encode("Oppenheimer"));
146         assertEquals("R355", getStringEncoder().encode("Riedemanas"));
147         assertEquals("Z300", getStringEncoder().encode("Zita"));
148         assertEquals("Z325", getStringEncoder().encode("Zitzmeinn"));
149     }
150 
151     /**
152      * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
153      */
154     @Test
155     void testEncodeBatch3() {
156         assertEquals("W252", getStringEncoder().encode("Washington"));
157         assertEquals("L000", getStringEncoder().encode("Lee"));
158         assertEquals("G362", getStringEncoder().encode("Gutierrez"));
159         assertEquals("P236", getStringEncoder().encode("Pfister"));
160         assertEquals("J250", getStringEncoder().encode("Jackson"));
161         assertEquals("T522", getStringEncoder().encode("Tymczak"));
162         // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
163         // possible.
164         assertEquals("V532", getStringEncoder().encode("VanDeusen"));
165     }
166 
167     /**
168      * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
169      */
170     @Test
171     void testEncodeBatch4() {
172         assertEquals("H452", getStringEncoder().encode("HOLMES"));
173         assertEquals("A355", getStringEncoder().encode("ADOMOMI"));
174         assertEquals("V536", getStringEncoder().encode("VONDERLEHR"));
175         assertEquals("B400", getStringEncoder().encode("BALL"));
176         assertEquals("S000", getStringEncoder().encode("SHAW"));
177         assertEquals("J250", getStringEncoder().encode("JACKSON"));
178         assertEquals("S545", getStringEncoder().encode("SCANLON"));
179         assertEquals("S532", getStringEncoder().encode("SAINTJOHN"));
180 
181     }
182 
183     @Test
184     void testEncodeIgnoreApostrophes() throws EncoderException {
185         // @formatter:off
186         checkEncodingVariations("O165",
187             "OBrien",
188             "'OBrien",
189             "O'Brien",
190             "OB'rien",
191             "OBr'ien",
192             "OBri'en",
193             "OBrie'n",
194             "OBrien'");
195         // @formatter:on
196     }
197 
198     /**
199      * Test data from http://www.myatt.demon.co.uk/sxalg.htm
200      *
201      * @throws EncoderException for some failure scenarios     */
202     @Test
203     void testEncodeIgnoreHyphens() throws EncoderException {
204         // @formatter:off
205         checkEncodingVariations("K525",
206             "KINGSMITH",
207             "-KINGSMITH",
208             "K-INGSMITH",
209             "KI-NGSMITH",
210             "KIN-GSMITH",
211             "KING-SMITH",
212             "KINGS-MITH",
213             "KINGSM-ITH",
214             "KINGSMI-TH",
215             "KINGSMIT-H",
216             "KINGSMITH-");
217         // @formatter:on
218     }
219 
220     @Test
221     void testEncodeIgnoreTrimmable() {
222         assertEquals("W252", getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
223     }
224 
225     @Test
226 // examples and algorithm rules from:  http://www.genealogy.com/articles/research/00000060.html
227     void testGenealogy() { // treat vowels and HW as silent
228         final Soundex s = Soundex.US_ENGLISH_GENEALOGY;
229         assertEquals("H251", s.encode("Heggenburger"));
230         assertEquals("B425", s.encode("Blackman"));
231         assertEquals("S530", s.encode("Schmidt"));
232         assertEquals("L150", s.encode("Lippmann"));
233         // Additional local example
234         assertEquals("D200", s.encode("Dodds")); // 'o' is not a separator here - it is silent
235         assertEquals("D200", s.encode("Dhdds")); // 'h' is silent
236         assertEquals("D200", s.encode("Dwdds")); // 'w' is silent
237     }
238 
239     /**
240      * Consonants from the same code group separated by W or H are treated as one.
241      */
242     @Test
243     void testHWRuleEx1() {
244         // From
245         // http://www.archives.gov/research_room/genealogy/census/soundex.html:
246         // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
247         // for the F). It is not coded A-226.
248         assertEquals("A261", getStringEncoder().encode("Ashcraft"));
249         assertEquals("A261", getStringEncoder().encode("Ashcroft"));
250         assertEquals("Y330", getStringEncoder().encode("yehudit"));
251         assertEquals("Y330", getStringEncoder().encode("yhwdyt"));
252     }
253 
254     /**
255      * Consonants from the same code group separated by W or H are treated as one.
256      *
257      * Test data from http://www.myatt.demon.co.uk/sxalg.htm
258      */
259     @Test
260     void testHWRuleEx2() {
261         assertEquals("B312", getStringEncoder().encode("BOOTHDAVIS"));
262         assertEquals("B312", getStringEncoder().encode("BOOTH-DAVIS"));
263     }
264 
265     /**
266      * Consonants from the same code group separated by W or H are treated as one.
267      *
268      * @throws EncoderException for some failure scenarios     */
269     @Test
270     void testHWRuleEx3() throws EncoderException {
271         assertEquals("S460", getStringEncoder().encode("Sgler"));
272         assertEquals("S460", getStringEncoder().encode("Swhgler"));
273         // Also S460:
274         // @formatter:off
275         checkEncodingVariations("S460",
276             "SAILOR",
277             "SALYER",
278             "SAYLOR",
279             "SCHALLER",
280             "SCHELLER",
281             "SCHILLER",
282             "SCHOOLER",
283             "SCHULER",
284             "SCHUYLER",
285             "SEILER",
286             "SEYLER",
287             "SHOLAR",
288             "SHULER",
289             "SILAR",
290             "SILER",
291             "SILLER");
292         // @formatter:on
293     }
294 
295     /**
296      * Examples for MS SQLServer from
297      * https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
298      */
299     @Test
300     void testMsSqlServer1() {
301         assertEquals("S530", getStringEncoder().encode("Smith"));
302         assertEquals("S530", getStringEncoder().encode("Smythe"));
303     }
304 
305     /**
306      * Examples for MS SQLServer from
307      * https://support.microsoft.com/default.aspx?scid=https://support.microsoft.com:80/support
308      * /kb/articles/Q100/3/65.asp&NoWebContent=1
309      *
310      * @throws EncoderException for some failure scenarios     */
311     @Test
312     void testMsSqlServer2() throws EncoderException {
313         checkEncodingVariations("E625", "Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen");
314     }
315 
316     /**
317      * Examples for MS SQLServer from https://databases.about.com/library/weekly/aa042901a.htm
318      */
319     @Test
320     void testMsSqlServer3() {
321         assertEquals("A500", getStringEncoder().encode("Ann"));
322         assertEquals("A536", getStringEncoder().encode("Andrew"));
323         assertEquals("J530", getStringEncoder().encode("Janet"));
324         assertEquals("M626", getStringEncoder().encode("Margaret"));
325         assertEquals("S315", getStringEncoder().encode("Steven"));
326         assertEquals("M240", getStringEncoder().encode("Michael"));
327         assertEquals("R163", getStringEncoder().encode("Robert"));
328         assertEquals("L600", getStringEncoder().encode("Laura"));
329         assertEquals("A500", getStringEncoder().encode("Anne"));
330     }
331 
332     /**
333      * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
334      */
335     @Test
336     void testNewInstance() {
337         assertEquals("W452", new Soundex().soundex("Williams"));
338     }
339 
340     @Test
341     void testNewInstance2() {
342         assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
343     }
344 
345     @Test
346     void testNewInstance3() {
347         assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
348     }
349 
350     @Test
351 // examples and algorithm rules from:  http://west-penwith.org.uk/misc/soundex.htm
352     void testSimplifiedSoundex() { // treat vowels and HW as separators
353         final Soundex s = Soundex.US_ENGLISH_SIMPLIFIED;
354         assertEquals("W452", s.encode("WILLIAMS"));
355         assertEquals("B625", s.encode("BARAGWANATH"));
356         assertEquals("D540", s.encode("DONNELL"));
357         assertEquals("L300", s.encode("LLOYD"));
358         assertEquals("W422", s.encode("WOOLCOCK"));
359         // Additional local examples
360         assertEquals("D320", s.encode("Dodds"));
361         assertEquals("D320", s.encode("Dwdds")); // w is a separator
362         assertEquals("D320", s.encode("Dhdds")); // h is a separator
363     }
364 
365     @Test
366     void testSoundexUtilsConstructable() {
367         new SoundexUtils();
368     }
369 
370     @Test
371     void testSoundexUtilsNullBehaviour() {
372         assertNull(SoundexUtils.clean(null));
373         assertEquals("", SoundexUtils.clean(""));
374         assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
375         assertEquals(0, SoundexUtils.differenceEncoded("", null));
376     }
377 
378     /**
379      * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
380      */
381     @Test
382     void testUsEnglishStatic() {
383         assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
384     }
385 
386     /**
387      * Fancy characters are not mapped by the default US mapping.
388      *
389      * https://issues.apache.org/jira/browse/CODEC-30
390      */
391     @Test
392     void testUsMappingEWithAcute() {
393         assertEquals("E000", getStringEncoder().encode("e"));
394         if (Character.isLetter('\u00e9')) { // e-acute
395             //         uppercase E-acute
396             assertThrows(IllegalArgumentException.class, () -> getStringEncoder().encode("\u00e9"));
397         } else {
398             assertEquals("", getStringEncoder().encode("\u00e9"));
399         }
400     }
401 
402     /**
403      * Fancy characters are not mapped by the default US mapping.
404      *
405      * https://issues.apache.org/jira/browse/CODEC-30
406      */
407     @Test
408     void testUsMappingOWithDiaeresis() {
409         assertEquals("O000", getStringEncoder().encode("o"));
410         if (Character.isLetter('\u00f6')) { // o-umlaut
411             //         uppercase O-umlaut
412             assertThrows(IllegalArgumentException.class, () -> getStringEncoder().encode("\u00f6"));
413         } else {
414             assertEquals("", getStringEncoder().encode("\u00f6"));
415         }
416     }
417 
418     /**
419      * Tests example from https://en.wikipedia.org/wiki/Soundex#American_Soundex as of 2015-03-22.
420      */
421     @Test
422     void testWikipediaAmericanSoundex() {
423         assertEquals("R163", getStringEncoder().encode("Robert"));
424         assertEquals("R163", getStringEncoder().encode("Rupert"));
425         assertEquals("A261", getStringEncoder().encode("Ashcraft"));
426         assertEquals("A261", getStringEncoder().encode("Ashcroft"));
427         assertEquals("T522", getStringEncoder().encode("Tymczak"));
428         assertEquals("P236", getStringEncoder().encode("Pfister"));
429     }
430 }