001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    // (FYI: Formatted and sorted with Eclipse)
019    
020    package org.apache.commons.codec.language;
021    
022    import junit.framework.Assert;
023    
024    import org.apache.commons.codec.EncoderException;
025    import org.apache.commons.codec.StringEncoder;
026    import org.apache.commons.codec.StringEncoderAbstractTest;
027    import org.junit.Test;
028    
029    /**
030     * Tests {@link Soundex}.
031     *
032     * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
033     *
034     * @version $Id: SoundexTest.html 889935 2013-12-11 05:05:13Z ggregory $
035     */
036    public class SoundexTest extends StringEncoderAbstractTest {
037    
038        @Override
039        protected StringEncoder createStringEncoder() {
040            return new Soundex();
041        }
042    
043        /**
044         * @return Returns the encoder.
045         */
046        public Soundex getSoundexEncoder() {
047            return (Soundex)this.getStringEncoder();
048        }
049    
050        @Test
051        public void testB650() throws EncoderException {
052            this.checkEncodingVariations("B650", new String[]{
053                "BARHAM",
054                "BARONE",
055                "BARRON",
056                "BERNA",
057                "BIRNEY",
058                "BIRNIE",
059                "BOOROM",
060                "BOREN",
061                "BORN",
062                "BOURN",
063                "BOURNE",
064                "BOWRON",
065                "BRAIN",
066                "BRAME",
067                "BRANN",
068                "BRAUN",
069                "BREEN",
070                "BRIEN",
071                "BRIM",
072                "BRIMM",
073                "BRINN",
074                "BRION",
075                "BROOM",
076                "BROOME",
077                "BROWN",
078                "BROWNE",
079                "BRUEN",
080                "BRUHN",
081                "BRUIN",
082                "BRUMM",
083                "BRUN",
084                "BRUNO",
085                "BRYAN",
086                "BURIAN",
087                "BURN",
088                "BURNEY",
089                "BYRAM",
090                "BYRNE",
091                "BYRON",
092                "BYRUM"});
093        }
094    
095        @Test
096        public void testBadCharacters() {
097            Assert.assertEquals("H452", this.getSoundexEncoder().encode("HOL>MES"));
098    
099        }
100    
101        @Test
102        public void testDifference() throws EncoderException {
103            // Edge cases
104            Assert.assertEquals(0, this.getSoundexEncoder().difference(null, null));
105            Assert.assertEquals(0, this.getSoundexEncoder().difference("", ""));
106            Assert.assertEquals(0, this.getSoundexEncoder().difference(" ", " "));
107            // Normal cases
108            Assert.assertEquals(4, this.getSoundexEncoder().difference("Smith", "Smythe"));
109            Assert.assertEquals(2, this.getSoundexEncoder().difference("Ann", "Andrew"));
110            Assert.assertEquals(1, this.getSoundexEncoder().difference("Margaret", "Andrew"));
111            Assert.assertEquals(0, this.getSoundexEncoder().difference("Janet", "Margaret"));
112            // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
113            Assert.assertEquals(4, this.getSoundexEncoder().difference("Green", "Greene"));
114            Assert.assertEquals(0, this.getSoundexEncoder().difference("Blotchet-Halls", "Greene"));
115            // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
116            Assert.assertEquals(4, this.getSoundexEncoder().difference("Smith", "Smythe"));
117            Assert.assertEquals(4, this.getSoundexEncoder().difference("Smithers", "Smythers"));
118            Assert.assertEquals(2, this.getSoundexEncoder().difference("Anothers", "Brothers"));
119        }
120    
121        @Test
122        public void testEncodeBasic() {
123            Assert.assertEquals("T235", this.getSoundexEncoder().encode("testing"));
124            Assert.assertEquals("T000", this.getSoundexEncoder().encode("The"));
125            Assert.assertEquals("Q200", this.getSoundexEncoder().encode("quick"));
126            Assert.assertEquals("B650", this.getSoundexEncoder().encode("brown"));
127            Assert.assertEquals("F200", this.getSoundexEncoder().encode("fox"));
128            Assert.assertEquals("J513", this.getSoundexEncoder().encode("jumped"));
129            Assert.assertEquals("O160", this.getSoundexEncoder().encode("over"));
130            Assert.assertEquals("T000", this.getSoundexEncoder().encode("the"));
131            Assert.assertEquals("L200", this.getSoundexEncoder().encode("lazy"));
132            Assert.assertEquals("D200", this.getSoundexEncoder().encode("dogs"));
133        }
134    
135        /**
136         * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
137         */
138        @Test
139        public void testEncodeBatch2() {
140            Assert.assertEquals("A462", this.getSoundexEncoder().encode("Allricht"));
141            Assert.assertEquals("E166", this.getSoundexEncoder().encode("Eberhard"));
142            Assert.assertEquals("E521", this.getSoundexEncoder().encode("Engebrethson"));
143            Assert.assertEquals("H512", this.getSoundexEncoder().encode("Heimbach"));
144            Assert.assertEquals("H524", this.getSoundexEncoder().encode("Hanselmann"));
145            Assert.assertEquals("H431", this.getSoundexEncoder().encode("Hildebrand"));
146            Assert.assertEquals("K152", this.getSoundexEncoder().encode("Kavanagh"));
147            Assert.assertEquals("L530", this.getSoundexEncoder().encode("Lind"));
148            Assert.assertEquals("L222", this.getSoundexEncoder().encode("Lukaschowsky"));
149            Assert.assertEquals("M235", this.getSoundexEncoder().encode("McDonnell"));
150            Assert.assertEquals("M200", this.getSoundexEncoder().encode("McGee"));
151            Assert.assertEquals("O155", this.getSoundexEncoder().encode("Opnian"));
152            Assert.assertEquals("O155", this.getSoundexEncoder().encode("Oppenheimer"));
153            Assert.assertEquals("R355", this.getSoundexEncoder().encode("Riedemanas"));
154            Assert.assertEquals("Z300", this.getSoundexEncoder().encode("Zita"));
155            Assert.assertEquals("Z325", this.getSoundexEncoder().encode("Zitzmeinn"));
156        }
157    
158        /**
159         * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
160         */
161        @Test
162        public void testEncodeBatch3() {
163            Assert.assertEquals("W252", this.getSoundexEncoder().encode("Washington"));
164            Assert.assertEquals("L000", this.getSoundexEncoder().encode("Lee"));
165            Assert.assertEquals("G362", this.getSoundexEncoder().encode("Gutierrez"));
166            Assert.assertEquals("P236", this.getSoundexEncoder().encode("Pfister"));
167            Assert.assertEquals("J250", this.getSoundexEncoder().encode("Jackson"));
168            Assert.assertEquals("T522", this.getSoundexEncoder().encode("Tymczak"));
169            // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
170            // possible.
171            Assert.assertEquals("V532", this.getSoundexEncoder().encode("VanDeusen"));
172        }
173    
174        /**
175         * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
176         */
177        @Test
178        public void testEncodeBatch4() {
179            Assert.assertEquals("H452", this.getSoundexEncoder().encode("HOLMES"));
180            Assert.assertEquals("A355", this.getSoundexEncoder().encode("ADOMOMI"));
181            Assert.assertEquals("V536", this.getSoundexEncoder().encode("VONDERLEHR"));
182            Assert.assertEquals("B400", this.getSoundexEncoder().encode("BALL"));
183            Assert.assertEquals("S000", this.getSoundexEncoder().encode("SHAW"));
184            Assert.assertEquals("J250", this.getSoundexEncoder().encode("JACKSON"));
185            Assert.assertEquals("S545", this.getSoundexEncoder().encode("SCANLON"));
186            Assert.assertEquals("S532", this.getSoundexEncoder().encode("SAINTJOHN"));
187    
188        }
189    
190        @Test
191        public void testEncodeIgnoreApostrophes() throws EncoderException {
192            this.checkEncodingVariations("O165", new String[]{
193                "OBrien",
194                "'OBrien",
195                "O'Brien",
196                "OB'rien",
197                "OBr'ien",
198                "OBri'en",
199                "OBrie'n",
200                "OBrien'"});
201        }
202    
203        /**
204         * Test data from http://www.myatt.demon.co.uk/sxalg.htm
205         *
206         * @throws EncoderException
207         */
208        @Test
209        public void testEncodeIgnoreHyphens() throws EncoderException {
210            this.checkEncodingVariations("K525", new String[]{
211                "KINGSMITH",
212                "-KINGSMITH",
213                "K-INGSMITH",
214                "KI-NGSMITH",
215                "KIN-GSMITH",
216                "KING-SMITH",
217                "KINGS-MITH",
218                "KINGSM-ITH",
219                "KINGSMI-TH",
220                "KINGSMIT-H",
221                "KINGSMITH-"});
222        }
223    
224        @Test
225        public void testEncodeIgnoreTrimmable() {
226            Assert.assertEquals("W252", this.getSoundexEncoder().encode(" \t\n\r Washington \t\n\r "));
227        }
228    
229        /**
230         * Consonants from the same code group separated by W or H are treated as one.
231         */
232        @Test
233        public void testHWRuleEx1() {
234            // From
235            // http://www.archives.gov/research_room/genealogy/census/soundex.html:
236            // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
237            // for the F). It is not coded A-226.
238            Assert.assertEquals("A261", this.getSoundexEncoder().encode("Ashcraft"));
239        }
240    
241        /**
242         * Consonants from the same code group separated by W or H are treated as one.
243         *
244         * Test data from http://www.myatt.demon.co.uk/sxalg.htm
245         */
246        @Test
247        public void testHWRuleEx2() {
248            Assert.assertEquals("B312", this.getSoundexEncoder().encode("BOOTHDAVIS"));
249            Assert.assertEquals("B312", this.getSoundexEncoder().encode("BOOTH-DAVIS"));
250        }
251    
252        /**
253         * Consonants from the same code group separated by W or H are treated as one.
254         *
255         * @throws EncoderException
256         */
257        @Test
258        public void testHWRuleEx3() throws EncoderException {
259            Assert.assertEquals("S460", this.getSoundexEncoder().encode("Sgler"));
260            Assert.assertEquals("S460", this.getSoundexEncoder().encode("Swhgler"));
261            // Also S460:
262            this.checkEncodingVariations("S460", new String[]{
263                "SAILOR",
264                "SALYER",
265                "SAYLOR",
266                "SCHALLER",
267                "SCHELLER",
268                "SCHILLER",
269                "SCHOOLER",
270                "SCHULER",
271                "SCHUYLER",
272                "SEILER",
273                "SEYLER",
274                "SHOLAR",
275                "SHULER",
276                "SILAR",
277                "SILER",
278                "SILLER"});
279        }
280    
281        /**
282         * Examples for MS SQLServer from
283         * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
284         */
285        @Test
286        public void testMsSqlServer1() {
287            Assert.assertEquals("S530", this.getSoundexEncoder().encode("Smith"));
288            Assert.assertEquals("S530", this.getSoundexEncoder().encode("Smythe"));
289        }
290    
291        /**
292         * Examples for MS SQLServer from
293         * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support
294         * /kb/articles/Q100/3/65.asp&NoWebContent=1
295         *
296         * @throws EncoderException
297         */
298        @Test
299        public void testMsSqlServer2() throws EncoderException {
300            this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"});
301        }
302    
303        /**
304         * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm
305         */
306        @Test
307        public void testMsSqlServer3() {
308            Assert.assertEquals("A500", this.getSoundexEncoder().encode("Ann"));
309            Assert.assertEquals("A536", this.getSoundexEncoder().encode("Andrew"));
310            Assert.assertEquals("J530", this.getSoundexEncoder().encode("Janet"));
311            Assert.assertEquals("M626", this.getSoundexEncoder().encode("Margaret"));
312            Assert.assertEquals("S315", this.getSoundexEncoder().encode("Steven"));
313            Assert.assertEquals("M240", this.getSoundexEncoder().encode("Michael"));
314            Assert.assertEquals("R163", this.getSoundexEncoder().encode("Robert"));
315            Assert.assertEquals("L600", this.getSoundexEncoder().encode("Laura"));
316            Assert.assertEquals("A500", this.getSoundexEncoder().encode("Anne"));
317        }
318    
319        /**
320         * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
321         */
322        @Test
323        public void testNewInstance() {
324            Assert.assertEquals("W452", new Soundex().soundex("Williams"));
325        }
326    
327        @Test
328        public void testNewInstance2() {
329            Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
330        }
331    
332        @Test
333        public void testNewInstance3() {
334            Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
335        }
336    
337        @Test
338        public void testSoundexUtilsConstructable() {
339            new SoundexUtils();
340        }
341    
342        @Test
343        public void testSoundexUtilsNullBehaviour() {
344            Assert.assertEquals(null, SoundexUtils.clean(null));
345            Assert.assertEquals("", SoundexUtils.clean(""));
346            Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
347            Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null));
348        }
349    
350        /**
351         * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
352         */
353        @Test
354        public void testUsEnglishStatic() {
355            Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
356        }
357    
358        /**
359         * Fancy characters are not mapped by the default US mapping.
360         *
361         * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
362         */
363        @Test
364        public void testUsMappingEWithAcute() {
365            Assert.assertEquals("E000", this.getSoundexEncoder().encode("e"));
366            if (Character.isLetter('\u00e9')) { // e-acute
367                try {
368                    //         uppercase E-acute
369                    Assert.assertEquals("\u00c9000", this.getSoundexEncoder().encode("\u00e9"));
370                    Assert.fail("Expected IllegalArgumentException not thrown");
371                } catch (IllegalArgumentException e) {
372                    // expected
373                }
374            } else {
375                Assert.assertEquals("", this.getSoundexEncoder().encode("\u00e9"));
376            }
377        }
378    
379        /**
380         * Fancy characters are not mapped by the default US mapping.
381         *
382         * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
383         */
384        @Test
385        public void testUsMappingOWithDiaeresis() {
386            Assert.assertEquals("O000", this.getSoundexEncoder().encode("o"));
387            if (Character.isLetter('\u00f6')) { // o-umlaut
388                try {
389                    //         uppercase O-umlaut
390                    Assert.assertEquals("\u00d6000", this.getSoundexEncoder().encode("\u00f6"));
391                    Assert.fail("Expected IllegalArgumentException not thrown");
392                } catch (IllegalArgumentException e) {
393                    // expected
394                }
395            } else {
396                Assert.assertEquals("", this.getSoundexEncoder().encode("\u00f6"));
397            }
398        }
399    }