001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    // (FYI: Formatted and sorted with Eclipse)
019    
020    package org.apache.commons.codec.language;
021    
022    import org.junit.Assert;
023    
024    import org.apache.commons.codec.EncoderException;
025    import org.apache.commons.codec.StringEncoderAbstractTest;
026    import org.junit.Test;
027    
028    /**
029     * Tests {@link Soundex}.
030     *
031     * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
032     *
033     * @version $Id: SoundexTest.html 889935 2013-12-11 05:05:13Z ggregory $
034     */
035    public class SoundexTest extends StringEncoderAbstractTest<Soundex> {
036    
037        @Override
038        protected Soundex createStringEncoder() {
039            return new Soundex();
040        }
041    
042        @Test
043        public void testB650() throws EncoderException {
044            this.checkEncodingVariations("B650", new String[]{
045                "BARHAM",
046                "BARONE",
047                "BARRON",
048                "BERNA",
049                "BIRNEY",
050                "BIRNIE",
051                "BOOROM",
052                "BOREN",
053                "BORN",
054                "BOURN",
055                "BOURNE",
056                "BOWRON",
057                "BRAIN",
058                "BRAME",
059                "BRANN",
060                "BRAUN",
061                "BREEN",
062                "BRIEN",
063                "BRIM",
064                "BRIMM",
065                "BRINN",
066                "BRION",
067                "BROOM",
068                "BROOME",
069                "BROWN",
070                "BROWNE",
071                "BRUEN",
072                "BRUHN",
073                "BRUIN",
074                "BRUMM",
075                "BRUN",
076                "BRUNO",
077                "BRYAN",
078                "BURIAN",
079                "BURN",
080                "BURNEY",
081                "BYRAM",
082                "BYRNE",
083                "BYRON",
084                "BYRUM"});
085        }
086    
087        @Test
088        public void testBadCharacters() {
089            Assert.assertEquals("H452", this.getStringEncoder().encode("HOL>MES"));
090    
091        }
092    
093        @Test
094        public void testDifference() throws EncoderException {
095            // Edge cases
096            Assert.assertEquals(0, this.getStringEncoder().difference(null, null));
097            Assert.assertEquals(0, this.getStringEncoder().difference("", ""));
098            Assert.assertEquals(0, this.getStringEncoder().difference(" ", " "));
099            // Normal cases
100            Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
101            Assert.assertEquals(2, this.getStringEncoder().difference("Ann", "Andrew"));
102            Assert.assertEquals(1, this.getStringEncoder().difference("Margaret", "Andrew"));
103            Assert.assertEquals(0, this.getStringEncoder().difference("Janet", "Margaret"));
104            // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
105            Assert.assertEquals(4, this.getStringEncoder().difference("Green", "Greene"));
106            Assert.assertEquals(0, this.getStringEncoder().difference("Blotchet-Halls", "Greene"));
107            // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
108            Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
109            Assert.assertEquals(4, this.getStringEncoder().difference("Smithers", "Smythers"));
110            Assert.assertEquals(2, this.getStringEncoder().difference("Anothers", "Brothers"));
111        }
112    
113        @Test
114        public void testEncodeBasic() {
115            Assert.assertEquals("T235", this.getStringEncoder().encode("testing"));
116            Assert.assertEquals("T000", this.getStringEncoder().encode("The"));
117            Assert.assertEquals("Q200", this.getStringEncoder().encode("quick"));
118            Assert.assertEquals("B650", this.getStringEncoder().encode("brown"));
119            Assert.assertEquals("F200", this.getStringEncoder().encode("fox"));
120            Assert.assertEquals("J513", this.getStringEncoder().encode("jumped"));
121            Assert.assertEquals("O160", this.getStringEncoder().encode("over"));
122            Assert.assertEquals("T000", this.getStringEncoder().encode("the"));
123            Assert.assertEquals("L200", this.getStringEncoder().encode("lazy"));
124            Assert.assertEquals("D200", this.getStringEncoder().encode("dogs"));
125        }
126    
127        /**
128         * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
129         */
130        @Test
131        public void testEncodeBatch2() {
132            Assert.assertEquals("A462", this.getStringEncoder().encode("Allricht"));
133            Assert.assertEquals("E166", this.getStringEncoder().encode("Eberhard"));
134            Assert.assertEquals("E521", this.getStringEncoder().encode("Engebrethson"));
135            Assert.assertEquals("H512", this.getStringEncoder().encode("Heimbach"));
136            Assert.assertEquals("H524", this.getStringEncoder().encode("Hanselmann"));
137            Assert.assertEquals("H431", this.getStringEncoder().encode("Hildebrand"));
138            Assert.assertEquals("K152", this.getStringEncoder().encode("Kavanagh"));
139            Assert.assertEquals("L530", this.getStringEncoder().encode("Lind"));
140            Assert.assertEquals("L222", this.getStringEncoder().encode("Lukaschowsky"));
141            Assert.assertEquals("M235", this.getStringEncoder().encode("McDonnell"));
142            Assert.assertEquals("M200", this.getStringEncoder().encode("McGee"));
143            Assert.assertEquals("O155", this.getStringEncoder().encode("Opnian"));
144            Assert.assertEquals("O155", this.getStringEncoder().encode("Oppenheimer"));
145            Assert.assertEquals("R355", this.getStringEncoder().encode("Riedemanas"));
146            Assert.assertEquals("Z300", this.getStringEncoder().encode("Zita"));
147            Assert.assertEquals("Z325", this.getStringEncoder().encode("Zitzmeinn"));
148        }
149    
150        /**
151         * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
152         */
153        @Test
154        public void testEncodeBatch3() {
155            Assert.assertEquals("W252", this.getStringEncoder().encode("Washington"));
156            Assert.assertEquals("L000", this.getStringEncoder().encode("Lee"));
157            Assert.assertEquals("G362", this.getStringEncoder().encode("Gutierrez"));
158            Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
159            Assert.assertEquals("J250", this.getStringEncoder().encode("Jackson"));
160            Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
161            // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
162            // possible.
163            Assert.assertEquals("V532", this.getStringEncoder().encode("VanDeusen"));
164        }
165    
166        /**
167         * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
168         */
169        @Test
170        public void testEncodeBatch4() {
171            Assert.assertEquals("H452", this.getStringEncoder().encode("HOLMES"));
172            Assert.assertEquals("A355", this.getStringEncoder().encode("ADOMOMI"));
173            Assert.assertEquals("V536", this.getStringEncoder().encode("VONDERLEHR"));
174            Assert.assertEquals("B400", this.getStringEncoder().encode("BALL"));
175            Assert.assertEquals("S000", this.getStringEncoder().encode("SHAW"));
176            Assert.assertEquals("J250", this.getStringEncoder().encode("JACKSON"));
177            Assert.assertEquals("S545", this.getStringEncoder().encode("SCANLON"));
178            Assert.assertEquals("S532", this.getStringEncoder().encode("SAINTJOHN"));
179    
180        }
181    
182        @Test
183        public void testEncodeIgnoreApostrophes() throws EncoderException {
184            this.checkEncodingVariations("O165", new String[]{
185                "OBrien",
186                "'OBrien",
187                "O'Brien",
188                "OB'rien",
189                "OBr'ien",
190                "OBri'en",
191                "OBrie'n",
192                "OBrien'"});
193        }
194    
195        /**
196         * Test data from http://www.myatt.demon.co.uk/sxalg.htm
197         *
198         * @throws EncoderException
199         */
200        @Test
201        public void testEncodeIgnoreHyphens() throws EncoderException {
202            this.checkEncodingVariations("K525", new String[]{
203                "KINGSMITH",
204                "-KINGSMITH",
205                "K-INGSMITH",
206                "KI-NGSMITH",
207                "KIN-GSMITH",
208                "KING-SMITH",
209                "KINGS-MITH",
210                "KINGSM-ITH",
211                "KINGSMI-TH",
212                "KINGSMIT-H",
213                "KINGSMITH-"});
214        }
215    
216        @Test
217        public void testEncodeIgnoreTrimmable() {
218            Assert.assertEquals("W252", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
219        }
220    
221        /**
222         * Consonants from the same code group separated by W or H are treated as one.
223         */
224        @Test
225        public void testHWRuleEx1() {
226            // From
227            // http://www.archives.gov/research_room/genealogy/census/soundex.html:
228            // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
229            // for the F). It is not coded A-226.
230            Assert.assertEquals("A261", this.getStringEncoder().encode("Ashcraft"));
231        }
232    
233        /**
234         * Consonants from the same code group separated by W or H are treated as one.
235         *
236         * Test data from http://www.myatt.demon.co.uk/sxalg.htm
237         */
238        @Test
239        public void testHWRuleEx2() {
240            Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTHDAVIS"));
241            Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTH-DAVIS"));
242        }
243    
244        /**
245         * Consonants from the same code group separated by W or H are treated as one.
246         *
247         * @throws EncoderException
248         */
249        @Test
250        public void testHWRuleEx3() throws EncoderException {
251            Assert.assertEquals("S460", this.getStringEncoder().encode("Sgler"));
252            Assert.assertEquals("S460", this.getStringEncoder().encode("Swhgler"));
253            // Also S460:
254            this.checkEncodingVariations("S460", new String[]{
255                "SAILOR",
256                "SALYER",
257                "SAYLOR",
258                "SCHALLER",
259                "SCHELLER",
260                "SCHILLER",
261                "SCHOOLER",
262                "SCHULER",
263                "SCHUYLER",
264                "SEILER",
265                "SEYLER",
266                "SHOLAR",
267                "SHULER",
268                "SILAR",
269                "SILER",
270                "SILLER"});
271        }
272    
273        /**
274         * Examples for MS SQLServer from
275         * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
276         */
277        @Test
278        public void testMsSqlServer1() {
279            Assert.assertEquals("S530", this.getStringEncoder().encode("Smith"));
280            Assert.assertEquals("S530", this.getStringEncoder().encode("Smythe"));
281        }
282    
283        /**
284         * Examples for MS SQLServer from
285         * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support
286         * /kb/articles/Q100/3/65.asp&NoWebContent=1
287         *
288         * @throws EncoderException
289         */
290        @Test
291        public void testMsSqlServer2() throws EncoderException {
292            this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"});
293        }
294    
295        /**
296         * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm
297         */
298        @Test
299        public void testMsSqlServer3() {
300            Assert.assertEquals("A500", this.getStringEncoder().encode("Ann"));
301            Assert.assertEquals("A536", this.getStringEncoder().encode("Andrew"));
302            Assert.assertEquals("J530", this.getStringEncoder().encode("Janet"));
303            Assert.assertEquals("M626", this.getStringEncoder().encode("Margaret"));
304            Assert.assertEquals("S315", this.getStringEncoder().encode("Steven"));
305            Assert.assertEquals("M240", this.getStringEncoder().encode("Michael"));
306            Assert.assertEquals("R163", this.getStringEncoder().encode("Robert"));
307            Assert.assertEquals("L600", this.getStringEncoder().encode("Laura"));
308            Assert.assertEquals("A500", this.getStringEncoder().encode("Anne"));
309        }
310    
311        /**
312         * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
313         */
314        @Test
315        public void testNewInstance() {
316            Assert.assertEquals("W452", new Soundex().soundex("Williams"));
317        }
318    
319        @Test
320        public void testNewInstance2() {
321            Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
322        }
323    
324        @Test
325        public void testNewInstance3() {
326            Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
327        }
328    
329        @Test
330        public void testSoundexUtilsConstructable() {
331            new SoundexUtils();
332        }
333    
334        @Test
335        public void testSoundexUtilsNullBehaviour() {
336            Assert.assertEquals(null, SoundexUtils.clean(null));
337            Assert.assertEquals("", SoundexUtils.clean(""));
338            Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
339            Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null));
340        }
341    
342        /**
343         * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
344         */
345        @Test
346        public void testUsEnglishStatic() {
347            Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
348        }
349    
350        /**
351         * Fancy characters are not mapped by the default US mapping.
352         *
353         * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
354         */
355        @Test
356        public void testUsMappingEWithAcute() {
357            Assert.assertEquals("E000", this.getStringEncoder().encode("e"));
358            if (Character.isLetter('\u00e9')) { // e-acute
359                try {
360                    //         uppercase E-acute
361                    Assert.assertEquals("\u00c9000", this.getStringEncoder().encode("\u00e9"));
362                    Assert.fail("Expected IllegalArgumentException not thrown");
363                } catch (final IllegalArgumentException e) {
364                    // expected
365                }
366            } else {
367                Assert.assertEquals("", this.getStringEncoder().encode("\u00e9"));
368            }
369        }
370    
371        /**
372         * Fancy characters are not mapped by the default US mapping.
373         *
374         * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
375         */
376        @Test
377        public void testUsMappingOWithDiaeresis() {
378            Assert.assertEquals("O000", this.getStringEncoder().encode("o"));
379            if (Character.isLetter('\u00f6')) { // o-umlaut
380                try {
381                    //         uppercase O-umlaut
382                    Assert.assertEquals("\u00d6000", this.getStringEncoder().encode("\u00f6"));
383                    Assert.fail("Expected IllegalArgumentException not thrown");
384                } catch (final IllegalArgumentException e) {
385                    // expected
386                }
387            } else {
388                Assert.assertEquals("", this.getStringEncoder().encode("\u00f6"));
389            }
390        }
391    }