001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018// (FYI: Formatted and sorted with Eclipse)
019
020package org.apache.commons.codec.language;
021
022import org.junit.Assert;
023
024import org.apache.commons.codec.EncoderException;
025import org.apache.commons.codec.StringEncoderAbstractTest;
026import org.junit.Test;
027
028/**
029 * Tests {@link Soundex}.
030 *
031 * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
032 *
033 * @version $Id: SoundexTest.html 891688 2013-12-24 20:49:46Z ggregory $
034 */
035public class SoundexTest extends StringEncoderAbstractTest<Soundex> {
036
037    @Override
038    protected Soundex createStringEncoder() {
039        return new Soundex();
040    }
041
042    @Test
043    public void testB650() throws EncoderException {
044        this.checkEncodingVariations("B650", new String[]{
045            "BARHAM",
046            "BARONE",
047            "BARRON",
048            "BERNA",
049            "BIRNEY",
050            "BIRNIE",
051            "BOOROM",
052            "BOREN",
053            "BORN",
054            "BOURN",
055            "BOURNE",
056            "BOWRON",
057            "BRAIN",
058            "BRAME",
059            "BRANN",
060            "BRAUN",
061            "BREEN",
062            "BRIEN",
063            "BRIM",
064            "BRIMM",
065            "BRINN",
066            "BRION",
067            "BROOM",
068            "BROOME",
069            "BROWN",
070            "BROWNE",
071            "BRUEN",
072            "BRUHN",
073            "BRUIN",
074            "BRUMM",
075            "BRUN",
076            "BRUNO",
077            "BRYAN",
078            "BURIAN",
079            "BURN",
080            "BURNEY",
081            "BYRAM",
082            "BYRNE",
083            "BYRON",
084            "BYRUM"});
085    }
086
087    @Test
088    public void testBadCharacters() {
089        Assert.assertEquals("H452", this.getStringEncoder().encode("HOL>MES"));
090
091    }
092
093    @Test
094    public void testDifference() throws EncoderException {
095        // Edge cases
096        Assert.assertEquals(0, this.getStringEncoder().difference(null, null));
097        Assert.assertEquals(0, this.getStringEncoder().difference("", ""));
098        Assert.assertEquals(0, this.getStringEncoder().difference(" ", " "));
099        // Normal cases
100        Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
101        Assert.assertEquals(2, this.getStringEncoder().difference("Ann", "Andrew"));
102        Assert.assertEquals(1, this.getStringEncoder().difference("Margaret", "Andrew"));
103        Assert.assertEquals(0, this.getStringEncoder().difference("Janet", "Margaret"));
104        // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
105        Assert.assertEquals(4, this.getStringEncoder().difference("Green", "Greene"));
106        Assert.assertEquals(0, this.getStringEncoder().difference("Blotchet-Halls", "Greene"));
107        // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
108        Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
109        Assert.assertEquals(4, this.getStringEncoder().difference("Smithers", "Smythers"));
110        Assert.assertEquals(2, this.getStringEncoder().difference("Anothers", "Brothers"));
111    }
112
113    @Test
114    public void testEncodeBasic() {
115        Assert.assertEquals("T235", this.getStringEncoder().encode("testing"));
116        Assert.assertEquals("T000", this.getStringEncoder().encode("The"));
117        Assert.assertEquals("Q200", this.getStringEncoder().encode("quick"));
118        Assert.assertEquals("B650", this.getStringEncoder().encode("brown"));
119        Assert.assertEquals("F200", this.getStringEncoder().encode("fox"));
120        Assert.assertEquals("J513", this.getStringEncoder().encode("jumped"));
121        Assert.assertEquals("O160", this.getStringEncoder().encode("over"));
122        Assert.assertEquals("T000", this.getStringEncoder().encode("the"));
123        Assert.assertEquals("L200", this.getStringEncoder().encode("lazy"));
124        Assert.assertEquals("D200", this.getStringEncoder().encode("dogs"));
125    }
126
127    /**
128     * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
129     */
130    @Test
131    public void testEncodeBatch2() {
132        Assert.assertEquals("A462", this.getStringEncoder().encode("Allricht"));
133        Assert.assertEquals("E166", this.getStringEncoder().encode("Eberhard"));
134        Assert.assertEquals("E521", this.getStringEncoder().encode("Engebrethson"));
135        Assert.assertEquals("H512", this.getStringEncoder().encode("Heimbach"));
136        Assert.assertEquals("H524", this.getStringEncoder().encode("Hanselmann"));
137        Assert.assertEquals("H431", this.getStringEncoder().encode("Hildebrand"));
138        Assert.assertEquals("K152", this.getStringEncoder().encode("Kavanagh"));
139        Assert.assertEquals("L530", this.getStringEncoder().encode("Lind"));
140        Assert.assertEquals("L222", this.getStringEncoder().encode("Lukaschowsky"));
141        Assert.assertEquals("M235", this.getStringEncoder().encode("McDonnell"));
142        Assert.assertEquals("M200", this.getStringEncoder().encode("McGee"));
143        Assert.assertEquals("O155", this.getStringEncoder().encode("Opnian"));
144        Assert.assertEquals("O155", this.getStringEncoder().encode("Oppenheimer"));
145        Assert.assertEquals("R355", this.getStringEncoder().encode("Riedemanas"));
146        Assert.assertEquals("Z300", this.getStringEncoder().encode("Zita"));
147        Assert.assertEquals("Z325", this.getStringEncoder().encode("Zitzmeinn"));
148    }
149
150    /**
151     * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
152     */
153    @Test
154    public void testEncodeBatch3() {
155        Assert.assertEquals("W252", this.getStringEncoder().encode("Washington"));
156        Assert.assertEquals("L000", this.getStringEncoder().encode("Lee"));
157        Assert.assertEquals("G362", this.getStringEncoder().encode("Gutierrez"));
158        Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
159        Assert.assertEquals("J250", this.getStringEncoder().encode("Jackson"));
160        Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
161        // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
162        // possible.
163        Assert.assertEquals("V532", this.getStringEncoder().encode("VanDeusen"));
164    }
165
166    /**
167     * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
168     */
169    @Test
170    public void testEncodeBatch4() {
171        Assert.assertEquals("H452", this.getStringEncoder().encode("HOLMES"));
172        Assert.assertEquals("A355", this.getStringEncoder().encode("ADOMOMI"));
173        Assert.assertEquals("V536", this.getStringEncoder().encode("VONDERLEHR"));
174        Assert.assertEquals("B400", this.getStringEncoder().encode("BALL"));
175        Assert.assertEquals("S000", this.getStringEncoder().encode("SHAW"));
176        Assert.assertEquals("J250", this.getStringEncoder().encode("JACKSON"));
177        Assert.assertEquals("S545", this.getStringEncoder().encode("SCANLON"));
178        Assert.assertEquals("S532", this.getStringEncoder().encode("SAINTJOHN"));
179
180    }
181
182    @Test
183    public void testEncodeIgnoreApostrophes() throws EncoderException {
184        this.checkEncodingVariations("O165", new String[]{
185            "OBrien",
186            "'OBrien",
187            "O'Brien",
188            "OB'rien",
189            "OBr'ien",
190            "OBri'en",
191            "OBrie'n",
192            "OBrien'"});
193    }
194
195    /**
196     * Test data from http://www.myatt.demon.co.uk/sxalg.htm
197     *
198     * @throws EncoderException
199     */
200    @Test
201    public void testEncodeIgnoreHyphens() throws EncoderException {
202        this.checkEncodingVariations("K525", new String[]{
203            "KINGSMITH",
204            "-KINGSMITH",
205            "K-INGSMITH",
206            "KI-NGSMITH",
207            "KIN-GSMITH",
208            "KING-SMITH",
209            "KINGS-MITH",
210            "KINGSM-ITH",
211            "KINGSMI-TH",
212            "KINGSMIT-H",
213            "KINGSMITH-"});
214    }
215
216    @Test
217    public void testEncodeIgnoreTrimmable() {
218        Assert.assertEquals("W252", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
219    }
220
221    /**
222     * Consonants from the same code group separated by W or H are treated as one.
223     */
224    @Test
225    public void testHWRuleEx1() {
226        // From
227        // http://www.archives.gov/research_room/genealogy/census/soundex.html:
228        // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
229        // for the F). It is not coded A-226.
230        Assert.assertEquals("A261", this.getStringEncoder().encode("Ashcraft"));
231    }
232
233    /**
234     * Consonants from the same code group separated by W or H are treated as one.
235     *
236     * Test data from http://www.myatt.demon.co.uk/sxalg.htm
237     */
238    @Test
239    public void testHWRuleEx2() {
240        Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTHDAVIS"));
241        Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTH-DAVIS"));
242    }
243
244    /**
245     * Consonants from the same code group separated by W or H are treated as one.
246     *
247     * @throws EncoderException
248     */
249    @Test
250    public void testHWRuleEx3() throws EncoderException {
251        Assert.assertEquals("S460", this.getStringEncoder().encode("Sgler"));
252        Assert.assertEquals("S460", this.getStringEncoder().encode("Swhgler"));
253        // Also S460:
254        this.checkEncodingVariations("S460", new String[]{
255            "SAILOR",
256            "SALYER",
257            "SAYLOR",
258            "SCHALLER",
259            "SCHELLER",
260            "SCHILLER",
261            "SCHOOLER",
262            "SCHULER",
263            "SCHUYLER",
264            "SEILER",
265            "SEYLER",
266            "SHOLAR",
267            "SHULER",
268            "SILAR",
269            "SILER",
270            "SILLER"});
271    }
272
273    /**
274     * Examples for MS SQLServer from
275     * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
276     */
277    @Test
278    public void testMsSqlServer1() {
279        Assert.assertEquals("S530", this.getStringEncoder().encode("Smith"));
280        Assert.assertEquals("S530", this.getStringEncoder().encode("Smythe"));
281    }
282
283    /**
284     * Examples for MS SQLServer from
285     * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support
286     * /kb/articles/Q100/3/65.asp&NoWebContent=1
287     *
288     * @throws EncoderException
289     */
290    @Test
291    public void testMsSqlServer2() throws EncoderException {
292        this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"});
293    }
294
295    /**
296     * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm
297     */
298    @Test
299    public void testMsSqlServer3() {
300        Assert.assertEquals("A500", this.getStringEncoder().encode("Ann"));
301        Assert.assertEquals("A536", this.getStringEncoder().encode("Andrew"));
302        Assert.assertEquals("J530", this.getStringEncoder().encode("Janet"));
303        Assert.assertEquals("M626", this.getStringEncoder().encode("Margaret"));
304        Assert.assertEquals("S315", this.getStringEncoder().encode("Steven"));
305        Assert.assertEquals("M240", this.getStringEncoder().encode("Michael"));
306        Assert.assertEquals("R163", this.getStringEncoder().encode("Robert"));
307        Assert.assertEquals("L600", this.getStringEncoder().encode("Laura"));
308        Assert.assertEquals("A500", this.getStringEncoder().encode("Anne"));
309    }
310
311    /**
312     * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
313     */
314    @Test
315    public void testNewInstance() {
316        Assert.assertEquals("W452", new Soundex().soundex("Williams"));
317    }
318
319    @Test
320    public void testNewInstance2() {
321        Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
322    }
323
324    @Test
325    public void testNewInstance3() {
326        Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
327    }
328
329    @Test
330    public void testSoundexUtilsConstructable() {
331        new SoundexUtils();
332    }
333
334    @Test
335    public void testSoundexUtilsNullBehaviour() {
336        Assert.assertEquals(null, SoundexUtils.clean(null));
337        Assert.assertEquals("", SoundexUtils.clean(""));
338        Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
339        Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null));
340    }
341
342    /**
343     * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
344     */
345    @Test
346    public void testUsEnglishStatic() {
347        Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
348    }
349
350    /**
351     * Fancy characters are not mapped by the default US mapping.
352     *
353     * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
354     */
355    @Test
356    public void testUsMappingEWithAcute() {
357        Assert.assertEquals("E000", this.getStringEncoder().encode("e"));
358        if (Character.isLetter('\u00e9')) { // e-acute
359            try {
360                //         uppercase E-acute
361                Assert.assertEquals("\u00c9000", this.getStringEncoder().encode("\u00e9"));
362                Assert.fail("Expected IllegalArgumentException not thrown");
363            } catch (final IllegalArgumentException e) {
364                // expected
365            }
366        } else {
367            Assert.assertEquals("", this.getStringEncoder().encode("\u00e9"));
368        }
369    }
370
371    /**
372     * Fancy characters are not mapped by the default US mapping.
373     *
374     * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
375     */
376    @Test
377    public void testUsMappingOWithDiaeresis() {
378        Assert.assertEquals("O000", this.getStringEncoder().encode("o"));
379        if (Character.isLetter('\u00f6')) { // o-umlaut
380            try {
381                //         uppercase O-umlaut
382                Assert.assertEquals("\u00d6000", this.getStringEncoder().encode("\u00f6"));
383                Assert.fail("Expected IllegalArgumentException not thrown");
384            } catch (final IllegalArgumentException e) {
385                // expected
386            }
387        } else {
388            Assert.assertEquals("", this.getStringEncoder().encode("\u00f6"));
389        }
390    }
391}