001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 // (FYI: Formatted and sorted with Eclipse)
019
020 package org.apache.commons.codec.language;
021
022 import org.junit.Assert;
023
024 import org.apache.commons.codec.EncoderException;
025 import org.apache.commons.codec.StringEncoderAbstractTest;
026 import org.junit.Test;
027
028 /**
029 * Tests {@link Soundex}.
030 *
031 * <p>Keep this file in UTF-8 encoding for proper Javadoc processing.</p>
032 *
033 * @version $Id: SoundexTest.html 889935 2013-12-11 05:05:13Z ggregory $
034 */
035 public class SoundexTest extends StringEncoderAbstractTest<Soundex> {
036
037 @Override
038 protected Soundex createStringEncoder() {
039 return new Soundex();
040 }
041
042 @Test
043 public void testB650() throws EncoderException {
044 this.checkEncodingVariations("B650", new String[]{
045 "BARHAM",
046 "BARONE",
047 "BARRON",
048 "BERNA",
049 "BIRNEY",
050 "BIRNIE",
051 "BOOROM",
052 "BOREN",
053 "BORN",
054 "BOURN",
055 "BOURNE",
056 "BOWRON",
057 "BRAIN",
058 "BRAME",
059 "BRANN",
060 "BRAUN",
061 "BREEN",
062 "BRIEN",
063 "BRIM",
064 "BRIMM",
065 "BRINN",
066 "BRION",
067 "BROOM",
068 "BROOME",
069 "BROWN",
070 "BROWNE",
071 "BRUEN",
072 "BRUHN",
073 "BRUIN",
074 "BRUMM",
075 "BRUN",
076 "BRUNO",
077 "BRYAN",
078 "BURIAN",
079 "BURN",
080 "BURNEY",
081 "BYRAM",
082 "BYRNE",
083 "BYRON",
084 "BYRUM"});
085 }
086
087 @Test
088 public void testBadCharacters() {
089 Assert.assertEquals("H452", this.getStringEncoder().encode("HOL>MES"));
090
091 }
092
093 @Test
094 public void testDifference() throws EncoderException {
095 // Edge cases
096 Assert.assertEquals(0, this.getStringEncoder().difference(null, null));
097 Assert.assertEquals(0, this.getStringEncoder().difference("", ""));
098 Assert.assertEquals(0, this.getStringEncoder().difference(" ", " "));
099 // Normal cases
100 Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
101 Assert.assertEquals(2, this.getStringEncoder().difference("Ann", "Andrew"));
102 Assert.assertEquals(1, this.getStringEncoder().difference("Margaret", "Andrew"));
103 Assert.assertEquals(0, this.getStringEncoder().difference("Janet", "Margaret"));
104 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
105 Assert.assertEquals(4, this.getStringEncoder().difference("Green", "Greene"));
106 Assert.assertEquals(0, this.getStringEncoder().difference("Blotchet-Halls", "Greene"));
107 // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
108 Assert.assertEquals(4, this.getStringEncoder().difference("Smith", "Smythe"));
109 Assert.assertEquals(4, this.getStringEncoder().difference("Smithers", "Smythers"));
110 Assert.assertEquals(2, this.getStringEncoder().difference("Anothers", "Brothers"));
111 }
112
113 @Test
114 public void testEncodeBasic() {
115 Assert.assertEquals("T235", this.getStringEncoder().encode("testing"));
116 Assert.assertEquals("T000", this.getStringEncoder().encode("The"));
117 Assert.assertEquals("Q200", this.getStringEncoder().encode("quick"));
118 Assert.assertEquals("B650", this.getStringEncoder().encode("brown"));
119 Assert.assertEquals("F200", this.getStringEncoder().encode("fox"));
120 Assert.assertEquals("J513", this.getStringEncoder().encode("jumped"));
121 Assert.assertEquals("O160", this.getStringEncoder().encode("over"));
122 Assert.assertEquals("T000", this.getStringEncoder().encode("the"));
123 Assert.assertEquals("L200", this.getStringEncoder().encode("lazy"));
124 Assert.assertEquals("D200", this.getStringEncoder().encode("dogs"));
125 }
126
127 /**
128 * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html
129 */
130 @Test
131 public void testEncodeBatch2() {
132 Assert.assertEquals("A462", this.getStringEncoder().encode("Allricht"));
133 Assert.assertEquals("E166", this.getStringEncoder().encode("Eberhard"));
134 Assert.assertEquals("E521", this.getStringEncoder().encode("Engebrethson"));
135 Assert.assertEquals("H512", this.getStringEncoder().encode("Heimbach"));
136 Assert.assertEquals("H524", this.getStringEncoder().encode("Hanselmann"));
137 Assert.assertEquals("H431", this.getStringEncoder().encode("Hildebrand"));
138 Assert.assertEquals("K152", this.getStringEncoder().encode("Kavanagh"));
139 Assert.assertEquals("L530", this.getStringEncoder().encode("Lind"));
140 Assert.assertEquals("L222", this.getStringEncoder().encode("Lukaschowsky"));
141 Assert.assertEquals("M235", this.getStringEncoder().encode("McDonnell"));
142 Assert.assertEquals("M200", this.getStringEncoder().encode("McGee"));
143 Assert.assertEquals("O155", this.getStringEncoder().encode("Opnian"));
144 Assert.assertEquals("O155", this.getStringEncoder().encode("Oppenheimer"));
145 Assert.assertEquals("R355", this.getStringEncoder().encode("Riedemanas"));
146 Assert.assertEquals("Z300", this.getStringEncoder().encode("Zita"));
147 Assert.assertEquals("Z325", this.getStringEncoder().encode("Zitzmeinn"));
148 }
149
150 /**
151 * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html
152 */
153 @Test
154 public void testEncodeBatch3() {
155 Assert.assertEquals("W252", this.getStringEncoder().encode("Washington"));
156 Assert.assertEquals("L000", this.getStringEncoder().encode("Lee"));
157 Assert.assertEquals("G362", this.getStringEncoder().encode("Gutierrez"));
158 Assert.assertEquals("P236", this.getStringEncoder().encode("Pfister"));
159 Assert.assertEquals("J250", this.getStringEncoder().encode("Jackson"));
160 Assert.assertEquals("T522", this.getStringEncoder().encode("Tymczak"));
161 // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also
162 // possible.
163 Assert.assertEquals("V532", this.getStringEncoder().encode("VanDeusen"));
164 }
165
166 /**
167 * Examples from: http://www.myatt.demon.co.uk/sxalg.htm
168 */
169 @Test
170 public void testEncodeBatch4() {
171 Assert.assertEquals("H452", this.getStringEncoder().encode("HOLMES"));
172 Assert.assertEquals("A355", this.getStringEncoder().encode("ADOMOMI"));
173 Assert.assertEquals("V536", this.getStringEncoder().encode("VONDERLEHR"));
174 Assert.assertEquals("B400", this.getStringEncoder().encode("BALL"));
175 Assert.assertEquals("S000", this.getStringEncoder().encode("SHAW"));
176 Assert.assertEquals("J250", this.getStringEncoder().encode("JACKSON"));
177 Assert.assertEquals("S545", this.getStringEncoder().encode("SCANLON"));
178 Assert.assertEquals("S532", this.getStringEncoder().encode("SAINTJOHN"));
179
180 }
181
182 @Test
183 public void testEncodeIgnoreApostrophes() throws EncoderException {
184 this.checkEncodingVariations("O165", new String[]{
185 "OBrien",
186 "'OBrien",
187 "O'Brien",
188 "OB'rien",
189 "OBr'ien",
190 "OBri'en",
191 "OBrie'n",
192 "OBrien'"});
193 }
194
195 /**
196 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
197 *
198 * @throws EncoderException
199 */
200 @Test
201 public void testEncodeIgnoreHyphens() throws EncoderException {
202 this.checkEncodingVariations("K525", new String[]{
203 "KINGSMITH",
204 "-KINGSMITH",
205 "K-INGSMITH",
206 "KI-NGSMITH",
207 "KIN-GSMITH",
208 "KING-SMITH",
209 "KINGS-MITH",
210 "KINGSM-ITH",
211 "KINGSMI-TH",
212 "KINGSMIT-H",
213 "KINGSMITH-"});
214 }
215
216 @Test
217 public void testEncodeIgnoreTrimmable() {
218 Assert.assertEquals("W252", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
219 }
220
221 /**
222 * Consonants from the same code group separated by W or H are treated as one.
223 */
224 @Test
225 public void testHWRuleEx1() {
226 // From
227 // http://www.archives.gov/research_room/genealogy/census/soundex.html:
228 // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1
229 // for the F). It is not coded A-226.
230 Assert.assertEquals("A261", this.getStringEncoder().encode("Ashcraft"));
231 }
232
233 /**
234 * Consonants from the same code group separated by W or H are treated as one.
235 *
236 * Test data from http://www.myatt.demon.co.uk/sxalg.htm
237 */
238 @Test
239 public void testHWRuleEx2() {
240 Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTHDAVIS"));
241 Assert.assertEquals("B312", this.getStringEncoder().encode("BOOTH-DAVIS"));
242 }
243
244 /**
245 * Consonants from the same code group separated by W or H are treated as one.
246 *
247 * @throws EncoderException
248 */
249 @Test
250 public void testHWRuleEx3() throws EncoderException {
251 Assert.assertEquals("S460", this.getStringEncoder().encode("Sgler"));
252 Assert.assertEquals("S460", this.getStringEncoder().encode("Swhgler"));
253 // Also S460:
254 this.checkEncodingVariations("S460", new String[]{
255 "SAILOR",
256 "SALYER",
257 "SAYLOR",
258 "SCHALLER",
259 "SCHELLER",
260 "SCHILLER",
261 "SCHOOLER",
262 "SCHULER",
263 "SCHUYLER",
264 "SEILER",
265 "SEYLER",
266 "SHOLAR",
267 "SHULER",
268 "SILAR",
269 "SILER",
270 "SILLER"});
271 }
272
273 /**
274 * Examples for MS SQLServer from
275 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
276 */
277 @Test
278 public void testMsSqlServer1() {
279 Assert.assertEquals("S530", this.getStringEncoder().encode("Smith"));
280 Assert.assertEquals("S530", this.getStringEncoder().encode("Smythe"));
281 }
282
283 /**
284 * Examples for MS SQLServer from
285 * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support
286 * /kb/articles/Q100/3/65.asp&NoWebContent=1
287 *
288 * @throws EncoderException
289 */
290 @Test
291 public void testMsSqlServer2() throws EncoderException {
292 this.checkEncodingVariations("E625", new String[]{"Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen"});
293 }
294
295 /**
296 * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm
297 */
298 @Test
299 public void testMsSqlServer3() {
300 Assert.assertEquals("A500", this.getStringEncoder().encode("Ann"));
301 Assert.assertEquals("A536", this.getStringEncoder().encode("Andrew"));
302 Assert.assertEquals("J530", this.getStringEncoder().encode("Janet"));
303 Assert.assertEquals("M626", this.getStringEncoder().encode("Margaret"));
304 Assert.assertEquals("S315", this.getStringEncoder().encode("Steven"));
305 Assert.assertEquals("M240", this.getStringEncoder().encode("Michael"));
306 Assert.assertEquals("R163", this.getStringEncoder().encode("Robert"));
307 Assert.assertEquals("L600", this.getStringEncoder().encode("Laura"));
308 Assert.assertEquals("A500", this.getStringEncoder().encode("Anne"));
309 }
310
311 /**
312 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
313 */
314 @Test
315 public void testNewInstance() {
316 Assert.assertEquals("W452", new Soundex().soundex("Williams"));
317 }
318
319 @Test
320 public void testNewInstance2() {
321 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).soundex("Williams"));
322 }
323
324 @Test
325 public void testNewInstance3() {
326 Assert.assertEquals("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).soundex("Williams"));
327 }
328
329 @Test
330 public void testSoundexUtilsConstructable() {
331 new SoundexUtils();
332 }
333
334 @Test
335 public void testSoundexUtilsNullBehaviour() {
336 Assert.assertEquals(null, SoundexUtils.clean(null));
337 Assert.assertEquals("", SoundexUtils.clean(""));
338 Assert.assertEquals(0, SoundexUtils.differenceEncoded(null, ""));
339 Assert.assertEquals(0, SoundexUtils.differenceEncoded("", null));
340 }
341
342 /**
343 * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56
344 */
345 @Test
346 public void testUsEnglishStatic() {
347 Assert.assertEquals("W452", Soundex.US_ENGLISH.soundex("Williams"));
348 }
349
350 /**
351 * Fancy characters are not mapped by the default US mapping.
352 *
353 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
354 */
355 @Test
356 public void testUsMappingEWithAcute() {
357 Assert.assertEquals("E000", this.getStringEncoder().encode("e"));
358 if (Character.isLetter('\u00e9')) { // e-acute
359 try {
360 // uppercase E-acute
361 Assert.assertEquals("\u00c9000", this.getStringEncoder().encode("\u00e9"));
362 Assert.fail("Expected IllegalArgumentException not thrown");
363 } catch (final IllegalArgumentException e) {
364 // expected
365 }
366 } else {
367 Assert.assertEquals("", this.getStringEncoder().encode("\u00e9"));
368 }
369 }
370
371 /**
372 * Fancy characters are not mapped by the default US mapping.
373 *
374 * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080
375 */
376 @Test
377 public void testUsMappingOWithDiaeresis() {
378 Assert.assertEquals("O000", this.getStringEncoder().encode("o"));
379 if (Character.isLetter('\u00f6')) { // o-umlaut
380 try {
381 // uppercase O-umlaut
382 Assert.assertEquals("\u00d6000", this.getStringEncoder().encode("\u00f6"));
383 Assert.fail("Expected IllegalArgumentException not thrown");
384 } catch (final IllegalArgumentException e) {
385 // expected
386 }
387 } else {
388 Assert.assertEquals("", this.getStringEncoder().encode("\u00f6"));
389 }
390 }
391 }