/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.commons.codec.language;
19  
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  
22  import java.util.stream.IntStream;
23  
24  import org.apache.commons.codec.AbstractStringEncoderTest;
25  import org.apache.commons.codec.EncoderException;
26  import org.junit.jupiter.api.Test;
27  import org.junit.jupiter.params.ParameterizedTest;
28  import org.junit.jupiter.params.provider.MethodSource;
29  
/**
 * Tests {@link DaitchMokotoffSoundex}.
 * <p>
 * Keep this file in UTF-8 encoding for proper Javadoc processing.
 * </p>
 */
36  class DaitchMokotoffSoundexTest extends AbstractStringEncoderTest<DaitchMokotoffSoundex> {
37  
38      static IntStream getNonLetters() {
39          return IntStream.rangeClosed(Character.MIN_VALUE, Character.MAX_VALUE).filter(c -> !Character.isLetter(c));
40      }
41  
42      @Override
43      protected DaitchMokotoffSoundex createStringEncoder() {
44          return new DaitchMokotoffSoundex();
45      }
46  
47      private String encode(final String source) {
48          return getStringEncoder().encode(source);
49      }
50  
51      private String soundex(final String source) {
52          return getStringEncoder().soundex(source);
53      }
54  
55      @Test
56      void testAccentedCharacterFolding() {
57          assertEquals("294795", soundex("Straßburg"));
58          assertEquals("294795", soundex("Strasburg"));
59          assertEquals("095600", soundex("Éregon"));
60          assertEquals("095600", soundex("Eregon"));
61      }
62  
63      @Test
64      void testAdjacentCodes() {
65          // AKSSOL
66          // A-KS-S-O-L
67          // 0-54-4---8 -> wrong
68          // 0-54-----8 -> correct
69          assertEquals("054800", soundex("AKSSOL"));
70          // GERSCHFELD
71          // G-E-RS-CH-F-E-L-D
72          // 5--4/94-5/4-7-8-3 -> wrong
73          // 5--4/94-5/--7-8-3 -> correct
74          assertEquals("547830|545783|594783|594578", soundex("GERSCHFELD"));
75      }
76  
77      @Test
78      void testEncodeBasic() {
79          // same as above, but without branching
80          assertEquals("097400", encode("AUERBACH"));
81          assertEquals("097400", encode("OHRBACH"));
82          assertEquals("874400", encode("LIPSHITZ"));
83          assertEquals("874400", encode("LIPPSZYC"));
84          assertEquals("876450", encode("LEWINSKY"));
85          assertEquals("876450", encode("LEVINSKI"));
86          assertEquals("486740", encode("SZLAMAWICZ"));
87          assertEquals("486740", encode("SHLAMOVITZ"));
88      }
89  
90      @Test
91      void testEncodeIgnoreApostrophes() throws EncoderException {
92          checkEncodingVariations("079600", "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'");
93      }
94  
95      /**
96       * Test data from http://www.myatt.demon.co.uk/sxalg.htm
97       *
98       * @throws EncoderException for some failure scenarios
99       */
100     @Test
101     void testEncodeIgnoreHyphens() throws EncoderException {
102         checkEncodingVariations("565463", "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH", "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH",
103                 "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-");
104     }
105 
106     @ParameterizedTest
107     @MethodSource("getNonLetters")
108     void testEncodeIgnoreNonLetters(final int nonLetterInt) throws EncoderException {
109         final char nonLetterChar = (char) nonLetterInt;
110         checkEncodingVariations("746536", "Washington" + nonLetterChar, nonLetterChar + "Washington", nonLetterChar + "Washington" + nonLetterChar,
111                 "Washi" + nonLetterChar + "ngton");
112     }
113 
114     @Test
115     void testEncodeIgnoreTrimmable() {
116         assertEquals("746536", encode(" \t\n\r Washington \t\n\r "));
117         assertEquals("746536", encode("Washington"));
118     }
119 
120     /**
121      * Examples from http://www.jewishgen.org/infofiles/soundex.html
122      */
123     @Test
124     void testSoundexBasic() {
125         assertEquals("583600", soundex("GOLDEN"));
126         assertEquals("087930", soundex("Alpert"));
127         assertEquals("791900", soundex("Breuer"));
128         assertEquals("579000", soundex("Haber"));
129         assertEquals("665600", soundex("Mannheim"));
130         assertEquals("664000", soundex("Mintz"));
131         assertEquals("370000", soundex("Topf"));
132         assertEquals("586660", soundex("Kleinmann"));
133         assertEquals("769600", soundex("Ben Aron"));
134         assertEquals("097400|097500", soundex("AUERBACH"));
135         assertEquals("097400|097500", soundex("OHRBACH"));
136         assertEquals("874400", soundex("LIPSHITZ"));
137         assertEquals("874400|874500", soundex("LIPPSZYC"));
138         assertEquals("876450", soundex("LEWINSKY"));
139         assertEquals("876450", soundex("LEVINSKI"));
140         assertEquals("486740", soundex("SZLAMAWICZ"));
141         assertEquals("486740", soundex("SHLAMOVITZ"));
142     }
143 
144     /**
145      * Examples from http://www.avotaynu.com/soundex.htm
146      */
147     @Test
148     void testSoundexBasic2() {
149         assertEquals("467000|567000", soundex("Ceniow"));
150         assertEquals("467000", soundex("Tsenyuv"));
151         assertEquals("587400|587500", soundex("Holubica"));
152         assertEquals("587400", soundex("Golubitsa"));
153         assertEquals("746480|794648", soundex("Przemysl"));
154         assertEquals("746480", soundex("Pshemeshil"));
155         assertEquals("944744|944745|944754|944755|945744|945745|945754|945755", soundex("Rosochowaciec"));
156         assertEquals("945744", soundex("Rosokhovatsets"));
157     }
158 
159     /**
160      * Examples from https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
161      */
162     @Test
163     void testSoundexBasic3() {
164         assertEquals("734000|739400", soundex("Peters"));
165         assertEquals("734600|739460", soundex("Peterson"));
166         assertEquals("645740", soundex("Moskowitz"));
167         assertEquals("645740", soundex("Moskovitz"));
168         assertEquals("154600|145460|454600|445460", soundex("Jackson"));
169         final String jacksonJackson = "154654|154645|154644|145465|145464|454654|454645|454644|445465|445464";
170         assertEquals(jacksonJackson, soundex("Jackson-Jackson"));
171         assertEquals(jacksonJackson, soundex("Jackson--Jackson"));
172         assertEquals(jacksonJackson, soundex("Jackson—Jackson"));
173         assertEquals(jacksonJackson, soundex("Jackson_Jackson"));
174         assertEquals(jacksonJackson, soundex("Jackson$Jackson"));
175         assertEquals(jacksonJackson, soundex("JacksonJackson"));
176     }
177 
178     @Test
179     void testSpecialRomanianCharacters() {
180         assertEquals("364000|464000", soundex("ţamas")); // t-cedilla
181         assertEquals("364000|464000", soundex("țamas")); // t-comma
182     }
183 }