View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import static org.junit.jupiter.api.Assertions.assertEquals;
20  
21  import org.apache.commons.codec.AbstractStringEncoderTest;
22  import org.apache.commons.codec.EncoderException;
23  import org.junit.jupiter.api.Test;
24  
25  /**
26   * Tests {@link DaitchMokotoffSoundex}.
27   * <p>
28   * Keep this file in UTF-8 encoding for proper Javadoc processing.
29   * </p>
30   */
31  public class DaitchMokotoffSoundexTest extends AbstractStringEncoderTest<DaitchMokotoffSoundex> {
32  
33      @Override
34      protected DaitchMokotoffSoundex createStringEncoder() {
35          return new DaitchMokotoffSoundex();
36      }
37  
38      private String encode(final String source) {
39          return getStringEncoder().encode(source);
40      }
41  
42      private String soundex(final String source) {
43          return getStringEncoder().soundex(source);
44      }
45  
46      @Test
47      public void testAccentedCharacterFolding() {
48          assertEquals("294795", soundex("Straßburg"));
49          assertEquals("294795", soundex("Strasburg"));
50  
51          assertEquals("095600", soundex("Éregon"));
52          assertEquals("095600", soundex("Eregon"));
53      }
54  
55      @Test
56      public void testAdjacentCodes() {
57          // AKSSOL
58          // A-KS-S-O-L
59          // 0-54-4---8 -> wrong
60          // 0-54-----8 -> correct
61          assertEquals("054800", soundex("AKSSOL"));
62  
63          // GERSCHFELD
64          // G-E-RS-CH-F-E-L-D
65          // 5--4/94-5/4-7-8-3 -> wrong
66          // 5--4/94-5/--7-8-3 -> correct
67          assertEquals("547830|545783|594783|594578", soundex("GERSCHFELD"));
68      }
69  
70      public void testEncodeBasic() {
71          // same as above, but without branching
72          assertEquals("097400", encode("AUERBACH"));
73          assertEquals("097400", encode("OHRBACH"));
74          assertEquals("874400", encode("LIPSHITZ"));
75          assertEquals("874400", encode("LIPPSZYC"));
76          assertEquals("876450", encode("LEWINSKY"));
77          assertEquals("876450", encode("LEVINSKI"));
78          assertEquals("486740", encode("SZLAMAWICZ"));
79          assertEquals("486740", encode("SHLAMOVITZ"));
80      }
81  
82      @Test
83      public void testEncodeIgnoreApostrophes() throws EncoderException {
84          this.checkEncodingVariations("079600", new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
85                  "OBri'en", "OBrie'n", "OBrien'" });
86      }
87  
88      /**
89       * Test data from http://www.myatt.demon.co.uk/sxalg.htm
90       *
91       * @throws EncoderException for some failure scenarios     */
92      @Test
93      public void testEncodeIgnoreHyphens() throws EncoderException {
94          this.checkEncodingVariations("565463", new String[] { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
95                  "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" });
96      }
97  
98      @Test
99      public void testEncodeIgnoreTrimmable() {
100         assertEquals("746536", encode(" \t\n\r Washington \t\n\r "));
101         assertEquals("746536", encode("Washington"));
102     }
103 
104     /**
105      * Examples from http://www.jewishgen.org/infofiles/soundex.html
106      */
107     @Test
108     public void testSoundexBasic() {
109         assertEquals("583600", soundex("GOLDEN"));
110         assertEquals("087930", soundex("Alpert"));
111         assertEquals("791900", soundex("Breuer"));
112         assertEquals("579000", soundex("Haber"));
113         assertEquals("665600", soundex("Mannheim"));
114         assertEquals("664000", soundex("Mintz"));
115         assertEquals("370000", soundex("Topf"));
116         assertEquals("586660", soundex("Kleinmann"));
117         assertEquals("769600", soundex("Ben Aron"));
118 
119         assertEquals("097400|097500", soundex("AUERBACH"));
120         assertEquals("097400|097500", soundex("OHRBACH"));
121         assertEquals("874400", soundex("LIPSHITZ"));
122         assertEquals("874400|874500", soundex("LIPPSZYC"));
123         assertEquals("876450", soundex("LEWINSKY"));
124         assertEquals("876450", soundex("LEVINSKI"));
125         assertEquals("486740", soundex("SZLAMAWICZ"));
126         assertEquals("486740", soundex("SHLAMOVITZ"));
127     }
128 
129     /**
130      * Examples from http://www.avotaynu.com/soundex.htm
131      */
132     @Test
133     public void testSoundexBasic2() {
134         assertEquals("467000|567000", soundex("Ceniow"));
135         assertEquals("467000", soundex("Tsenyuv"));
136         assertEquals("587400|587500", soundex("Holubica"));
137         assertEquals("587400", soundex("Golubitsa"));
138         assertEquals("746480|794648", soundex("Przemysl"));
139         assertEquals("746480", soundex("Pshemeshil"));
140         assertEquals("944744|944745|944754|944755|945744|945745|945754|945755", soundex("Rosochowaciec"));
141         assertEquals("945744", soundex("Rosokhovatsets"));
142     }
143 
144     /**
145      * Examples from https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
146      */
147     @Test
148     public void testSoundexBasic3() {
149         assertEquals("734000|739400", soundex("Peters"));
150         assertEquals("734600|739460", soundex("Peterson"));
151         assertEquals("645740", soundex("Moskowitz"));
152         assertEquals("645740", soundex("Moskovitz"));
153         assertEquals("154600|145460|454600|445460", soundex("Jackson"));
154         assertEquals("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464",
155                 soundex("Jackson-Jackson"));
156     }
157 
158     @Test
159     public void testSpecialRomanianCharacters() {
160         assertEquals("364000|464000", soundex("ţamas")); // t-cedilla
161         assertEquals("364000|464000", soundex("țamas")); // t-comma
162     }
163 
164 }