View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  import static org.junit.jupiter.api.Assertions.assertThrows;
22  import static org.junit.jupiter.api.Assertions.assertTrue;
23  
24  import java.util.HashSet;
25  import java.util.Locale;
26  import java.util.Set;
27  
28  import org.apache.commons.codec.AbstractStringEncoderTest;
29  import org.apache.commons.codec.EncoderException;
30  import org.junit.jupiter.api.AfterAll;
31  import org.junit.jupiter.api.Test;
32  import org.opentest4j.AssertionFailedError;
33  
34  /**
35   * Tests the {@code ColognePhonetic} class.
36   *
37   * <p>
38   * Keep this file in UTF-8 encoding for proper Javadoc processing.
39   * </p>
40   */
41  public class ColognePhoneticTest extends AbstractStringEncoderTest<ColognePhonetic> {
42  
43      private static final Set<String> TESTSET = new HashSet<>();
44  
45      /** Character sequences to be tested by the code. */
46      // @formatter:off
47      private static final String[] MATCHES = {
48              ".*[AEIOUJY].*",         // A, E, I, J, O, U, Y
49              ".*H.*",                 // H
50              ".*B.*",                 // B
51              ".*P[^H].*",             // P not before H
52              ".*[DT][^CSZ].*",        // D,T not before C,S,Z
53              ".*[FVW].*",             // F,V,W
54              ".*PH.*",                // P before H
55              ".*[GKQ].*",             // G,K,Q
56              "C[AHKLOQRUX].*",        // Initial C before A, H, K, L, O, Q, R, U, X
57              ".*[^SZ]C[AHKLOQRUX].*", // C before A, H, K, L, O, Q, R, U, X but not after S, Z
58              ".*[^CKQ]X.*",           // X not after C,K,Q
59              ".*L.*",                 // L
60              ".*[MN].*",              // M,N
61              ".*R.*",                 // R
62              ".*[SZ].*",              // S,Z
63              ".*[SZ]C.*",             // C after S,Z
64              "C[^AHKLOQRUX].*",       // Initial C except before A, H, K, L, O, Q, R, U, X
65              ".+C[^AHKLOQRUX].*",     // C except before A, H, K, L, O, Q, R, U, X
66              ".*[DT][CSZ].*",         // D,T before C,S,Z
67              ".*[CKQ]X.*",            // X after C,K,Q
68              // @formatter:on
69      };
70  
71      @AfterAll
72      // Check that all possible input sequence conditions are represented
73      public static void finishTests() {
74          int errors = 0;
75          for (final String m : MATCHES) {
76              if (!hasTestCase(m)) {
77                  System.out.println(m + " has no test case");
78                  errors++;
79              }
80          }
81          assertEquals(0, errors, "Not expecting any missing test cases");
82      }
83  
84      private static boolean hasTestCase(final String re) {
85          for (final String s : TESTSET) {
86              if (s.matches(re)) {
87                  return true;
88              }
89          }
90          return false;
91      }
92  
93      // Allow command-line testing
94      public static void main(final String args[]) {
95          final ColognePhonetic coder = new ColognePhonetic();
96          for (final String arg : args) {
97              final String code = coder.encode(arg);
98              System.out.println("'" + arg + "' = '" + code + "'");
99          }
100     }
101 
102     @Override
103     // Capture test strings for later checking
104     public void checkEncoding(final String expected, final String source) throws EncoderException {
105         // Note that the German letter Eszett is converted to SS by toUpperCase, so we don't need to replace it
106         TESTSET.add(source.toUpperCase(Locale.GERMAN).replace('Ä', 'A').replace('Ö', 'O').replace('Ü', 'U'));
107         super.checkEncoding(expected, source);
108     }
109 
110     @Override
111     protected ColognePhonetic createStringEncoder() {
112         return new ColognePhonetic();
113     }
114 
115     @Test
116     public void testAabjoe() throws EncoderException {
117         this.checkEncoding("01", "Aabjoe");
118     }
119 
120     @Test
121     public void testAaclan() throws EncoderException {
122         this.checkEncoding("0856", "Aaclan");
123     }
124 
125     /**
126      * Tests [CODEC-122]
127      *
128      * @throws EncoderException for some failure scenarios
129      */
130     @Test
131     public void testAychlmajrForCodec122() throws EncoderException {
132         this.checkEncoding("04567", "Aychlmajr");
133     }
134 
135     @Test
136     // Ensure that override still allows tests to work
137     public void testCanFail() {
138         assertThrows(AssertionFailedError.class, () -> this.checkEncoding("/", "Fehler"));
139     }
140 
141     @Test
142     public void testEdgeCases() throws EncoderException {
143         // @formatter:off
144         final String[][] data = {
145             { "a", "0" },
146             { "e", "0" },
147             { "i", "0" },
148             { "o", "0" },
149             { "u", "0" },
150             { "\u00E4", "0" }, // a-umlaut
151             { "\u00F6", "0" }, // o-umlaut
152             { "\u00FC", "0" }, // u-umlaut
153             { "\u00DF", "8" }, // small sharp s
154             { "aa", "0" },
155             { "ha", "0" },
156             { "h", "" },
157             { "aha", "0" },
158             { "b", "1" },
159             { "p", "1" },
160             { "ph", "3" },
161             { "f", "3" },
162             { "v", "3" },
163             { "w", "3" },
164             { "g", "4" },
165             { "k", "4" },
166             { "q", "4" },
167             { "x", "48" },
168             { "ax", "048" },
169             { "cx", "48" },
170             { "l", "5" },
171             { "cl", "45" },
172             { "acl", "085" },
173             { "mn", "6" },
174             { "{mn}", "6" }, // test chars above Z
175             { "r", "7" }
176         };
177         // @formatter:on
178         this.checkEncodings(data);
179     }
180 
181     @Test
182     public void testExamples() throws EncoderException {
183         // @formatter:off
184         final String[][] data = {
185             { "m\u00DCller", "657" }, // mÜller - why upper case U-umlaut?
186             { "m\u00FCller", "657" }, // müller - add equivalent lower-case
187             { "schmidt", "862" },
188             { "schneider", "8627" },
189             { "fischer", "387" },
190             { "weber", "317" },
191             { "wagner", "3467" },
192             { "becker", "147" },
193             { "hoffmann", "0366" },
194             { "sch\u00C4fer", "837" }, // schÄfer - why upper case A-umlaut ?
195             { "sch\u00e4fer", "837" }, // schäfer - add equivalent lower-case
196             { "Breschnew", "17863" },
197             { "Wikipedia", "3412" },
198             { "peter", "127" },
199             { "pharma", "376" },
200             { "m\u00f6nchengladbach", "664645214" }, // mönchengladbach
201             { "deutsch", "28" },
202             { "deutz", "28" },
203             { "hamburg", "06174" },
204             { "hannover", "0637" },
205             { "christstollen", "478256" },
206             { "Xanthippe", "48621" },
207             { "Zacharias", "8478" },
208             { "Holzbau", "0581" },
209             { "matsch", "68" },
210             { "matz", "68" },
211             { "Arbeitsamt", "071862" },
212             { "Eberhard", "01772" },
213             { "Eberhardt", "01772" },
214             { "Celsius", "8588" },
215             { "Ace", "08" },
216             { "shch", "84" }, // CODEC-254
217             { "xch", "484" }, // CODEC-255
218             { "heithabu", "021" }
219         };
220         // @formatter:on
221         this.checkEncodings(data);
222     }
223 
224     @Test
225     public void testHyphen() throws EncoderException {
226         final String[][] data = { { "bergisch-gladbach", "174845214" }, { "M\u00fcller-L\u00fcdenscheidt", "65752682" } }; // Müller-Lüdenscheidt
227         this.checkEncodings(data);
228     }
229 
230     @Test
231     public void testIsEncodeEquals() {
232         //@formatter:off
233         final String[][] data = {
234             { "Muller", "M\u00fcller" }, // Müller
235             { "Meyer", "Mayr" },
236             { "house", "house" },
237             { "House", "house" },
238             { "Haus", "house" },
239             { "ganz", "Gans" },
240             { "ganz", "G\u00e4nse" }, // Gänse
241             { "Miyagi", "Miyako" }
242         };
243         //@formatter:on
244         for (final String[] element : data) {
245             final boolean encodeEqual = this.getStringEncoder().isEncodeEqual(element[1], element[0]);
246             assertTrue(encodeEqual, element[1] + " != " + element[0]);
247         }
248     }
249 
250     @Test
251     public void testSpecialCharsBetweenSameLetters() throws EncoderException {
252         final String[] data = { "Test test", "Testtest", "Test-test", "TesT#Test", "TesT?test" };
253         this.checkEncodingVariations("28282", data);
254     }
255 
256     @Test
257     public void testVariationsMella() throws EncoderException {
258         final String[] data = { "mella", "milah", "moulla", "mellah", "muehle", "mule" };
259         this.checkEncodingVariations("65", data);
260     }
261 
262     @Test
263     public void testVariationsMeyer() throws EncoderException {
264         final String[] data = { "Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major" };
265         this.checkEncodingVariations("67", data);
266     }
267 }