View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.apache.commons.codec.AbstractStringEncoderTest;
import org.apache.commons.codec.EncoderException;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.opentest4j.AssertionFailedError;
37  
38  /**
39   * Tests the {@link ColognePhonetic} class.
40   *
41   * <p>
42   * Keep this file in UTF-8 encoding for proper Javadoc processing.
43   * </p>
44   */
45  class ColognePhoneticTest extends AbstractStringEncoderTest<ColognePhonetic> {
46  
47      private static final Set<String> TESTSET = new HashSet<>();
48  
49      /** Character sequences to be tested by the code. */
50      // @formatter:off
51      private static final String[] MATCHES = {
52              ".*[AEIOUJY].*",         // A, E, I, J, O, U, Y
53              ".*H.*",                 // H
54              ".*B.*",                 // B
55              ".*P[^H].*",             // P not before H
56              ".*[DT][^CSZ].*",        // D,T not before C,S,Z
57              ".*[FVW].*",             // F,V,W
58              ".*PH.*",                // P before H
59              ".*[GKQ].*",             // G,K,Q
60              "C[AHKLOQRUX].*",        // Initial C before A, H, K, L, O, Q, R, U, X
61              ".*[^SZ]C[AHKLOQRUX].*", // C before A, H, K, L, O, Q, R, U, X but not after S, Z
62              ".*[^CKQ]X.*",           // X not after C,K,Q
63              ".*L.*",                 // L
64              ".*[MN].*",              // M,N
65              ".*R.*",                 // R
66              ".*[SZ].*",              // S,Z
67              ".*[SZ]C.*",             // C after S,Z
68              "C[^AHKLOQRUX].*",       // Initial C except before A, H, K, L, O, Q, R, U, X
69              ".+C[^AHKLOQRUX].*",     // C except before A, H, K, L, O, Q, R, U, X
70              ".*[DT][CSZ].*",         // D,T before C,S,Z
71              ".*[CKQ]X.*",            // X after C,K,Q
72              // @formatter:on
73      };
74  
75      @AfterAll
76      // Check that all possible input sequence conditions are represented
77      static void finishTests() {
78          int errors = 0;
79          for (final String m : MATCHES) {
80              if (!hasTestCase(m)) {
81                  System.out.println(m + " has no test case");
82                  errors++;
83              }
84          }
85          assertEquals(0, errors, "Not expecting any missing test cases");
86      }
87  
88      private static boolean hasTestCase(final String re) {
89          for (final String s : TESTSET) {
90              if (s.matches(re)) {
91                  return true;
92              }
93          }
94          return false;
95      }
96  
97      // Allow command-line testing
98      public static void main(final String[] args) {
99          final ColognePhonetic coder = new ColognePhonetic();
100         for (final String arg : args) {
101             final String code = coder.encode(arg);
102             System.out.println("'" + arg + "' = '" + code + "'");
103         }
104     }
105 
106     static Stream<Arguments> testBasicEncoding() {
107         // @formatter:off
108         return Stream.of(
109             Arguments.arguments("01", "Aabjoe"),
110             Arguments.arguments("0856", "Aaclan"),
111             Arguments.arguments("04567", "Aychlmajr") // CODEC-122
112         );
113         // @formatter:on
114     }
115 
116     static Stream<Arguments> testEdgeCases() {
117         // @formatter:off
118         return Stream.of(
119             Arguments.arguments("a", "0"),
120             Arguments.arguments("e", "0"),
121             Arguments.arguments("i", "0"),
122             Arguments.arguments("o", "0"),
123             Arguments.arguments("u", "0"),
124             Arguments.arguments("\u00E4", "0"), // a-umlaut
125             Arguments.arguments("\u00F6", "0"), // o-umlaut
126             Arguments.arguments("\u00FC", "0"), // u-umlaut
127             Arguments.arguments("\u00DF", "8"), // small sharp s
128             Arguments.arguments("aa", "0"),
129             Arguments.arguments("ha", "0"),
130             Arguments.arguments("h", ""),
131             Arguments.arguments("aha", "0"),
132             Arguments.arguments("b", "1"),
133             Arguments.arguments("p", "1"),
134             Arguments.arguments("ph", "3"),
135             Arguments.arguments("f", "3"),
136             Arguments.arguments("v", "3"),
137             Arguments.arguments("w", "3"),
138             Arguments.arguments("g", "4"),
139             Arguments.arguments("k", "4"),
140             Arguments.arguments("q", "4"),
141             Arguments.arguments("x", "48"),
142             Arguments.arguments("ax", "048"),
143             Arguments.arguments("cx", "48"),
144             Arguments.arguments("l", "5"),
145             Arguments.arguments("cl", "45"),
146             Arguments.arguments("acl", "085"),
147             Arguments.arguments("mn", "6"),
148             Arguments.arguments("{mn}", "6"), // test chars above Z
149             Arguments.arguments("r", "7")
150         );
151         // @formatter:on
152     }
153 
154     static Stream<Arguments> testExamples() {
155         // @formatter:off
156         return Stream.of(
157             Arguments.arguments("m\u00DCller", "657"), // mÜller - why upper case U-umlaut?
158             Arguments.arguments("m\u00FCller", "657"), // müller - add equivalent lower-case
159             Arguments.arguments("schmidt", "862"),
160             Arguments.arguments("schneider", "8627"),
161             Arguments.arguments("fischer", "387"),
162             Arguments.arguments("weber", "317"),
163             Arguments.arguments("wagner", "3467"),
164             Arguments.arguments("becker", "147"),
165             Arguments.arguments("hoffmann", "036"),
166             Arguments.arguments("sch\u00C4fer", "837"), // schÄfer - why upper case A-umlaut ?
167             Arguments.arguments("sch\u00e4fer", "837"), // schäfer - add equivalent lower-case
168             Arguments.arguments("Breschnew", "17863"),
169             Arguments.arguments("Wikipedia", "3412"),
170             Arguments.arguments("peter", "127"),
171             Arguments.arguments("pharma", "376"),
172             Arguments.arguments("m\u00f6nchengladbach", "64645214"), // mönchengladbach
173             Arguments.arguments("deutsch", "28"),
174             Arguments.arguments("deutz", "28"),
175             Arguments.arguments("hamburg", "06174"),
176             Arguments.arguments("hannover", "0637"),
177             Arguments.arguments("christstollen", "478256"),
178             Arguments.arguments("Xanthippe", "48621"),
179             Arguments.arguments("Zacharias", "8478"),
180             Arguments.arguments("Holzbau", "0581"),
181             Arguments.arguments("matsch", "68"),
182             Arguments.arguments("matz", "68"),
183             Arguments.arguments("Arbeitsamt", "071862"),
184             Arguments.arguments("Eberhard", "0172"),
185             Arguments.arguments("Eberhardt", "0172"),
186             Arguments.arguments("Celsius", "858"),
187             Arguments.arguments("Ace", "08"),
188             Arguments.arguments("shch", "84"), // CODEC-254
189             Arguments.arguments("xch", "484"), // CODEC-255
190             Arguments.arguments("heithabu", "021")
191         );
192         // @formatter:on
193     }
194 
195     static Stream<Arguments> testIsEncodeEquals() {
196         // @formatter:off
197         return Stream.of(
198             Arguments.arguments("Muller", "M\u00fcller"), // Müller
199             Arguments.arguments("Meyer", "Mayr"),
200             Arguments.arguments("house", "house"),
201             Arguments.arguments("House", "house"),
202             Arguments.arguments("Haus", "house"),
203             Arguments.arguments("ganz", "Gans"),
204             Arguments.arguments("ganz", "G\u00e4nse"), // Gänse
205             Arguments.arguments("Miyagi", "Miyako")
206         );
207         // @formatter:on
208     }
209 
210     @Override
211     // Capture test strings for later checking
212     public void checkEncoding(final String expected, final String source) throws EncoderException {
213         // Note that the German letter Eszett is converted to SS by toUpperCase, so we don't need to replace it
214         TESTSET.add(source.toUpperCase(Locale.GERMAN).replace('Ä', 'A').replace('Ö', 'O').replace('Ü', 'U'));
215         super.checkEncoding(expected, source);
216     }
217 
218     @Override
219     protected ColognePhonetic createStringEncoder() {
220         return new ColognePhonetic();
221     }
222 
223     @ParameterizedTest
224     @MethodSource
225     void testBasicEncoding(final String expected, final String source) throws EncoderException {
226         checkEncoding(expected, source);
227     }
228 
229     @Test
230     // Ensure that override still allows tests to work
231     void testCanFail() {
232         assertThrows(AssertionFailedError.class, () -> checkEncoding("/", "Fehler"));
233     }
234 
235     @ParameterizedTest
236     @MethodSource
237     void testEdgeCases(final String source, final String expected) throws EncoderException {
238         checkEncoding(expected, source);
239     }
240 
241     @ParameterizedTest
242     @MethodSource
243     void testExamples(final String source, final String expected) throws EncoderException {
244         checkEncoding(expected, source);
245     }
246 
247     @Test
248     void testHyphen() throws EncoderException {
249         // Müller-Lüdenscheidt
250         checkEncodings(new String[][] { { "bergisch-gladbach", "174845214" }, { "M\u00fcller-L\u00fcdenscheidt", "65752682" } });
251     }
252 
253     @ParameterizedTest
254     @MethodSource
255     void testIsEncodeEquals(final String source, final String expected) {
256         final boolean encodeEqual = getStringEncoder().isEncodeEqual(expected, source);
257         assertTrue(encodeEqual, () -> expected + " != " + source);
258     }
259 
260     @Test
261     void testSpecialCharsBetweenSameLetters() throws EncoderException {
262         checkEncodingVariations("28282", "Test test", "Testtest", "Test-test", "TesT#Test", "TesT?test");
263     }
264 
265     @Test
266     void testVariationsMella() throws EncoderException {
267         checkEncodingVariations("65", "mella", "milah", "moulla", "mellah", "muehle", "mule");
268     }
269 
270     @Test
271     void testVariationsMeyer() throws EncoderException {
272         checkEncodingVariations("67", "Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major");
273     }
274 }