1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import static org.junit.jupiter.api.Assertions.assertEquals;
21 import static org.junit.jupiter.api.Assertions.assertTrue;
22
23 import org.apache.commons.codec.AbstractStringEncoderTest;
24 import org.junit.jupiter.api.Test;
25
26 /**
27 * Tests {@link Nysiis}
28 */
29 class NysiisTest extends AbstractStringEncoderTest<Nysiis> {
30
31 private final Nysiis fullNysiis = new Nysiis(false);
32
33 /**
34 * Takes an array of String pairs where each pair's first element is the input and the second element the expected
35 * encoding.
36 *
37 * @param testValues
38 * an array of String pairs where each pair's first element is the input and the second element the
39 * expected encoding.
40 */
41 private void assertEncodings(final String[]... testValues) {
42 for (final String[] arr : testValues) {
43 assertEquals(arr[1], this.fullNysiis.encode(arr[0]), "Problem with " + arr[0]);
44 }
45 }
46
47 @Override
48 protected Nysiis createStringEncoder() {
49 return new Nysiis();
50 }
51
52 private void encodeAll(final String[] strings, final String expectedEncoding) {
53 for (final String string : strings) {
54 assertEquals(expectedEncoding, getStringEncoder().encode(string), "Problem with " + string);
55 }
56 }
57
58 @Test
59 void testBran() {
60 encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
61 }
62
63 @Test
64 void testCap() {
65 encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
66 }
67
68 @Test
69 void testDad() {
70 // Data Quality and Record Linkage Techniques P.121 claims this is DAN,
71 // but it should be DAD, verified also with dropby.com
72 encodeAll(new String[] { "Dent" }, "DAD");
73 }
74
75 @Test
76 void testDan() {
77 encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
78 }
79
80 /**
81 * Tests data gathered from around the internet.
82 *
83 * @see <a href="https://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>*/
84 @Test
85 void testDropBy() {
86 // Explanation of differences between this implementation and the one at dropby.com is
87 // prepended to the test string. The referenced rules refer to the outlined steps the
88 // class description for Nysiis.
89
90 assertEncodings(
91 // 1. Transcode first characters of name
92 new String[] { "MACINTOSH", "MCANT" },
93 // violates 4j: the second N should not be added, as the first
94 // key char is already a N
95 new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
96 // O and E are transcoded to A because of rule 4a
97 // H also to A because of rule 4h
98 // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
99 // that skips the next char in such a case?
100 // the remaining A is removed because of rule 7
101 new String[] { "KOEHN", "CAN" }, // Original: C
102 // violates 4j: see also KNUTH
103 new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
104 // violates 4j: see also KNUTH
105 new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
106 // violates 4j: see also KNUTH
107 new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
108 // 2. Transcode last characters of name:
109 new String[] { "MCKEE", "MCY" },
110 new String[] { "MACKIE", "MCY" },
111 new String[] { "HEITSCHMIDT", "HATSNAD" },
112 new String[] { "BART", "BAD" },
113 new String[] { "HURD", "HAD" },
114 new String[] { "HUNT", "HAD" },
115 new String[] { "WESTERLUND", "WASTARLAD" },
116 // 4. Transcode remaining characters by following these rules,
117 // incrementing by one character each time:
118 new String[] { "CASSTEVENS", "CASTAFAN" },
119 new String[] { "VASQUEZ", "VASG" },
120 new String[] { "FRAZIER", "FRASAR" },
121 new String[] { "BOWMAN", "BANAN" },
122 new String[] { "MCKNIGHT", "MCNAGT" },
123 new String[] { "RICKERT", "RACAD" },
124 // violates 5: the last S is not removed
125 // when comparing to DEUTS, which is phonetically similar
126 // the result it also DAT, which is correct for DEUTSCH too imo
127 new String[] { "DEUTSCH", "DAT" }, // Original: DATS
128 new String[] { "WESTPHAL", "WASTFAL" },
129 // violates 4h: the H should be transcoded to S and thus ignored as
130 // the first key character is also S
131 new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
132 // same as KOEHN, the L gets mysteriously lost
133 new String[] { "KUHL", "CAL" }, // Original: C
134 new String[] { "RAWSON", "RASAN" },
135 // If last character is S, remove it
136 new String[] { "JILES", "JAL" },
137 // violates 6: if the last two characters are AY, remove A
138 new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
139 new String[] { "YAMADA", "YANAD" });
140 }
141
142 @Test
143 void testFal() {
144 encodeAll(new String[] { "Phil" }, "FAL");
145 }
146
147 /**
148 * Tests data gathered from around the internets.*/
149 @Test
150 void testOthers() {
151 assertEncodings(
152 new String[] { "O'Daniel", "ODANAL" },
153 new String[] { "O'Donnel", "ODANAL" },
154 new String[] { "Cory", "CARY" },
155 new String[] { "Corey", "CARY" },
156 new String[] { "Kory", "CARY" },
157 //
158 new String[] { "FUZZY", "FASY" });
159 }
160
161 /**
162 * Tests rule 1: Translate first characters of name: MAC → MCC, KN → N, K → C, PH, PF → FF, SCH → SSS*/
163 @Test
164 void testRule1() {
165 assertEncodings(
166 new String[] { "MACX", "MCX" },
167 new String[] { "KNX", "NX" },
168 new String[] { "KX", "CX" },
169 new String[] { "PHX", "FX" },
170 new String[] { "PFX", "FX" },
171 new String[] { "SCHX", "SX" });
172 }
173
174 /**
175 * Tests rule 2: Translate last characters of name: EE → Y, IE → Y, DT, RT, RD, NT, ND → D*/
176 @Test
177 void testRule2() {
178 assertEncodings(
179 new String[] { "XEE", "XY" },
180 new String[] { "XIE", "XY" },
181 new String[] { "XDT", "XD" },
182 new String[] { "XRT", "XD" },
183 new String[] { "XRD", "XD" },
184 new String[] { "XNT", "XD" },
185 new String[] { "XND", "XD" });
186 }
187
188 /**
189 * Tests rule 4.1: EV → AF else A, E, I, O, U → A*/
190 @Test
191 void testRule4Dot1() {
192 assertEncodings(
193 new String[] { "XEV", "XAF" },
194 new String[] { "XAX", "XAX" },
195 new String[] { "XEX", "XAX" },
196 new String[] { "XIX", "XAX" },
197 new String[] { "XOX", "XAX" },
198 new String[] { "XUX", "XAX" });
199 }
200
201 /**
202 * Tests rule 4.2: Q → G, Z → S, M → N*/
203 @Test
204 void testRule4Dot2() {
205 assertEncodings(
206 new String[] { "XQ", "XG" },
207 new String[] { "XZ", "X" },
208 new String[] { "XM", "XN" });
209 }
210
211 /**
212 * Tests rule 5: If last character is S, remove it.*/
213 @Test
214 void testRule5() {
215 assertEncodings(
216 new String[] { "XS", "X" },
217 new String[] { "XSS", "X" });
218 }
219
220 /**
221 * Tests rule 6: If last characters are AY, replace with Y.*/
222 @Test
223 void testRule6() {
224 assertEncodings(
225 new String[] { "XAY", "XY" },
226 new String[] { "XAYS", "XY" }); // Rules 5, 6
227 }
228
229 /**
230 * Tests rule 7: If last character is A, remove it.*/
231 @Test
232 void testRule7() {
233 assertEncodings(
234 new String[] { "XA", "X" },
235 new String[] { "XAS", "X" }); // Rules 5, 7
236 }
237 @Test
238 void testSnad() {
239 // Data Quality and Record Linkage Techniques P.121 claims this is SNAT,
240 // but it should be SNAD
241 encodeAll(new String[] { "Schmidt" }, "SNAD");
242 }
243
244 @Test
245 void testSnat() {
246 encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
247 }
248
249 @Test
250 void testSpecialBranches() {
251 encodeAll(new String[] { "Kobwick" }, "CABWAC");
252 encodeAll(new String[] { "Kocher" }, "CACAR");
253 encodeAll(new String[] { "Fesca" }, "FASC");
254 encodeAll(new String[] { "Shom" }, "SAN");
255 encodeAll(new String[] { "Ohlo" }, "OL");
256 encodeAll(new String[] { "Uhu" }, "UH");
257 encodeAll(new String[] { "Um" }, "UN");
258 }
259
260 @Test
261 void testTranan() {
262 encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
263 }
264
265 @Test
266 void testTrueVariant() {
267 final Nysiis encoder = new Nysiis(true);
268
269 final String encoded = encoder.encode("WESTERLUND");
270 assertTrue(encoded.length() <= 6);
271 assertEquals("WASTAR", encoded);
272 }
273
274 }