001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language;
019
020 import org.apache.commons.codec.EncoderException;
021 import org.apache.commons.codec.StringEncoder;
022 import org.apache.commons.codec.StringEncoderAbstractTest;
023 import org.junit.Assert;
024 import org.junit.Test;
025
026 /**
027 * Tests {@link Nysiis}
028 *
029 * @since 1.7
030 * @version $Id: NysiisTest.html 889935 2013-12-11 05:05:13Z ggregory $
031 */
032 public class NysiisTest extends StringEncoderAbstractTest {
033
034 private final Nysiis fullNysiis = new Nysiis(false);
035
036 /**
037 * Takes an array of String pairs where each pair's first element is the input and the second element the expected
038 * encoding.
039 *
040 * @param testValues
041 * an array of String pairs where each pair's first element is the input and the second element the
042 * expected encoding.
043 * @throws EncoderException
044 */
045 private void assertEncodings(String[]... testValues) throws EncoderException {
046 for (String[] arr : testValues) {
047 Assert.assertEquals("Problem with " + arr[0], arr[1], this.fullNysiis.encode(arr[0]));
048 }
049 }
050
051 @Override
052 protected StringEncoder createStringEncoder() {
053 return new Nysiis();
054 }
055
056 private void encodeAll(String[] strings, String expectedEncoding) throws EncoderException {
057 for (String string : strings) {
058 Assert.assertEquals("Problem with " + string, expectedEncoding, getStringEncoder().encode(string));
059 }
060 }
061
062 @Test
063 public void testBran() throws EncoderException {
064 encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
065 }
066
067 @Test
068 public void testCap() throws EncoderException {
069 this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
070 }
071
072 @Test
073 public void testDad() throws EncoderException {
074 // Data Quality and Record Linkage Techniques P.121 claims this is DAN,
075 // but it should be DAD, verified also with dropby.com
076 this.encodeAll(new String[] { "Dent" }, "DAD");
077 }
078
079 @Test
080 public void testDan() throws EncoderException {
081 this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
082 }
083
084 /**
085 * Tests data gathered from around the internet.
086 *
087 * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
088 * @throws EncoderException
089 */
090 @Test
091 public void testDropBy() throws EncoderException {
092 // Explanation of differences between this implementation and the one at dropby.com is
093 // prepended to the test string. The referenced rules refer to the outlined steps the
094 // class description for Nysiis.
095
096 this.assertEncodings(
097 // 1. Transcode first characters of name
098 new String[] { "MACINTOSH", "MCANT" },
099 // violates 4j: the second N should not be added, as the first
100 // key char is already a N
101 new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
102 // O and E are transcoded to A because of rule 4a
103 // H also to A because of rule 4h
104 // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
105 // that skips the next char in such a case?
106 // the remaining A is removed because of rule 7
107 new String[] { "KOEHN", "CAN" }, // Original: C
108 // violates 4j: see also KNUTH
109 new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
110 // violates 4j: see also KNUTH
111 new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
112 // violates 4j: see also KNUTH
113 new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
114 // 2. Transcode last characters of name:
115 new String[] { "MCKEE", "MCY" },
116 new String[] { "MACKIE", "MCY" },
117 new String[] { "HEITSCHMIDT", "HATSNAD" },
118 new String[] { "BART", "BAD" },
119 new String[] { "HURD", "HAD" },
120 new String[] { "HUNT", "HAD" },
121 new String[] { "WESTERLUND", "WASTARLAD" },
122 // 4. Transcode remaining characters by following these rules,
123 // incrementing by one character each time:
124 new String[] { "CASSTEVENS", "CASTAFAN" },
125 new String[] { "VASQUEZ", "VASG" },
126 new String[] { "FRAZIER", "FRASAR" },
127 new String[] { "BOWMAN", "BANAN" },
128 new String[] { "MCKNIGHT", "MCNAGT" },
129 new String[] { "RICKERT", "RACAD" },
130 // violates 5: the last S is not removed
131 // when comparing to DEUTS, which is phonetically similar
132 // the result it also DAT, which is correct for DEUTSCH too imo
133 new String[] { "DEUTSCH", "DAT" }, // Original: DATS
134 new String[] { "WESTPHAL", "WASTFAL" },
135 // violates 4h: the H should be transcoded to S and thus ignored as
136 // the first key character is also S
137 new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
138 // same as KOEHN, the L gets mysteriously lost
139 new String[] { "KUHL", "CAL" }, // Original: C
140 new String[] { "RAWSON", "RASAN" },
141 // If last character is S, remove it
142 new String[] { "JILES", "JAL" },
143 // violates 6: if the last two characters are AY, remove A
144 new String[] { "CARRAWAY", "CARY" }, // Original: CARAY
145 new String[] { "YAMADA", "YANAD" });
146 }
147
148 @Test
149 public void testFal() throws EncoderException {
150 this.encodeAll(new String[] { "Phil" }, "FAL");
151 }
152
153 /**
154 * Tests data gathered from around the internets.
155 *
156 * @throws EncoderException
157 */
158 @Test
159 public void testOthers() throws EncoderException {
160 this.assertEncodings(
161 new String[] { "O'Daniel", "ODANAL" },
162 new String[] { "O'Donnel", "ODANAL" },
163 new String[] { "Cory", "CARY" },
164 new String[] { "Corey", "CARY" },
165 new String[] { "Kory", "CARY" },
166 //
167 new String[] { "FUZZY", "FASY" });
168 }
169
170 /**
171 * Tests rule 1: Translate first characters of name: MAC → MCC, KN → N, K → C, PH, PF → FF, SCH → SSS
172 *
173 * @throws EncoderException
174 */
175 @Test
176 public void testRule1() throws EncoderException {
177 this.assertEncodings(
178 new String[] { "MACX", "MCX" },
179 new String[] { "KNX", "NX" },
180 new String[] { "KX", "CX" },
181 new String[] { "PHX", "FX" },
182 new String[] { "PFX", "FX" },
183 new String[] { "SCHX", "SX" });
184 }
185
186 /**
187 * Tests rule 2: Translate last characters of name: EE → Y, IE → Y, DT, RT, RD, NT, ND → D
188 *
189 * @throws EncoderException
190 */
191 @Test
192 public void testRule2() throws EncoderException {
193 this.assertEncodings(
194 new String[] { "XEE", "XY" },
195 new String[] { "XIE", "XY" },
196 new String[] { "XDT", "XD" },
197 new String[] { "XRT", "XD" },
198 new String[] { "XRD", "XD" },
199 new String[] { "XNT", "XD" },
200 new String[] { "XND", "XD" });
201 }
202
203 /**
204 * Tests rule 4.1: EV → AF else A, E, I, O, U → A
205 *
206 * @throws EncoderException
207 */
208 @Test
209 public void testRule4Dot1() throws EncoderException {
210 this.assertEncodings(
211 new String[] { "XEV", "XAF" },
212 new String[] { "XAX", "XAX" },
213 new String[] { "XEX", "XAX" },
214 new String[] { "XIX", "XAX" },
215 new String[] { "XOX", "XAX" },
216 new String[] { "XUX", "XAX" });
217 }
218
219 /**
220 * Tests rule 4.2: Q → G, Z → S, M → N
221 *
222 * @throws EncoderException
223 */
224 @Test
225 public void testRule4Dot2() throws EncoderException {
226 this.assertEncodings(
227 new String[] { "XQ", "XG" },
228 new String[] { "XZ", "X" },
229 new String[] { "XM", "XN" });
230 }
231
232 /**
233 * Tests rule 5: If last character is S, remove it.
234 *
235 * @throws EncoderException
236 */
237 @Test
238 public void testRule5() throws EncoderException {
239 this.assertEncodings(
240 new String[] { "XS", "X" },
241 new String[] { "XSS", "X" });
242 }
243
244 /**
245 * Tests rule 6: If last characters are AY, replace with Y.
246 *
247 * @throws EncoderException
248 */
249 @Test
250 public void testRule6() throws EncoderException {
251 this.assertEncodings(
252 new String[] { "XAY", "XY" },
253 new String[] { "XAYS", "XY" }); // Rules 5, 6
254 }
255
256 /**
257 * Tests rule 7: If last character is A, remove it.
258 *
259 * @throws EncoderException
260 */
261 @Test
262 public void testRule7() throws EncoderException {
263 this.assertEncodings(
264 new String[] { "XA", "X" },
265 new String[] { "XAS", "X" }); // Rules 5, 7
266 }
267 @Test
268 public void testSnad() throws EncoderException {
269 // Data Quality and Record Linkage Techniques P.121 claims this is SNAT,
270 // but it should be SNAD
271 this.encodeAll(new String[] { "Schmidt" }, "SNAD");
272 }
273
274 @Test
275 public void testSnat() throws EncoderException {
276 this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
277 }
278
279 @Test
280 public void testSpecialBranches() throws EncoderException {
281 this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
282 this.encodeAll(new String[] { "Kocher" }, "CACAR");
283 this.encodeAll(new String[] { "Fesca" }, "FASC");
284 this.encodeAll(new String[] { "Shom" }, "SAN");
285 this.encodeAll(new String[] { "Ohlo" }, "OL");
286 this.encodeAll(new String[] { "Uhu" }, "UH");
287 this.encodeAll(new String[] { "Um" }, "UN");
288 }
289
290 @Test
291 public void testTranan() throws EncoderException {
292 this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
293 }
294
295 @Test
296 public void testTrueVariant() {
297 Nysiis encoder = new Nysiis(true);
298
299 String encoded = encoder.encode("WESTERLUND");
300 Assert.assertTrue(encoded.length() <= 6);
301 Assert.assertEquals("WASTAR", encoded);
302 }
303
304 }