001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language; 019 020 import org.apache.commons.codec.EncoderException; 021 import org.apache.commons.codec.StringEncoder; 022 import org.apache.commons.codec.StringEncoderAbstractTest; 023 import org.junit.Assert; 024 import org.junit.Test; 025 026 /** 027 * Tests {@link Nysiis} 028 * 029 * @since 1.7 030 * @version $Id: NysiisTest.html 889935 2013-12-11 05:05:13Z ggregory $ 031 */ 032 public class NysiisTest extends StringEncoderAbstractTest { 033 034 private final Nysiis fullNysiis = new Nysiis(false); 035 036 /** 037 * Takes an array of String pairs where each pair's first element is the input and the second element the expected 038 * encoding. 039 * 040 * @param testValues 041 * an array of String pairs where each pair's first element is the input and the second element the 042 * expected encoding. 043 * @throws EncoderException 044 */ 045 private void assertEncodings(String[]... testValues) throws EncoderException { 046 for (String[] arr : testValues) { 047 Assert.assertEquals("Problem with " + arr[0], arr[1], this.fullNysiis.encode(arr[0])); 048 } 049 } 050 051 @Override 052 protected StringEncoder createStringEncoder() { 053 return new Nysiis(); 054 } 055 056 private void encodeAll(String[] strings, String expectedEncoding) throws EncoderException { 057 for (String string : strings) { 058 Assert.assertEquals("Problem with " + string, expectedEncoding, getStringEncoder().encode(string)); 059 } 060 } 061 062 @Test 063 public void testBran() throws EncoderException { 064 encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN"); 065 } 066 067 @Test 068 public void testCap() throws EncoderException { 069 this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP"); 070 } 071 072 @Test 073 public void testDad() throws EncoderException { 074 // Data Quality and Record Linkage Techniques P.121 claims this is DAN, 075 // but it should be DAD, verified also with dropby.com 076 this.encodeAll(new String[] { "Dent" }, "DAD"); 077 } 078 079 @Test 080 public void testDan() throws EncoderException { 081 this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN"); 082 } 083 084 /** 085 * Tests data gathered from around the internet. 086 * 087 * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a> 088 * @throws EncoderException 089 */ 090 @Test 091 public void testDropBy() throws EncoderException { 092 // Explanation of differences between this implementation and the one at dropby.com is 093 // prepended to the test string. The referenced rules refer to the outlined steps the 094 // class description for Nysiis. 095 096 this.assertEncodings( 097 // 1. Transcode first characters of name 098 new String[] { "MACINTOSH", "MCANT" }, 099 // violates 4j: the second N should not be added, as the first 100 // key char is already a N 101 new String[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH 102 // O and E are transcoded to A because of rule 4a 103 // H also to A because of rule 4h 104 // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h 105 // that skips the next char in such a case? 106 // the remaining A is removed because of rule 7 107 new String[] { "KOEHN", "CAN" }, // Original: C 108 // violates 4j: see also KNUTH 109 new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN] 110 // violates 4j: see also KNUTH 111 new String[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R] 112 // violates 4j: see also KNUTH 113 new String[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T] 114 // 2. Transcode last characters of name: 115 new String[] { "MCKEE", "MCY" }, 116 new String[] { "MACKIE", "MCY" }, 117 new String[] { "HEITSCHMIDT", "HATSNAD" }, 118 new String[] { "BART", "BAD" }, 119 new String[] { "HURD", "HAD" }, 120 new String[] { "HUNT", "HAD" }, 121 new String[] { "WESTERLUND", "WASTARLAD" }, 122 // 4. Transcode remaining characters by following these rules, 123 // incrementing by one character each time: 124 new String[] { "CASSTEVENS", "CASTAFAN" }, 125 new String[] { "VASQUEZ", "VASG" }, 126 new String[] { "FRAZIER", "FRASAR" }, 127 new String[] { "BOWMAN", "BANAN" }, 128 new String[] { "MCKNIGHT", "MCNAGT" }, 129 new String[] { "RICKERT", "RACAD" }, 130 // violates 5: the last S is not removed 131 // when comparing to DEUTS, which is phonetically similar 132 // the result it also DAT, which is correct for DEUTSCH too imo 133 new String[] { "DEUTSCH", "DAT" }, // Original: DATS 134 new String[] { "WESTPHAL", "WASTFAL" }, 135 // violates 4h: the H should be transcoded to S and thus ignored as 136 // the first key character is also S 137 new String[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R] 138 // same as KOEHN, the L gets mysteriously lost 139 new String[] { "KUHL", "CAL" }, // Original: C 140 new String[] { "RAWSON", "RASAN" }, 141 // If last character is S, remove it 142 new String[] { "JILES", "JAL" }, 143 // violates 6: if the last two characters are AY, remove A 144 new String[] { "CARRAWAY", "CARY" }, // Original: CARAY 145 new String[] { "YAMADA", "YANAD" }); 146 } 147 148 @Test 149 public void testFal() throws EncoderException { 150 this.encodeAll(new String[] { "Phil" }, "FAL"); 151 } 152 153 /** 154 * Tests data gathered from around the internets. 155 * 156 * @throws EncoderException 157 */ 158 @Test 159 public void testOthers() throws EncoderException { 160 this.assertEncodings( 161 new String[] { "O'Daniel", "ODANAL" }, 162 new String[] { "O'Donnel", "ODANAL" }, 163 new String[] { "Cory", "CARY" }, 164 new String[] { "Corey", "CARY" }, 165 new String[] { "Kory", "CARY" }, 166 // 167 new String[] { "FUZZY", "FASY" }); 168 } 169 170 /** 171 * Tests rule 1: Translate first characters of name: MAC → MCC, KN → N, K → C, PH, PF → FF, SCH → SSS 172 * 173 * @throws EncoderException 174 */ 175 @Test 176 public void testRule1() throws EncoderException { 177 this.assertEncodings( 178 new String[] { "MACX", "MCX" }, 179 new String[] { "KNX", "NX" }, 180 new String[] { "KX", "CX" }, 181 new String[] { "PHX", "FX" }, 182 new String[] { "PFX", "FX" }, 183 new String[] { "SCHX", "SX" }); 184 } 185 186 /** 187 * Tests rule 2: Translate last characters of name: EE → Y, IE → Y, DT, RT, RD, NT, ND → D 188 * 189 * @throws EncoderException 190 */ 191 @Test 192 public void testRule2() throws EncoderException { 193 this.assertEncodings( 194 new String[] { "XEE", "XY" }, 195 new String[] { "XIE", "XY" }, 196 new String[] { "XDT", "XD" }, 197 new String[] { "XRT", "XD" }, 198 new String[] { "XRD", "XD" }, 199 new String[] { "XNT", "XD" }, 200 new String[] { "XND", "XD" }); 201 } 202 203 /** 204 * Tests rule 4.1: EV → AF else A, E, I, O, U → A 205 * 206 * @throws EncoderException 207 */ 208 @Test 209 public void testRule4Dot1() throws EncoderException { 210 this.assertEncodings( 211 new String[] { "XEV", "XAF" }, 212 new String[] { "XAX", "XAX" }, 213 new String[] { "XEX", "XAX" }, 214 new String[] { "XIX", "XAX" }, 215 new String[] { "XOX", "XAX" }, 216 new String[] { "XUX", "XAX" }); 217 } 218 219 /** 220 * Tests rule 4.2: Q → G, Z → S, M → N 221 * 222 * @throws EncoderException 223 */ 224 @Test 225 public void testRule4Dot2() throws EncoderException { 226 this.assertEncodings( 227 new String[] { "XQ", "XG" }, 228 new String[] { "XZ", "X" }, 229 new String[] { "XM", "XN" }); 230 } 231 232 /** 233 * Tests rule 5: If last character is S, remove it. 234 * 235 * @throws EncoderException 236 */ 237 @Test 238 public void testRule5() throws EncoderException { 239 this.assertEncodings( 240 new String[] { "XS", "X" }, 241 new String[] { "XSS", "X" }); 242 } 243 244 /** 245 * Tests rule 6: If last characters are AY, replace with Y. 246 * 247 * @throws EncoderException 248 */ 249 @Test 250 public void testRule6() throws EncoderException { 251 this.assertEncodings( 252 new String[] { "XAY", "XY" }, 253 new String[] { "XAYS", "XY" }); // Rules 5, 6 254 } 255 256 /** 257 * Tests rule 7: If last character is A, remove it. 258 * 259 * @throws EncoderException 260 */ 261 @Test 262 public void testRule7() throws EncoderException { 263 this.assertEncodings( 264 new String[] { "XA", "X" }, 265 new String[] { "XAS", "X" }); // Rules 5, 7 266 } 267 @Test 268 public void testSnad() throws EncoderException { 269 // Data Quality and Record Linkage Techniques P.121 claims this is SNAT, 270 // but it should be SNAD 271 this.encodeAll(new String[] { "Schmidt" }, "SNAD"); 272 } 273 274 @Test 275 public void testSnat() throws EncoderException { 276 this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT"); 277 } 278 279 @Test 280 public void testSpecialBranches() throws EncoderException { 281 this.encodeAll(new String[] { "Kobwick" }, "CABWAC"); 282 this.encodeAll(new String[] { "Kocher" }, "CACAR"); 283 this.encodeAll(new String[] { "Fesca" }, "FASC"); 284 this.encodeAll(new String[] { "Shom" }, "SAN"); 285 this.encodeAll(new String[] { "Ohlo" }, "OL"); 286 this.encodeAll(new String[] { "Uhu" }, "UH"); 287 this.encodeAll(new String[] { "Um" }, "UN"); 288 } 289 290 @Test 291 public void testTranan() throws EncoderException { 292 this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN"); 293 } 294 295 @Test 296 public void testTrueVariant() { 297 Nysiis encoder = new Nysiis(true); 298 299 String encoded = encoder.encode("WESTERLUND"); 300 Assert.assertTrue(encoded.length() <= 6); 301 Assert.assertEquals("WASTAR", encoded); 302 } 303 304 }