View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  import static org.junit.jupiter.api.Assertions.assertTrue;
22  
23  import org.apache.commons.codec.AbstractStringEncoderTest;
24  import org.junit.jupiter.api.Test;
25  
26  /**
27   * Tests {@link Nysiis}
28   */
29  public class NysiisTest extends AbstractStringEncoderTest<Nysiis> {
30  
31      private final Nysiis fullNysiis = new Nysiis(false);
32  
33      /**
34       * Takes an array of String pairs where each pair's first element is the input and the second element the expected
35       * encoding.
36       *
37       * @param testValues
38       *            an array of String pairs where each pair's first element is the input and the second element the
39       *            expected encoding.
40       */
41      private void assertEncodings(final String[]... testValues) {
42          for (final String[] arr : testValues) {
43              assertEquals(arr[1], this.fullNysiis.encode(arr[0]), "Problem with " + arr[0]);
44          }
45      }
46  
47      @Override
48      protected Nysiis createStringEncoder() {
49          return new Nysiis();
50      }
51  
52      private void encodeAll(final String[] strings, final String expectedEncoding) {
53          for (final String string : strings) {
54              assertEquals(expectedEncoding, getStringEncoder().encode(string), "Problem with " + string);
55          }
56      }
57  
58      @Test
59      public void testBran() {
60          encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
61      }
62  
63      @Test
64      public void testCap() {
65          this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
66      }
67  
68      @Test
69      public void testDad() {
70          // Data Quality and Record Linkage Techniques P.121 claims this is DAN,
71          // but it should be DAD, verified also with dropby.com
72          this.encodeAll(new String[] { "Dent" }, "DAD");
73      }
74  
75      @Test
76      public void testDan() {
77          this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
78      }
79  
80      /**
81       * Tests data gathered from around the internet.
82       *
83       * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>*/
84      @Test
85      public void testDropBy() {
86          // Explanation of differences between this implementation and the one at dropby.com is
87          // prepended to the test string. The referenced rules refer to the outlined steps the
88          // class description for Nysiis.
89  
90          this.assertEncodings(
91                  // 1. Transcode first characters of name
92                  new String[] { "MACINTOSH", "MCANT" },
93                  // violates 4j: the second N should not be added, as the first
94                  //              key char is already a N
95                  new String[] { "KNUTH", "NAT" },           // Original: NNAT; modified: NATH
96                  // O and E are transcoded to A because of rule 4a
97                  // H also to A because of rule 4h
98                  // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
99                  // that skips the next char in such a case?
100                 // the remaining A is removed because of rule 7
101                 new String[] { "KOEHN", "CAN" },           // Original: C
102                 // violates 4j: see also KNUTH
103                 new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
104                 // violates 4j: see also KNUTH
105                 new String[] { "PFEISTER", "FASTAR" },     // Original: FFASTA[R]
106                 // violates 4j: see also KNUTH
107                 new String[] { "SCHOENHOEFT", "SANAFT" },  // Original: SSANAF[T]
108                 // 2. Transcode last characters of name:
109                 new String[] { "MCKEE", "MCY" },
110                 new String[] { "MACKIE", "MCY" },
111                 new String[] { "HEITSCHMIDT", "HATSNAD" },
112                 new String[] { "BART", "BAD" },
113                 new String[] { "HURD", "HAD" },
114                 new String[] { "HUNT", "HAD" },
115                 new String[] { "WESTERLUND", "WASTARLAD" },
116                 // 4. Transcode remaining characters by following these rules,
117                 //    incrementing by one character each time:
118                 new String[] { "CASSTEVENS", "CASTAFAN" },
119                 new String[] { "VASQUEZ", "VASG" },
120                 new String[] { "FRAZIER", "FRASAR" },
121                 new String[] { "BOWMAN", "BANAN" },
122                 new String[] { "MCKNIGHT", "MCNAGT" },
123                 new String[] { "RICKERT", "RACAD" },
124                 // violates 5: the last S is not removed
125                 // when comparing to DEUTS, which is phonetically similar
126                 // the result it also DAT, which is correct for DEUTSCH too imo
127                 new String[] { "DEUTSCH", "DAT" },         // Original: DATS
128                 new String[] { "WESTPHAL", "WASTFAL" },
129                 // violates 4h: the H should be transcoded to S and thus ignored as
130                 // the first key character is also S
131                 new String[] { "SHRIVER", "SRAVAR" },      // Original: SHRAVA[R]
132                 // same as KOEHN, the L gets mysteriously lost
133                 new String[] { "KUHL", "CAL" },            // Original: C
134                 new String[] { "RAWSON", "RASAN" },
135                 // If last character is S, remove it
136                 new String[] { "JILES", "JAL" },
137                 // violates 6: if the last two characters are AY, remove A
138                 new String[] { "CARRAWAY", "CARY" },       // Original: CARAY
139                 new String[] { "YAMADA", "YANAD" });
140     }
141 
142     @Test
143     public void testFal() {
144         this.encodeAll(new String[] { "Phil" }, "FAL");
145     }
146 
147     /**
148      * Tests data gathered from around the internets.*/
149     @Test
150     public void testOthers() {
151         this.assertEncodings(
152                 new String[] { "O'Daniel", "ODANAL" },
153                 new String[] { "O'Donnel", "ODANAL" },
154                 new String[] { "Cory", "CARY" },
155                 new String[] { "Corey", "CARY" },
156                 new String[] { "Kory", "CARY" },
157                 //
158                 new String[] { "FUZZY", "FASY" });
159     }
160 
161     /**
162      * Tests rule 1: Translate first characters of name: MAC → MCC, KN → N, K → C, PH, PF → FF, SCH → SSS*/
163     @Test
164     public void testRule1() {
165         this.assertEncodings(
166                 new String[] { "MACX", "MCX" },
167                 new String[] { "KNX", "NX" },
168                 new String[] { "KX", "CX" },
169                 new String[] { "PHX", "FX" },
170                 new String[] { "PFX", "FX" },
171                 new String[] { "SCHX", "SX" });
172     }
173 
174     /**
175      * Tests rule 2: Translate last characters of name: EE → Y, IE → Y, DT, RT, RD, NT, ND → D*/
176     @Test
177     public void testRule2() {
178         this.assertEncodings(
179                 new String[] { "XEE", "XY" },
180                 new String[] { "XIE", "XY" },
181                 new String[] { "XDT", "XD" },
182                 new String[] { "XRT", "XD" },
183                 new String[] { "XRD", "XD" },
184                 new String[] { "XNT", "XD" },
185                 new String[] { "XND", "XD" });
186     }
187 
188     /**
189      * Tests rule 4.1: EV → AF else A, E, I, O, U → A*/
190     @Test
191     public void testRule4Dot1() {
192         this.assertEncodings(
193                 new String[] { "XEV", "XAF" },
194                 new String[] { "XAX", "XAX" },
195                 new String[] { "XEX", "XAX" },
196                 new String[] { "XIX", "XAX" },
197                 new String[] { "XOX", "XAX" },
198                 new String[] { "XUX", "XAX" });
199     }
200 
201     /**
202      * Tests rule 4.2: Q → G, Z → S, M → N*/
203     @Test
204     public void testRule4Dot2() {
205         this.assertEncodings(
206                 new String[] { "XQ", "XG" },
207                 new String[] { "XZ", "X" },
208                 new String[] { "XM", "XN" });
209     }
210 
211     /**
212      * Tests rule 5: If last character is S, remove it.*/
213     @Test
214     public void testRule5() {
215         this.assertEncodings(
216                 new String[] { "XS", "X" },
217                 new String[] { "XSS", "X" });
218     }
219 
220     /**
221      * Tests rule 6: If last characters are AY, replace with Y.*/
222     @Test
223     public void testRule6() {
224         this.assertEncodings(
225                 new String[] { "XAY", "XY" },
226                 new String[] { "XAYS", "XY" }); // Rules 5, 6
227     }
228 
229     /**
230      * Tests rule 7: If last character is A, remove it.*/
231     @Test
232     public void testRule7() {
233         this.assertEncodings(
234                 new String[] { "XA", "X" },
235                 new String[] { "XAS", "X" }); // Rules 5, 7
236     }
237     @Test
238     public void testSnad() {
239         // Data Quality and Record Linkage Techniques P.121 claims this is SNAT,
240         // but it should be SNAD
241         this.encodeAll(new String[] { "Schmidt" }, "SNAD");
242     }
243 
244     @Test
245     public void testSnat() {
246         this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
247     }
248 
249     @Test
250     public void testSpecialBranches() {
251         this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
252         this.encodeAll(new String[] { "Kocher" }, "CACAR");
253         this.encodeAll(new String[] { "Fesca" }, "FASC");
254         this.encodeAll(new String[] { "Shom" }, "SAN");
255         this.encodeAll(new String[] { "Ohlo" }, "OL");
256         this.encodeAll(new String[] { "Uhu" }, "UH");
257         this.encodeAll(new String[] { "Um" }, "UN");
258     }
259 
260     @Test
261     public void testTranan() {
262         this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
263     }
264 
265     @Test
266     public void testTrueVariant() {
267         final Nysiis encoder = new Nysiis(true);
268 
269         final String encoded = encoder.encode("WESTERLUND");
270         assertTrue(encoded.length() <= 6);
271         assertEquals("WASTAR", encoded);
272     }
273 
274 }