001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    import org.apache.commons.codec.StringEncoderAbstractTest;
023    import org.junit.Assert;
024    import org.junit.Test;
025    
026    /**
027     * Tests {@link Nysiis}
028     *
029     * @since 1.7
030     * @version $Id: NysiisTest.html 889935 2013-12-11 05:05:13Z ggregory $
031     */
032    public class NysiisTest extends StringEncoderAbstractTest {
033    
034        private final Nysiis fullNysiis = new Nysiis(false);
035    
036        /**
037         * Takes an array of String pairs where each pair's first element is the input and the second element the expected
038         * encoding.
039         *
040         * @param testValues
041         *            an array of String pairs where each pair's first element is the input and the second element the
042         *            expected encoding.
043         * @throws EncoderException
044         */
045        private void assertEncodings(String[]... testValues) throws EncoderException {
046            for (String[] arr : testValues) {
047                Assert.assertEquals("Problem with " + arr[0], arr[1], this.fullNysiis.encode(arr[0]));
048            }
049        }
050    
051        @Override
052        protected StringEncoder createStringEncoder() {
053            return new Nysiis();
054        }
055    
056        private void encodeAll(String[] strings, String expectedEncoding) throws EncoderException {
057            for (String string : strings) {
058                Assert.assertEquals("Problem with " + string, expectedEncoding, getStringEncoder().encode(string));
059            }
060        }
061    
062        @Test
063        public void testBran() throws EncoderException {
064            encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
065        }
066    
067        @Test
068        public void testCap() throws EncoderException {
069            this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
070        }
071    
072        @Test
073        public void testDad() throws EncoderException {
074            // Data Quality and Record Linkage Techniques P.121 claims this is DAN,
075            // but it should be DAD, verified also with dropby.com
076            this.encodeAll(new String[] { "Dent" }, "DAD");
077        }
078    
079        @Test
080        public void testDan() throws EncoderException {
081            this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
082        }
083    
084        /**
085         * Tests data gathered from around the internet.
086         *
087         * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
088         * @throws EncoderException
089         */
090        @Test
091        public void testDropBy() throws EncoderException {
092            // Explanation of differences between this implementation and the one at dropby.com is
093            // prepended to the test string. The referenced rules refer to the outlined steps the
094            // class description for Nysiis.
095    
096            this.assertEncodings(
097                    // 1. Transcode first characters of name
098                    new String[] { "MACINTOSH", "MCANT" },
099                    // violates 4j: the second N should not be added, as the first
100                    //              key char is already a N
101                    new String[] { "KNUTH", "NAT" },           // Original: NNAT; modified: NATH
102                    // O and E are transcoded to A because of rule 4a
103                    // H also to A because of rule 4h
104                    // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
105                    // that skips the next char in such a case?
106                    // the remaining A is removed because of rule 7
107                    new String[] { "KOEHN", "CAN" },           // Original: C
108                    // violates 4j: see also KNUTH
109                    new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
110                    // violates 4j: see also KNUTH
111                    new String[] { "PFEISTER", "FASTAR" },     // Original: FFASTA[R]
112                    // violates 4j: see also KNUTH
113                    new String[] { "SCHOENHOEFT", "SANAFT" },  // Original: SSANAF[T]
114                    // 2. Transcode last characters of name:
115                    new String[] { "MCKEE", "MCY" },
116                    new String[] { "MACKIE", "MCY" },
117                    new String[] { "HEITSCHMIDT", "HATSNAD" },
118                    new String[] { "BART", "BAD" },
119                    new String[] { "HURD", "HAD" },
120                    new String[] { "HUNT", "HAD" },
121                    new String[] { "WESTERLUND", "WASTARLAD" },
122                    // 4. Transcode remaining characters by following these rules,
123                    //    incrementing by one character each time:
124                    new String[] { "CASSTEVENS", "CASTAFAN" },
125                    new String[] { "VASQUEZ", "VASG" },
126                    new String[] { "FRAZIER", "FRASAR" },
127                    new String[] { "BOWMAN", "BANAN" },
128                    new String[] { "MCKNIGHT", "MCNAGT" },
129                    new String[] { "RICKERT", "RACAD" },
130                    // violates 5: the last S is not removed
131                    // when comparing to DEUTS, which is phonetically similar
132                    // the result it also DAT, which is correct for DEUTSCH too imo
133                    new String[] { "DEUTSCH", "DAT" },         // Original: DATS
134                    new String[] { "WESTPHAL", "WASTFAL" },
135                    // violates 4h: the H should be transcoded to S and thus ignored as
136                    // the first key character is also S
137                    new String[] { "SHRIVER", "SRAVAR" },      // Original: SHRAVA[R]
138                    // same as KOEHN, the L gets mysteriously lost
139                    new String[] { "KUHL", "CAL" },            // Original: C
140                    new String[] { "RAWSON", "RASAN" },
141                    // If last character is S, remove it
142                    new String[] { "JILES", "JAL" },
143                    // violates 6: if the last two characters are AY, remove A
144                    new String[] { "CARRAWAY", "CARY" },       // Original: CARAY
145                    new String[] { "YAMADA", "YANAD" });
146        }
147    
148        @Test
149        public void testFal() throws EncoderException {
150            this.encodeAll(new String[] { "Phil" }, "FAL");
151        }
152    
153        /**
154         * Tests data gathered from around the internets.
155         *
156         * @throws EncoderException
157         */
158        @Test
159        public void testOthers() throws EncoderException {
160            this.assertEncodings(
161                    new String[] { "O'Daniel", "ODANAL" },
162                    new String[] { "O'Donnel", "ODANAL" },
163                    new String[] { "Cory", "CARY" },
164                    new String[] { "Corey", "CARY" },
165                    new String[] { "Kory", "CARY" },
166                    //
167                    new String[] { "FUZZY", "FASY" });
168        }
169    
170        /**
171         * Tests rule 1: Translate first characters of name: MAC → MCC, KN → N, K → C, PH, PF → FF, SCH → SSS
172         *
173         * @throws EncoderException
174         */
175        @Test
176        public void testRule1() throws EncoderException {
177            this.assertEncodings(
178                    new String[] { "MACX", "MCX" },
179                    new String[] { "KNX", "NX" },
180                    new String[] { "KX", "CX" },
181                    new String[] { "PHX", "FX" },
182                    new String[] { "PFX", "FX" },
183                    new String[] { "SCHX", "SX" });
184        }
185    
186        /**
187         * Tests rule 2: Translate last characters of name: EE → Y, IE → Y, DT, RT, RD, NT, ND → D
188         *
189         * @throws EncoderException
190         */
191        @Test
192        public void testRule2() throws EncoderException {
193            this.assertEncodings(
194                    new String[] { "XEE", "XY" },
195                    new String[] { "XIE", "XY" },
196                    new String[] { "XDT", "XD" },
197                    new String[] { "XRT", "XD" },
198                    new String[] { "XRD", "XD" },
199                    new String[] { "XNT", "XD" },
200                    new String[] { "XND", "XD" });
201        }
202    
203        /**
204         * Tests rule 4.1: EV → AF else A, E, I, O, U → A
205         *
206         * @throws EncoderException
207         */
208        @Test
209        public void testRule4Dot1() throws EncoderException {
210            this.assertEncodings(
211                    new String[] { "XEV", "XAF" },
212                    new String[] { "XAX", "XAX" },
213                    new String[] { "XEX", "XAX" },
214                    new String[] { "XIX", "XAX" },
215                    new String[] { "XOX", "XAX" },
216                    new String[] { "XUX", "XAX" });
217        }
218    
219        /**
220         * Tests rule 4.2: Q → G, Z → S, M → N
221         *
222         * @throws EncoderException
223         */
224        @Test
225        public void testRule4Dot2() throws EncoderException {
226            this.assertEncodings(
227                    new String[] { "XQ", "XG" },
228                    new String[] { "XZ", "X" },
229                    new String[] { "XM", "XN" });
230        }
231    
232        /**
233         * Tests rule 5: If last character is S, remove it.
234         *
235         * @throws EncoderException
236         */
237        @Test
238        public void testRule5() throws EncoderException {
239            this.assertEncodings(
240                    new String[] { "XS", "X" },
241                    new String[] { "XSS", "X" });
242        }
243    
244        /**
245         * Tests rule 6: If last characters are AY, replace with Y.
246         *
247         * @throws EncoderException
248         */
249        @Test
250        public void testRule6() throws EncoderException {
251            this.assertEncodings(
252                    new String[] { "XAY", "XY" },
253                    new String[] { "XAYS", "XY" }); // Rules 5, 6
254        }
255    
256        /**
257         * Tests rule 7: If last character is A, remove it.
258         *
259         * @throws EncoderException
260         */
261        @Test
262        public void testRule7() throws EncoderException {
263            this.assertEncodings(
264                    new String[] { "XA", "X" },
265                    new String[] { "XAS", "X" }); // Rules 5, 7
266        }
267        @Test
268        public void testSnad() throws EncoderException {
269            // Data Quality and Record Linkage Techniques P.121 claims this is SNAT,
270            // but it should be SNAD
271            this.encodeAll(new String[] { "Schmidt" }, "SNAD");
272        }
273    
274        @Test
275        public void testSnat() throws EncoderException {
276            this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
277        }
278    
279        @Test
280        public void testSpecialBranches() throws EncoderException {
281            this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
282            this.encodeAll(new String[] { "Kocher" }, "CACAR");
283            this.encodeAll(new String[] { "Fesca" }, "FASC");
284            this.encodeAll(new String[] { "Shom" }, "SAN");
285            this.encodeAll(new String[] { "Ohlo" }, "OL");
286            this.encodeAll(new String[] { "Uhu" }, "UH");
287            this.encodeAll(new String[] { "Um" }, "UN");
288        }
289    
290        @Test
291        public void testTranan() throws EncoderException {
292            this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
293        }
294    
295        @Test
296        public void testTrueVariant() {
297            Nysiis encoder = new Nysiis(true);
298    
299            String encoded = encoder.encode("WESTERLUND");
300            Assert.assertTrue(encoded.length() <= 6);
301            Assert.assertEquals("WASTAR", encoded);
302        }
303    
304    }