001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoderAbstractTest;
022import org.junit.Assert;
023import org.junit.Test;
024
025/**
026 * Tests {@link Nysiis}
027 *
028 * @since 1.7
029 * @version $Id: NysiisTest.html 891688 2013-12-24 20:49:46Z ggregory $
030 */
031public class NysiisTest extends StringEncoderAbstractTest<Nysiis> {
032
033    private final Nysiis fullNysiis = new Nysiis(false);
034
035    /**
036     * Takes an array of String pairs where each pair's first element is the input and the second element the expected
037     * encoding.
038     *
039     * @param testValues
040     *            an array of String pairs where each pair's first element is the input and the second element the
041     *            expected encoding.
042     * @throws EncoderException
043     */
044    private void assertEncodings(final String[]... testValues) throws EncoderException {
045        for (final String[] arr : testValues) {
046            Assert.assertEquals("Problem with " + arr[0], arr[1], this.fullNysiis.encode(arr[0]));
047        }
048    }
049
050    @Override
051    protected Nysiis createStringEncoder() {
052        return new Nysiis();
053    }
054
055    private void encodeAll(final String[] strings, final String expectedEncoding) {
056        for (final String string : strings) {
057            Assert.assertEquals("Problem with " + string, expectedEncoding, getStringEncoder().encode(string));
058        }
059    }
060
061    @Test
062    public void testBran() {
063        encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
064    }
065
066    @Test
067    public void testCap() {
068        this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
069    }
070
071    @Test
072    public void testDad() {
073        // Data Quality and Record Linkage Techniques P.121 claims this is DAN,
074        // but it should be DAD, verified also with dropby.com
075        this.encodeAll(new String[] { "Dent" }, "DAD");
076    }
077
078    @Test
079    public void testDan() {
080        this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
081    }
082
083    /**
084     * Tests data gathered from around the internet.
085     *
086     * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
087     * @throws EncoderException
088     */
089    @Test
090    public void testDropBy() throws EncoderException {
091        // Explanation of differences between this implementation and the one at dropby.com is
092        // prepended to the test string. The referenced rules refer to the outlined steps the
093        // class description for Nysiis.
094
095        this.assertEncodings(
096                // 1. Transcode first characters of name
097                new String[] { "MACINTOSH", "MCANT" },
098                // violates 4j: the second N should not be added, as the first
099                //              key char is already a N
100                new String[] { "KNUTH", "NAT" },           // Original: NNAT; modified: NATH
101                // O and E are transcoded to A because of rule 4a
102                // H also to A because of rule 4h
103                // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
104                // that skips the next char in such a case?
105                // the remaining A is removed because of rule 7
106                new String[] { "KOEHN", "CAN" },           // Original: C
107                // violates 4j: see also KNUTH
108                new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
109                // violates 4j: see also KNUTH
110                new String[] { "PFEISTER", "FASTAR" },     // Original: FFASTA[R]
111                // violates 4j: see also KNUTH
112                new String[] { "SCHOENHOEFT", "SANAFT" },  // Original: SSANAF[T]
113                // 2. Transcode last characters of name:
114                new String[] { "MCKEE", "MCY" },
115                new String[] { "MACKIE", "MCY" },
116                new String[] { "HEITSCHMIDT", "HATSNAD" },
117                new String[] { "BART", "BAD" },
118                new String[] { "HURD", "HAD" },
119                new String[] { "HUNT", "HAD" },
120                new String[] { "WESTERLUND", "WASTARLAD" },
121                // 4. Transcode remaining characters by following these rules,
122                //    incrementing by one character each time:
123                new String[] { "CASSTEVENS", "CASTAFAN" },
124                new String[] { "VASQUEZ", "VASG" },
125                new String[] { "FRAZIER", "FRASAR" },
126                new String[] { "BOWMAN", "BANAN" },
127                new String[] { "MCKNIGHT", "MCNAGT" },
128                new String[] { "RICKERT", "RACAD" },
129                // violates 5: the last S is not removed
130                // when comparing to DEUTS, which is phonetically similar
131                // the result it also DAT, which is correct for DEUTSCH too imo
132                new String[] { "DEUTSCH", "DAT" },         // Original: DATS
133                new String[] { "WESTPHAL", "WASTFAL" },
134                // violates 4h: the H should be transcoded to S and thus ignored as
135                // the first key character is also S
136                new String[] { "SHRIVER", "SRAVAR" },      // Original: SHRAVA[R]
137                // same as KOEHN, the L gets mysteriously lost
138                new String[] { "KUHL", "CAL" },            // Original: C
139                new String[] { "RAWSON", "RASAN" },
140                // If last character is S, remove it
141                new String[] { "JILES", "JAL" },
142                // violates 6: if the last two characters are AY, remove A
143                new String[] { "CARRAWAY", "CARY" },       // Original: CARAY
144                new String[] { "YAMADA", "YANAD" });
145    }
146
147    @Test
148    public void testFal() {
149        this.encodeAll(new String[] { "Phil" }, "FAL");
150    }
151
152    /**
153     * Tests data gathered from around the internets.
154     *
155     * @throws EncoderException
156     */
157    @Test
158    public void testOthers() throws EncoderException {
159        this.assertEncodings(
160                new String[] { "O'Daniel", "ODANAL" },
161                new String[] { "O'Donnel", "ODANAL" },
162                new String[] { "Cory", "CARY" },
163                new String[] { "Corey", "CARY" },
164                new String[] { "Kory", "CARY" },
165                //
166                new String[] { "FUZZY", "FASY" });
167    }
168
169    /**
170     * Tests rule 1: Translate first characters of name: MAC ? MCC, KN ? N, K ? C, PH, PF ? FF, SCH ? SSS
171     *
172     * @throws EncoderException
173     */
174    @Test
175    public void testRule1() throws EncoderException {
176        this.assertEncodings(
177                new String[] { "MACX", "MCX" },
178                new String[] { "KNX", "NX" },
179                new String[] { "KX", "CX" },
180                new String[] { "PHX", "FX" },
181                new String[] { "PFX", "FX" },
182                new String[] { "SCHX", "SX" });
183    }
184
185    /**
186     * Tests rule 2: Translate last characters of name: EE ? Y, IE ? Y, DT, RT, RD, NT, ND ? D
187     *
188     * @throws EncoderException
189     */
190    @Test
191    public void testRule2() throws EncoderException {
192        this.assertEncodings(
193                new String[] { "XEE", "XY" },
194                new String[] { "XIE", "XY" },
195                new String[] { "XDT", "XD" },
196                new String[] { "XRT", "XD" },
197                new String[] { "XRD", "XD" },
198                new String[] { "XNT", "XD" },
199                new String[] { "XND", "XD" });
200    }
201
202    /**
203     * Tests rule 4.1: EV ? AF else A, E, I, O, U ? A
204     *
205     * @throws EncoderException
206     */
207    @Test
208    public void testRule4Dot1() throws EncoderException {
209        this.assertEncodings(
210                new String[] { "XEV", "XAF" },
211                new String[] { "XAX", "XAX" },
212                new String[] { "XEX", "XAX" },
213                new String[] { "XIX", "XAX" },
214                new String[] { "XOX", "XAX" },
215                new String[] { "XUX", "XAX" });
216    }
217
218    /**
219     * Tests rule 4.2: Q ? G, Z ? S, M ? N
220     *
221     * @throws EncoderException
222     */
223    @Test
224    public void testRule4Dot2() throws EncoderException {
225        this.assertEncodings(
226                new String[] { "XQ", "XG" },
227                new String[] { "XZ", "X" },
228                new String[] { "XM", "XN" });
229    }
230
231    /**
232     * Tests rule 5: If last character is S, remove it.
233     *
234     * @throws EncoderException
235     */
236    @Test
237    public void testRule5() throws EncoderException {
238        this.assertEncodings(
239                new String[] { "XS", "X" },
240                new String[] { "XSS", "X" });
241    }
242
243    /**
244     * Tests rule 6: If last characters are AY, replace with Y.
245     *
246     * @throws EncoderException
247     */
248    @Test
249    public void testRule6() throws EncoderException {
250        this.assertEncodings(
251                new String[] { "XAY", "XY" },
252                new String[] { "XAYS", "XY" }); // Rules 5, 6
253    }
254
255    /**
256     * Tests rule 7: If last character is A, remove it.
257     *
258     * @throws EncoderException
259     */
260    @Test
261    public void testRule7() throws EncoderException {
262        this.assertEncodings(
263                new String[] { "XA", "X" },
264                new String[] { "XAS", "X" }); // Rules 5, 7
265    }
266    @Test
267    public void testSnad() {
268        // Data Quality and Record Linkage Techniques P.121 claims this is SNAT,
269        // but it should be SNAD
270        this.encodeAll(new String[] { "Schmidt" }, "SNAD");
271    }
272
273    @Test
274    public void testSnat() {
275        this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
276    }
277
278    @Test
279    public void testSpecialBranches() {
280        this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
281        this.encodeAll(new String[] { "Kocher" }, "CACAR");
282        this.encodeAll(new String[] { "Fesca" }, "FASC");
283        this.encodeAll(new String[] { "Shom" }, "SAN");
284        this.encodeAll(new String[] { "Ohlo" }, "OL");
285        this.encodeAll(new String[] { "Uhu" }, "UH");
286        this.encodeAll(new String[] { "Um" }, "UN");
287    }
288
289    @Test
290    public void testTranan() {
291        this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
292    }
293
294    @Test
295    public void testTrueVariant() {
296        final Nysiis encoder = new Nysiis(true);
297
298        final String encoded = encoder.encode("WESTERLUND");
299        Assert.assertTrue(encoded.length() <= 6);
300        Assert.assertEquals("WASTAR", encoded);
301    }
302
303}