001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import java.util.regex.Pattern;
021
022import org.apache.commons.codec.EncoderException;
023import org.apache.commons.codec.StringEncoder;
024
025/**
026 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
027 * general purpose scheme to find word with similar phonemes.
028 * <p>
029 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
030 * </p>
031 * <p>
032 * Algorithm description:
033 * </p>
034 * <pre>
035 * 1. Transcode first characters of name
036 *   1a. MAC -&gt;   MCC
037 *   1b. KN  -&gt;   NN
038 *   1c. K   -&gt;   C
039 *   1d. PH  -&gt;   FF
040 *   1e. PF  -&gt;   FF
041 *   1f. SCH -&gt;   SSS
042 * 2. Transcode last characters of name
043 *   2a. EE, IE          -&gt;   Y
044 *   2b. DT,RT,RD,NT,ND  -&gt;   D
045 * 3. First character of key = first character of name
046 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
047 *   4a. EV  -&gt;   AF  else A,E,I,O,U -&gt; A
048 *   4b. Q   -&gt;   G
049 *   4c. Z   -&gt;   S
050 *   4d. M   -&gt;   N
051 *   4e. KN  -&gt;   N   else K -&gt; C
052 *   4f. SCH -&gt;   SSS
053 *   4g. PH  -&gt;   FF
054 *   4h. H   -&gt;   If previous or next is non-vowel, previous
055 *   4i. W   -&gt;   If previous is vowel, previous
056 *   4j. Add current to key if current != last key character
057 * 5. If last character is S, remove it
058 * 6. If last characters are AY, replace with Y
059 * 7. If last character is A, remove it
060 * 8. Collapse all strings of repeated characters
061 * 9. Add original first character of name as first character of key
062 * </pre>
063 * <p>
064 * This class is immutable and thread-safe.
065 * </p>
066 *
067 * @see <a href="https://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
068 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
069 * @see Soundex
070 * @since 1.7
071 */
072public class Nysiis implements StringEncoder {
073
074    private static final char[] CHARS_A   = { 'A' };
075    private static final char[] CHARS_AF  = { 'A', 'F' };
076    private static final char[] CHARS_C   = { 'C' };
077    private static final char[] CHARS_FF  = { 'F', 'F' };
078    private static final char[] CHARS_G   = { 'G' };
079    private static final char[] CHARS_N   = { 'N' };
080    private static final char[] CHARS_NN  = { 'N', 'N' };
081    private static final char[] CHARS_S   = { 'S' };
082    private static final char[] CHARS_SSS = { 'S', 'S', 'S' };
083
084    private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
085    private static final Pattern PAT_KN     = Pattern.compile("^KN");
086    private static final Pattern PAT_K      = Pattern.compile("^K");
087    private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
088    private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
089    private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
090    private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
091
092    private static final char SPACE = ' ';
093    private static final int TRUE_LENGTH = 6;
094
095    /**
096     * Tests if the given character is a vowel.
097     *
098     * @param c
099     *            the character to test
100     * @return {@code true} if the character is a vowel, {@code false} otherwise
101     */
102    private static boolean isVowel(final char c) {
103        return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
104    }
105
106    /**
107     * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
108     * a time: [i-1, i, i+1, i+2].
109     *
110     * @param prev
111     *            the previous character
112     * @param curr
113     *            the current character
114     * @param next
115     *            the next character
116     * @param aNext
117     *            the after next character
118     * @return a transcoded array of characters, starting from the current position
119     */
120    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
121        // 1. EV -> AF
122        if (curr == 'E' && next == 'V') {
123            return CHARS_AF;
124        }
125
126        // A, E, I, O, U -> A
127        if (isVowel(curr)) {
128            return CHARS_A;
129        }
130
131        // 2. Q -> G, Z -> S, M -> N
132
133        // 3. KN -> NN else K -> C
134        switch (curr) {
135        case 'Q':
136            return CHARS_G;
137        case 'Z':
138            return CHARS_S;
139        case 'M':
140            return CHARS_N;
141        case 'K':
142            if (next == 'N') {
143                return CHARS_NN;
144            }
145            return CHARS_C;
146        default:
147            break;
148        }
149
150        // 4. SCH -> SSS
151        if (curr == 'S' && next == 'C' && aNext == 'H') {
152            return CHARS_SSS;
153        }
154
155        // PH -> FF
156        if (curr == 'P' && next == 'H') {
157            return CHARS_FF;
158        }
159
160        // 5. H -> If previous or next is a non vowel, previous.
161        if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
162            return new char[] { prev };
163        }
164
165        // 6. W -> If previous is vowel, previous.
166        if (curr == 'W' && isVowel(prev)) {
167            return new char[] { prev };
168        }
169
170        return new char[] { curr };
171    }
172
173    /** Indicates the strict mode. */
174    private final boolean strict;
175
176    /**
177     * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
178     * i.e. encoded strings have a maximum length of 6.
179     */
180    public Nysiis() {
181        this(true);
182    }
183
184    /**
185     * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
186     *
187     * <ul>
188     *  <li>{@code true}: encoded strings have a maximum length of 6</li>
189     *  <li>{@code false}: encoded strings may have arbitrary length</li>
190     * </ul>
191     *
192     * @param strict
193     *            the strict mode
194     */
195    public Nysiis(final boolean strict) {
196        this.strict = strict;
197    }
198
199    /**
200     * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
201     * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
202     * {@link String}.
203     *
204     * @param obj
205     *            Object to encode
206     * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
207     * @throws EncoderException
208     *            if the parameter supplied is not of a {@link String}
209     * @throws IllegalArgumentException
210     *            if a character is not mapped
211     */
212    @Override
213    public Object encode(final Object obj) throws EncoderException {
214        if (!(obj instanceof String)) {
215            throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
216        }
217        return nysiis((String) obj);
218    }
219
220    /**
221     * Encodes a String using the NYSIIS algorithm.
222     *
223     * @param str
224     *            A String object to encode
225     * @return A Nysiis code corresponding to the String supplied
226     * @throws IllegalArgumentException
227     *            if a character is not mapped
228     */
229    @Override
230    public String encode(final String str) {
231        return nysiis(str);
232    }
233
234    /**
235     * Indicates the strict mode for this {@link Nysiis} encoder.
236     *
237     * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
238     */
239    public boolean isStrict() {
240        return this.strict;
241    }
242
243    /**
244     * Retrieves the NYSIIS code for a given String object.
245     *
246     * @param str
247     *            String to encode using the NYSIIS algorithm
248     * @return A NYSIIS code for the String supplied
249     */
250    public String nysiis(String str) {
251        if (str == null) {
252            return null;
253        }
254
255        // Use the same clean rules as Soundex
256        str = SoundexUtils.clean(str);
257
258        if (str.isEmpty()) {
259            return str;
260        }
261
262        // Translate first characters of name:
263        // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
264        str = PAT_MAC.matcher(str).replaceFirst("MCC");
265        str = PAT_KN.matcher(str).replaceFirst("NN");
266        str = PAT_K.matcher(str).replaceFirst("C");
267        str = PAT_PH_PF.matcher(str).replaceFirst("FF");
268        str = PAT_SCH.matcher(str).replaceFirst("SSS");
269
270        // Translate last characters of name:
271        // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
272        str = PAT_EE_IE.matcher(str).replaceFirst("Y");
273        str = PAT_DT_ETC.matcher(str).replaceFirst("D");
274
275        // First character of key = first character of name.
276        final StringBuilder key = new StringBuilder(str.length());
277        key.append(str.charAt(0));
278
279        // Transcode remaining characters, incrementing by one character each time
280        final char[] chars = str.toCharArray();
281        final int len = chars.length;
282
283        for (int i = 1; i < len; i++) {
284            final char next = i < len - 1 ? chars[i + 1] : SPACE;
285            final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
286            final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
287            System.arraycopy(transcoded, 0, chars, i, transcoded.length);
288
289            // only append the current char to the key if it is different from the last one
290            if (chars[i] != chars[i - 1]) {
291                key.append(chars[i]);
292            }
293        }
294
295        if (key.length() > 1) {
296            char lastChar = key.charAt(key.length() - 1);
297
298            // If last character is S, remove it.
299            if (lastChar == 'S') {
300                key.deleteCharAt(key.length() - 1);
301                lastChar = key.charAt(key.length() - 1);
302            }
303
304            if (key.length() > 2) {
305                final char last2Char = key.charAt(key.length() - 2);
306                // If last characters are AY, replace with Y.
307                if (last2Char == 'A' && lastChar == 'Y') {
308                    key.deleteCharAt(key.length() - 2);
309                }
310            }
311
312            // If last character is A, remove it.
313            if (lastChar == 'A') {
314                key.deleteCharAt(key.length() - 1);
315            }
316        }
317
318        final String string = key.toString();
319        return isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
320    }
321
322}