001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import java.util.regex.Pattern;
021    
022    import org.apache.commons.codec.EncoderException;
023    import org.apache.commons.codec.StringEncoder;
024    
025    /**
026     * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
027     * general purpose scheme to find word with similar phonemes.
028     * <p>
029     * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
030     * <p>
031     * Algorithm description:
032     * <pre>
033     * 1. Transcode first characters of name
034     *   1a. MAC ->   MCC
035     *   1b. KN  ->   NN
036     *   1c. K   ->   C
037     *   1d. PH  ->   FF
038     *   1e. PF  ->   FF
039     *   1f. SCH ->   SSS
040     * 2. Transcode last characters of name
041     *   2a. EE, IE          ->   Y
042     *   2b. DT,RT,RD,NT,ND  ->   D
043     * 3. First character of key = first character of name
044     * 4. Transcode remaining characters by following these rules, incrementing by one character each time
045     *   4a. EV  ->   AF  else A,E,I,O,U -> A
046     *   4b. Q   ->   G
047     *   4c. Z   ->   S
048     *   4d. M   ->   N
049     *   4e. KN  ->   N   else K -> C
050     *   4f. SCH ->   SSS
051     *   4g. PH  ->   FF
052     *   4h. H   ->   If previous or next is nonvowel, previous
053     *   4i. W   ->   If previous is vowel, previous
054     *   4j. Add current to key if current != last key character
055     * 5. If last character is S, remove it
056     * 6. If last characters are AY, replace with Y
057     * 7. If last character is A, remove it
058     * 8. Collapse all strings of repeated characters
059     * 9. Add original first character of name as first character of key
060     * </pre>
061     * <p>
062     * This class is immutable and thread-safe.
063     *
064     * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
065     * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
066     * @see Soundex
067     * @since 1.7
068     * @version $Id: Nysiis.html 889935 2013-12-11 05:05:13Z ggregory $
069     */
070    public class Nysiis implements StringEncoder {
071    
072        private static final char[] CHARS_A   = new char[] { 'A' };
073        private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
074        private static final char[] CHARS_C   = new char[] { 'C' };
075        private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
076        private static final char[] CHARS_G   = new char[] { 'G' };
077        private static final char[] CHARS_N   = new char[] { 'N' };
078        private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
079        private static final char[] CHARS_S   = new char[] { 'S' };
080        private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
081    
082        private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
083        private static final Pattern PAT_KN     = Pattern.compile("^KN");
084        private static final Pattern PAT_K      = Pattern.compile("^K");
085        private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
086        private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
087        private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
088        private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
089    
090        private static final char SPACE = ' ';
091        private static final int TRUE_LENGTH = 6;
092    
093        /**
094         * Tests if the given character is a vowel.
095         *
096         * @param c
097         *            the character to test
098         * @return {@code true} if the character is a vowel, {@code false} otherwise
099         */
100        private static boolean isVowel(final char c) {
101            return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
102        }
103    
104        /**
105         * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
106         * a time: [i-1, i, i+1, i+2].
107         *
108         * @param prev
109         *            the previous character
110         * @param curr
111         *            the current character
112         * @param next
113         *            the next character
114         * @param aNext
115         *            the after next character
116         * @return a transcoded array of characters, starting from the current position
117         */
118        private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
119            // 1. EV -> AF
120            if (curr == 'E' && next == 'V') {
121                return CHARS_AF;
122            }
123    
124            // A, E, I, O, U -> A
125            if (isVowel(curr)) {
126                return CHARS_A;
127            }
128    
129            // 2. Q -> G, Z -> S, M -> N
130            if (curr == 'Q') {
131                return CHARS_G;
132            } else if (curr == 'Z') {
133                return CHARS_S;
134            } else if (curr == 'M') {
135                return CHARS_N;
136            }
137    
138            // 3. KN -> NN else K -> C
139            if (curr == 'K') {
140                if (next == 'N') {
141                    return CHARS_NN;
142                } else {
143                    return CHARS_C;
144                }
145            }
146    
147            // 4. SCH -> SSS
148            if (curr == 'S' && next == 'C' && aNext == 'H') {
149                return CHARS_SSS;
150            }
151    
152            // PH -> FF
153            if (curr == 'P' && next == 'H') {
154                return CHARS_FF;
155            }
156    
157            // 5. H -> If previous or next is a non vowel, previous.
158            if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
159                return new char[] { prev };
160            }
161    
162            // 6. W -> If previous is vowel, previous.
163            if (curr == 'W' && isVowel(prev)) {
164                return new char[] { prev };
165            }
166    
167            return new char[] { curr };
168        }
169    
170        /** Indicates the strict mode. */
171        private final boolean strict;
172    
173        /**
174         * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
175         * i.e. encoded strings have a maximum length of 6.
176         */
177        public Nysiis() {
178            this(true);
179        }
180    
181        /**
182         * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
183         *
184         * <ul>
185         *  <li>{@code true}: encoded strings have a maximum length of 6</li>
186         *  <li>{@code false}: encoded strings may have arbitrary length</li>
187         * </ul>
188         *
189         * @param strict
190         *            the strict mode
191         */
192        public Nysiis(final boolean strict) {
193            this.strict = strict;
194        }
195    
196        /**
197         * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
198         * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
199         * {@link String}.
200         *
201         * @param obj
202         *            Object to encode
203         * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
204         * @throws EncoderException
205         *            if the parameter supplied is not of a {@link String}
206         * @throws IllegalArgumentException
207         *            if a character is not mapped
208         */
209        @Override
210        public Object encode(Object obj) throws EncoderException {
211            if (!(obj instanceof String)) {
212                throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
213            }
214            return this.nysiis((String) obj);
215        }
216    
217        /**
218         * Encodes a String using the NYSIIS algorithm.
219         *
220         * @param str
221         *            A String object to encode
222         * @return A Nysiis code corresponding to the String supplied
223         * @throws IllegalArgumentException
224         *            if a character is not mapped
225         */
226        @Override
227        public String encode(String str) {
228            return this.nysiis(str);
229        }
230    
231        /**
232         * Indicates the strict mode for this {@link Nysiis} encoder.
233         *
234         * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
235         */
236        public boolean isStrict() {
237            return this.strict;
238        }
239    
240        /**
241         * Retrieves the NYSIIS code for a given String object.
242         *
243         * @param str
244         *            String to encode using the NYSIIS algorithm
245         * @return A NYSIIS code for the String supplied
246         */
247        public String nysiis(String str) {
248            if (str == null) {
249                return null;
250            }
251    
252            // Use the same clean rules as Soundex
253            str = SoundexUtils.clean(str);
254    
255            if (str.length() == 0) {
256                return str;
257            }
258    
259            // Translate first characters of name:
260            // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
261            str = PAT_MAC.matcher(str).replaceFirst("MCC");
262            str = PAT_KN.matcher(str).replaceFirst("NN");
263            str = PAT_K.matcher(str).replaceFirst("C");
264            str = PAT_PH_PF.matcher(str).replaceFirst("FF");
265            str = PAT_SCH.matcher(str).replaceFirst("SSS");
266    
267            // Translate last characters of name:
268            // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
269            str = PAT_EE_IE.matcher(str).replaceFirst("Y");
270            str = PAT_DT_ETC.matcher(str).replaceFirst("D");
271    
272            // First character of key = first character of name.
273            StringBuilder key = new StringBuilder(str.length());
274            key.append(str.charAt(0));
275    
276            // Transcode remaining characters, incrementing by one character each time
277            final char[] chars = str.toCharArray();
278            final int len = chars.length;
279    
280            for (int i = 1; i < len; i++) {
281                final char next = i < len - 1 ? chars[i + 1] : SPACE;
282                final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
283                final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
284                System.arraycopy(transcoded, 0, chars, i, transcoded.length);
285    
286                // only append the current char to the key if it is different from the last one
287                if (chars[i] != chars[i - 1]) {
288                    key.append(chars[i]);
289                }
290            }
291    
292            if (key.length() > 1) {
293                char lastChar = key.charAt(key.length() - 1);
294    
295                // If last character is S, remove it.
296                if (lastChar == 'S') {
297                    key.deleteCharAt(key.length() - 1);
298                    lastChar = key.charAt(key.length() - 1);
299                }
300    
301                if (key.length() > 2) {
302                    final char last2Char = key.charAt(key.length() - 2);
303                    // If last characters are AY, replace with Y.
304                    if (last2Char == 'A' && lastChar == 'Y') {
305                        key.deleteCharAt(key.length() - 2);
306                    }
307                }
308    
309                // If last character is A, remove it.
310                if (lastChar == 'A') {
311                    key.deleteCharAt(key.length() - 1);
312                }
313            }
314    
315            final String string = key.toString();
316            return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
317        }
318    
319    }