View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import java.util.regex.Pattern;
21  
22  import org.apache.commons.codec.EncoderException;
23  import org.apache.commons.codec.StringEncoder;
24  
25  /**
26   * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
27   * general purpose scheme to find word with similar phonemes.
28   * <p>
29   * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
30   * <p>
31   * Algorithm description:
32   * <pre>
33   * 1. Transcode first characters of name
34   *   1a. MAC -&gt;   MCC
35   *   1b. KN  -&gt;   NN
36   *   1c. K   -&gt;   C
37   *   1d. PH  -&gt;   FF
38   *   1e. PF  -&gt;   FF
39   *   1f. SCH -&gt;   SSS
40   * 2. Transcode last characters of name
41   *   2a. EE, IE          -&gt;   Y
42   *   2b. DT,RT,RD,NT,ND  -&gt;   D
43   * 3. First character of key = first character of name
44   * 4. Transcode remaining characters by following these rules, incrementing by one character each time
45   *   4a. EV  -&gt;   AF  else A,E,I,O,U -&gt; A
46   *   4b. Q   -&gt;   G
47   *   4c. Z   -&gt;   S
48   *   4d. M   -&gt;   N
49   *   4e. KN  -&gt;   N   else K -&gt; C
50   *   4f. SCH -&gt;   SSS
51   *   4g. PH  -&gt;   FF
52   *   4h. H   -&gt;   If previous or next is nonvowel, previous
53   *   4i. W   -&gt;   If previous is vowel, previous
54   *   4j. Add current to key if current != last key character
55   * 5. If last character is S, remove it
56   * 6. If last characters are AY, replace with Y
57   * 7. If last character is A, remove it
58   * 8. Collapse all strings of repeated characters
59   * 9. Add original first character of name as first character of key
60   * </pre>
61   * <p>
62   * This class is immutable and thread-safe.
63   *
64   * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
65   * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
66   * @see Soundex
67   * @since 1.7
68   * @version $Id: Nysiis.java 1725161 2016-01-18 01:08:56Z ggregory $
69   */
70  public class Nysiis implements StringEncoder {
71  
72      private static final char[] CHARS_A   = new char[] { 'A' };
73      private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
74      private static final char[] CHARS_C   = new char[] { 'C' };
75      private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
76      private static final char[] CHARS_G   = new char[] { 'G' };
77      private static final char[] CHARS_N   = new char[] { 'N' };
78      private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
79      private static final char[] CHARS_S   = new char[] { 'S' };
80      private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
81  
82      private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
83      private static final Pattern PAT_KN     = Pattern.compile("^KN");
84      private static final Pattern PAT_K      = Pattern.compile("^K");
85      private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
86      private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
87      private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
88      private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
89  
90      private static final char SPACE = ' ';
91      private static final int TRUE_LENGTH = 6;
92  
93      /**
94       * Tests if the given character is a vowel.
95       *
96       * @param c
97       *            the character to test
98       * @return <code>true</code> if the character is a vowel, <code>false</code> otherwise
99       */
100     private static boolean isVowel(final char c) {
101         return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
102     }
103 
104     /**
105      * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
106      * a time: [i-1, i, i+1, i+2].
107      *
108      * @param prev
109      *            the previous character
110      * @param curr
111      *            the current character
112      * @param next
113      *            the next character
114      * @param aNext
115      *            the after next character
116      * @return a transcoded array of characters, starting from the current position
117      */
118     private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
119         // 1. EV -> AF
120         if (curr == 'E' && next == 'V') {
121             return CHARS_AF;
122         }
123 
124         // A, E, I, O, U -> A
125         if (isVowel(curr)) {
126             return CHARS_A;
127         }
128 
129         // 2. Q -> G, Z -> S, M -> N
130         if (curr == 'Q') {
131             return CHARS_G;
132         } else if (curr == 'Z') {
133             return CHARS_S;
134         } else if (curr == 'M') {
135             return CHARS_N;
136         }
137 
138         // 3. KN -> NN else K -> C
139         if (curr == 'K') {
140             if (next == 'N') {
141                 return CHARS_NN;
142             }
143             return CHARS_C;
144         }
145 
146         // 4. SCH -> SSS
147         if (curr == 'S' && next == 'C' && aNext == 'H') {
148             return CHARS_SSS;
149         }
150 
151         // PH -> FF
152         if (curr == 'P' && next == 'H') {
153             return CHARS_FF;
154         }
155 
156         // 5. H -> If previous or next is a non vowel, previous.
157         if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
158             return new char[] { prev };
159         }
160 
161         // 6. W -> If previous is vowel, previous.
162         if (curr == 'W' && isVowel(prev)) {
163             return new char[] { prev };
164         }
165 
166         return new char[] { curr };
167     }
168 
169     /** Indicates the strict mode. */
170     private final boolean strict;
171 
172     /**
173      * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
174      * i.e. encoded strings have a maximum length of 6.
175      */
176     public Nysiis() {
177         this(true);
178     }
179 
180     /**
181      * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
182      *
183      * <ul>
184      *  <li><code>true</code>: encoded strings have a maximum length of 6</li>
185      *  <li><code>false</code>: encoded strings may have arbitrary length</li>
186      * </ul>
187      *
188      * @param strict
189      *            the strict mode
190      */
191     public Nysiis(final boolean strict) {
192         this.strict = strict;
193     }
194 
195     /**
196      * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
197      * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
198      * {@link String}.
199      *
200      * @param obj
201      *            Object to encode
202      * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
203      * @throws EncoderException
204      *            if the parameter supplied is not of a {@link String}
205      * @throws IllegalArgumentException
206      *            if a character is not mapped
207      */
208     @Override
209     public Object encode(final Object obj) throws EncoderException {
210         if (!(obj instanceof String)) {
211             throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
212         }
213         return this.nysiis((String) obj);
214     }
215 
216     /**
217      * Encodes a String using the NYSIIS algorithm.
218      *
219      * @param str
220      *            A String object to encode
221      * @return A Nysiis code corresponding to the String supplied
222      * @throws IllegalArgumentException
223      *            if a character is not mapped
224      */
225     @Override
226     public String encode(final String str) {
227         return this.nysiis(str);
228     }
229 
230     /**
231      * Indicates the strict mode for this {@link Nysiis} encoder.
232      *
233      * @return <code>true</code> if the encoder is configured for strict mode, <code>false</code> otherwise
234      */
235     public boolean isStrict() {
236         return this.strict;
237     }
238 
239     /**
240      * Retrieves the NYSIIS code for a given String object.
241      *
242      * @param str
243      *            String to encode using the NYSIIS algorithm
244      * @return A NYSIIS code for the String supplied
245      */
246     public String nysiis(String str) {
247         if (str == null) {
248             return null;
249         }
250 
251         // Use the same clean rules as Soundex
252         str = SoundexUtils.clean(str);
253 
254         if (str.length() == 0) {
255             return str;
256         }
257 
258         // Translate first characters of name:
259         // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
260         str = PAT_MAC.matcher(str).replaceFirst("MCC");
261         str = PAT_KN.matcher(str).replaceFirst("NN");
262         str = PAT_K.matcher(str).replaceFirst("C");
263         str = PAT_PH_PF.matcher(str).replaceFirst("FF");
264         str = PAT_SCH.matcher(str).replaceFirst("SSS");
265 
266         // Translate last characters of name:
267         // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
268         str = PAT_EE_IE.matcher(str).replaceFirst("Y");
269         str = PAT_DT_ETC.matcher(str).replaceFirst("D");
270 
271         // First character of key = first character of name.
272         final StringBuilder key = new StringBuilder(str.length());
273         key.append(str.charAt(0));
274 
275         // Transcode remaining characters, incrementing by one character each time
276         final char[] chars = str.toCharArray();
277         final int len = chars.length;
278 
279         for (int i = 1; i < len; i++) {
280             final char next = i < len - 1 ? chars[i + 1] : SPACE;
281             final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
282             final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
283             System.arraycopy(transcoded, 0, chars, i, transcoded.length);
284 
285             // only append the current char to the key if it is different from the last one
286             if (chars[i] != chars[i - 1]) {
287                 key.append(chars[i]);
288             }
289         }
290 
291         if (key.length() > 1) {
292             char lastChar = key.charAt(key.length() - 1);
293 
294             // If last character is S, remove it.
295             if (lastChar == 'S') {
296                 key.deleteCharAt(key.length() - 1);
297                 lastChar = key.charAt(key.length() - 1);
298             }
299 
300             if (key.length() > 2) {
301                 final char last2Char = key.charAt(key.length() - 2);
302                 // If last characters are AY, replace with Y.
303                 if (last2Char == 'A' && lastChar == 'Y') {
304                     key.deleteCharAt(key.length() - 2);
305                 }
306             }
307 
308             // If last character is A, remove it.
309             if (lastChar == 'A') {
310                 key.deleteCharAt(key.length() - 1);
311             }
312         }
313 
314         final String string = key.toString();
315         return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
316     }
317 
318 }