1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import java.util.regex.Pattern;
21
22 import org.apache.commons.codec.EncoderException;
23 import org.apache.commons.codec.StringEncoder;
24
25 /**
26 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
27 * general purpose scheme to find word with similar phonemes.
28 * <p>
29 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
30 * </p>
31 * <p>
32 * Algorithm description:
33 * </p>
34 * <pre>
35 * 1. Transcode first characters of name
36 * 1a. MAC -> MCC
37 * 1b. KN -> NN
38 * 1c. K -> C
39 * 1d. PH -> FF
40 * 1e. PF -> FF
41 * 1f. SCH -> SSS
42 * 2. Transcode last characters of name
43 * 2a. EE, IE -> Y
44 * 2b. DT,RT,RD,NT,ND -> D
45 * 3. First character of key = first character of name
46 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
47 * 4a. EV -> AF else A,E,I,O,U -> A
48 * 4b. Q -> G
49 * 4c. Z -> S
50 * 4d. M -> N
51 * 4e. KN -> N else K -> C
52 * 4f. SCH -> SSS
53 * 4g. PH -> FF
54 * 4h. H -> If previous or next is non-vowel, previous
55 * 4i. W -> If previous is vowel, previous
56 * 4j. Add current to key if current != last key character
57 * 5. If last character is S, remove it
58 * 6. If last characters are AY, replace with Y
59 * 7. If last character is A, remove it
60 * 8. Collapse all strings of repeated characters
61 * 9. Add original first character of name as first character of key
62 * </pre>
63 * <p>
64 * This class is immutable and thread-safe.
65 * </p>
66 *
67 * @see <a href="https://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
68 * @see <a href="https://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
69 * @see Soundex
70 * @since 1.7
71 */
72 public class Nysiis implements StringEncoder {
73
74 private static final char[] CHARS_A = { 'A' };
75 private static final char[] CHARS_AF = { 'A', 'F' };
76 private static final char[] CHARS_C = { 'C' };
77 private static final char[] CHARS_FF = { 'F', 'F' };
78 private static final char[] CHARS_G = { 'G' };
79 private static final char[] CHARS_N = { 'N' };
80 private static final char[] CHARS_NN = { 'N', 'N' };
81 private static final char[] CHARS_S = { 'S' };
82 private static final char[] CHARS_SSS = { 'S', 'S', 'S' };
83
84 private static final Pattern PAT_MAC = Pattern.compile("^MAC");
85 private static final Pattern PAT_KN = Pattern.compile("^KN");
86 private static final Pattern PAT_K = Pattern.compile("^K");
87 private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
88 private static final Pattern PAT_SCH = Pattern.compile("^SCH");
89 private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
90 private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
91
92 private static final char SPACE = ' ';
93 private static final int TRUE_LENGTH = 6;
94
95 /**
96 * Tests if the given character is a vowel.
97 *
98 * @param c
99 * the character to test.
100 * @return {@code true} if the character is a vowel, {@code false} otherwise.
101 */
102 private static boolean isVowel(final char c) {
103 return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
104 }
105
106 /**
107 * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
108 * a time: [i-1, i, i+1, i+2].
109 *
110 * @param prev
111 * the previous character.
112 * @param curr
113 * the current character.
114 * @param next
115 * the next character.
116 * @param aNext
117 * the after next character.
118 * @return a transcoded array of characters, starting from the current position.
119 */
120 private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
121 // 1. EV -> AF
122 if (curr == 'E' && next == 'V') {
123 return CHARS_AF;
124 }
125
126 // A, E, I, O, U -> A
127 if (isVowel(curr)) {
128 return CHARS_A;
129 }
130
131 // 2. Q -> G, Z -> S, M -> N
132
133 // 3. KN -> NN else K -> C
134 switch (curr) {
135 case 'Q':
136 return CHARS_G;
137 case 'Z':
138 return CHARS_S;
139 case 'M':
140 return CHARS_N;
141 case 'K':
142 if (next == 'N') {
143 return CHARS_NN;
144 }
145 return CHARS_C;
146 default:
147 break;
148 }
149
150 // 4. SCH -> SSS
151 if (curr == 'S' && next == 'C' && aNext == 'H') {
152 return CHARS_SSS;
153 }
154
155 // PH -> FF
156 if (curr == 'P' && next == 'H') {
157 return CHARS_FF;
158 }
159
160 // 5. H -> If previous or next is a non vowel, previous.
161 if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
162 return new char[] { prev };
163 }
164
165 // 6. W -> If previous is vowel, previous.
166 if (curr == 'W' && isVowel(prev)) {
167 return new char[] { prev };
168 }
169
170 return new char[] { curr };
171 }
172
173 /** Indicates the strict mode. */
174 private final boolean strict;
175
176 /**
177 * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
178 * i.e. encoded strings have a maximum length of 6.
179 */
180 public Nysiis() {
181 this(true);
182 }
183
184 /**
185 * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
186 *
187 * <ul>
188 * <li>{@code true}: encoded strings have a maximum length of 6</li>
189 * <li>{@code false}: encoded strings may have arbitrary length</li>
190 * </ul>
191 *
192 * @param strict
193 * the strict mode.
194 */
195 public Nysiis(final boolean strict) {
196 this.strict = strict;
197 }
198
199 /**
200 * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
201 * {@link EncoderException} if the supplied object is not of type {@link String}.
202 *
203 * @param obj Object to encode.
204 * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
205 * @throws EncoderException if the parameter supplied is not of a {@link String}.
206 * @throws IllegalArgumentException if a character is not mapped.
207 */
208 @Override
209 public Object encode(final Object obj) throws EncoderException {
210 if (!(obj instanceof String)) {
211 throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
212 }
213 return nysiis((String) obj);
214 }
215
216 /**
217 * Encodes a String using the NYSIIS algorithm.
218 *
219 * @param str A String object to encode.
220 * @return A Nysiis code corresponding to the String supplied.
221 * @throws IllegalArgumentException if a character is not mapped.
222 */
223 @Override
224 public String encode(final String str) {
225 return nysiis(str);
226 }
227
228 /**
229 * Indicates the strict mode for this {@link Nysiis} encoder.
230 *
231 * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
232 */
233 public boolean isStrict() {
234 return this.strict;
235 }
236
237 /**
238 * Retrieves the NYSIIS code for a given String object.
239 *
240 * @param str String to encode using the NYSIIS algorithm.
241 * @return A NYSIIS code for the String supplied.
242 */
243 public String nysiis(String str) {
244 if (str == null) {
245 return null;
246 }
247
248 // Use the same clean rules as Soundex
249 str = SoundexUtils.clean(str);
250
251 if (str.isEmpty()) {
252 return str;
253 }
254
255 // Translate first characters of name:
256 // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
257 str = PAT_MAC.matcher(str).replaceFirst("MCC");
258 str = PAT_KN.matcher(str).replaceFirst("NN");
259 str = PAT_K.matcher(str).replaceFirst("C");
260 str = PAT_PH_PF.matcher(str).replaceFirst("FF");
261 str = PAT_SCH.matcher(str).replaceFirst("SSS");
262
263 // Translate last characters of name:
264 // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
265 str = PAT_EE_IE.matcher(str).replaceFirst("Y");
266 str = PAT_DT_ETC.matcher(str).replaceFirst("D");
267
268 // First character of key = first character of name.
269 final StringBuilder key = new StringBuilder(str.length());
270 key.append(str.charAt(0));
271
272 // Transcode remaining characters, incrementing by one character each time
273 final char[] chars = str.toCharArray();
274 final int len = chars.length;
275
276 for (int i = 1; i < len; i++) {
277 final char next = i < len - 1 ? chars[i + 1] : SPACE;
278 final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
279 final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
280 System.arraycopy(transcoded, 0, chars, i, transcoded.length);
281
282 // only append the current char to the key if it is different from the last one
283 if (chars[i] != chars[i - 1]) {
284 key.append(chars[i]);
285 }
286 }
287
288 if (key.length() > 1) {
289 char lastChar = key.charAt(key.length() - 1);
290
291 // If last character is S, remove it.
292 if (lastChar == 'S') {
293 key.deleteCharAt(key.length() - 1);
294 lastChar = key.charAt(key.length() - 1);
295 }
296
297 if (key.length() > 2) {
298 final char last2Char = key.charAt(key.length() - 2);
299 // If last characters are AY, replace with Y.
300 if (last2Char == 'A' && lastChar == 'Y') {
301 key.deleteCharAt(key.length() - 2);
302 }
303 }
304
305 // If last character is A, remove it.
306 if (lastChar == 'A') {
307 key.deleteCharAt(key.length() - 1);
308 }
309 }
310
311 final String string = key.toString();
312 return isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
313 }
314
315 }