1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import java.util.regex.Pattern;
21
22 import org.apache.commons.codec.EncoderException;
23 import org.apache.commons.codec.StringEncoder;
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public class Nysiis implements StringEncoder {
71
72 private static final char[] CHARS_A = new char[] { 'A' };
73 private static final char[] CHARS_AF = new char[] { 'A', 'F' };
74 private static final char[] CHARS_C = new char[] { 'C' };
75 private static final char[] CHARS_FF = new char[] { 'F', 'F' };
76 private static final char[] CHARS_G = new char[] { 'G' };
77 private static final char[] CHARS_N = new char[] { 'N' };
78 private static final char[] CHARS_NN = new char[] { 'N', 'N' };
79 private static final char[] CHARS_S = new char[] { 'S' };
80 private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
81
82 private static final Pattern PAT_MAC = Pattern.compile("^MAC");
83 private static final Pattern PAT_KN = Pattern.compile("^KN");
84 private static final Pattern PAT_K = Pattern.compile("^K");
85 private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
86 private static final Pattern PAT_SCH = Pattern.compile("^SCH");
87 private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
88 private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
89
90 private static final char SPACE = ' ';
91 private static final int TRUE_LENGTH = 6;
92
93
94
95
96
97
98
99
100 private static boolean isVowel(final char c) {
101 return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
102 }
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118 private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
119
120 if (curr == 'E' && next == 'V') {
121 return CHARS_AF;
122 }
123
124
125 if (isVowel(curr)) {
126 return CHARS_A;
127 }
128
129
130 if (curr == 'Q') {
131 return CHARS_G;
132 } else if (curr == 'Z') {
133 return CHARS_S;
134 } else if (curr == 'M') {
135 return CHARS_N;
136 }
137
138
139 if (curr == 'K') {
140 if (next == 'N') {
141 return CHARS_NN;
142 }
143 return CHARS_C;
144 }
145
146
147 if (curr == 'S' && next == 'C' && aNext == 'H') {
148 return CHARS_SSS;
149 }
150
151
152 if (curr == 'P' && next == 'H') {
153 return CHARS_FF;
154 }
155
156
157 if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
158 return new char[] { prev };
159 }
160
161
162 if (curr == 'W' && isVowel(prev)) {
163 return new char[] { prev };
164 }
165
166 return new char[] { curr };
167 }
168
169
170 private final boolean strict;
171
172
173
174
175
176 public Nysiis() {
177 this(true);
178 }
179
180
181
182
183
184
185
186
187
188
189
190
191 public Nysiis(final boolean strict) {
192 this.strict = strict;
193 }
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208 @Override
209 public Object encode(final Object obj) throws EncoderException {
210 if (!(obj instanceof String)) {
211 throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
212 }
213 return this.nysiis((String) obj);
214 }
215
216
217
218
219
220
221
222
223
224
225 @Override
226 public String encode(final String str) {
227 return this.nysiis(str);
228 }
229
230
231
232
233
234
235 public boolean isStrict() {
236 return this.strict;
237 }
238
239
240
241
242
243
244
245
246 public String nysiis(String str) {
247 if (str == null) {
248 return null;
249 }
250
251
252 str = SoundexUtils.clean(str);
253
254 if (str.length() == 0) {
255 return str;
256 }
257
258
259
260 str = PAT_MAC.matcher(str).replaceFirst("MCC");
261 str = PAT_KN.matcher(str).replaceFirst("NN");
262 str = PAT_K.matcher(str).replaceFirst("C");
263 str = PAT_PH_PF.matcher(str).replaceFirst("FF");
264 str = PAT_SCH.matcher(str).replaceFirst("SSS");
265
266
267
268 str = PAT_EE_IE.matcher(str).replaceFirst("Y");
269 str = PAT_DT_ETC.matcher(str).replaceFirst("D");
270
271
272 final StringBuilder key = new StringBuilder(str.length());
273 key.append(str.charAt(0));
274
275
276 final char[] chars = str.toCharArray();
277 final int len = chars.length;
278
279 for (int i = 1; i < len; i++) {
280 final char next = i < len - 1 ? chars[i + 1] : SPACE;
281 final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
282 final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
283 System.arraycopy(transcoded, 0, chars, i, transcoded.length);
284
285
286 if (chars[i] != chars[i - 1]) {
287 key.append(chars[i]);
288 }
289 }
290
291 if (key.length() > 1) {
292 char lastChar = key.charAt(key.length() - 1);
293
294
295 if (lastChar == 'S') {
296 key.deleteCharAt(key.length() - 1);
297 lastChar = key.charAt(key.length() - 1);
298 }
299
300 if (key.length() > 2) {
301 final char last2Char = key.charAt(key.length() - 2);
302
303 if (last2Char == 'A' && lastChar == 'Y') {
304 key.deleteCharAt(key.length() - 2);
305 }
306 }
307
308
309 if (lastChar == 'A') {
310 key.deleteCharAt(key.length() - 1);
311 }
312 }
313
314 final String string = key.toString();
315 return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
316 }
317
318 }