1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import java.util.Locale;
21
22 import org.apache.commons.codec.EncoderException;
23 import org.apache.commons.codec.StringEncoder;
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57 public class Metaphone implements StringEncoder {
58
59
60
61
62 private static final String VOWELS = "AEIOU";
63
64
65
66
67 private static final String FRONTV = "EIY";
68
69
70
71
72 private static final String VARSON = "CSPTG";
73
74
75
76
77 private int maxCodeLen = 4;
78
79
80
81
82 public Metaphone() {
83
84 }
85
86
87
88
89
90
91
92
93
94 @Override
95 public Object encode(final Object obj) throws EncoderException {
96 if (!(obj instanceof String)) {
97 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
98 }
99 return metaphone((String) obj);
100 }
101
102
103
104
105
106
107
108 @Override
109 public String encode(final String str) {
110 return metaphone(str);
111 }
112
113
114
115
116
117
118 public int getMaxCodeLen() {
119 return this.maxCodeLen;
120 }
121
122 private boolean isLastChar(final int wdsz, final int n) {
123 return n + 1 == wdsz;
124 }
125
126
127
128
129
130
131
132
133 public boolean isMetaphoneEqual(final String str1, final String str2) {
134 return metaphone(str1).equals(metaphone(str2));
135 }
136
137 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
138 boolean matches = false;
139 if (index >= 0 && index < string.length() - 1) {
140 matches = string.charAt(index + 1) == c;
141 }
142 return matches;
143 }
144
145 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
146 boolean matches = false;
147 if (index > 0 && index < string.length()) {
148 matches = string.charAt(index - 1) == c;
149 }
150 return matches;
151 }
152
153 private boolean isVowel(final StringBuilder string, final int index) {
154 return VOWELS.indexOf(string.charAt(index)) >= 0;
155 }
156
157
158
159
160
161
162
163
164
165
166
167 public String metaphone(final String txt) {
168 boolean hard = false;
169 final int txtLength;
170 if (txt == null || (txtLength = txt.length()) == 0) {
171 return "";
172 }
173
174 if (txtLength == 1) {
175 return txt.toUpperCase(Locale.ENGLISH);
176 }
177
178 final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray();
179
180 final StringBuilder local = new StringBuilder(40);
181 final StringBuilder code = new StringBuilder(10);
182
183 switch (inwd[0]) {
184 case 'K':
185 case 'G':
186 case 'P':
187 if (inwd[1] == 'N') {
188 local.append(inwd, 1, inwd.length - 1);
189 } else {
190 local.append(inwd);
191 }
192 break;
193 case 'A':
194 if (inwd[1] == 'E') {
195 local.append(inwd, 1, inwd.length - 1);
196 } else {
197 local.append(inwd);
198 }
199 break;
200 case 'W':
201 if (inwd[1] == 'R') {
202 local.append(inwd, 1, inwd.length - 1);
203 break;
204 }
205 if (inwd[1] == 'H') {
206 local.append(inwd, 1, inwd.length - 1);
207 local.setCharAt(0, 'W');
208 } else {
209 local.append(inwd);
210 }
211 break;
212 case 'X':
213 inwd[0] = 'S';
214 local.append(inwd);
215 break;
216 default:
217 local.append(inwd);
218 }
219
220 final int wdsz = local.length();
221 int n = 0;
222
223 while (code.length() < getMaxCodeLen() && n < wdsz) {
224 final char symb = local.charAt(n);
225
226 if (symb == 'C' || !isPreviousChar(local, n, symb)) {
227
228 switch (symb) {
229 case 'A':
230 case 'E':
231 case 'I':
232 case 'O':
233 case 'U':
234 if (n == 0) {
235 code.append(symb);
236 }
237 break;
238 case 'B':
239 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) {
240 break;
241 }
242 code.append(symb);
243 break;
244 case 'C':
245
246 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247 break;
248 }
249 if (regionMatch(local, n, "CIA")) {
250 code.append('X');
251 break;
252 }
253 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
254 code.append('S');
255 break;
256 }
257 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) {
258 code.append('K');
259 break;
260 }
261 if (!isNextChar(local, n, 'H') || n == 0 && wdsz >= 3 && isVowel(local, 2)) {
262 code.append('K');
263 } else {
264 code.append('X');
265 }
266 break;
267 case 'D':
268 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) {
269 code.append('J');
270 n += 2;
271 } else {
272 code.append('T');
273 }
274 break;
275 case 'G':
276 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
277 break;
278 }
279 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
280 break;
281 }
282 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
283 break;
284 }
285
286 hard = isPreviousChar(local, n, 'G');
287 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
288 code.append('J');
289 } else {
290 code.append('K');
291 }
292 break;
293 case 'H':
294 if (isLastChar(wdsz, n)) {
295 break;
296 }
297 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
298 break;
299 }
300 if (isVowel(local, n + 1)) {
301 code.append('H');
302 }
303 break;
304 case 'F':
305 case 'J':
306 case 'L':
307 case 'M':
308 case 'N':
309 case 'R':
310 code.append(symb);
311 break;
312 case 'K':
313 if (n > 0) {
314 if (!isPreviousChar(local, n, 'C')) {
315 code.append(symb);
316 }
317 } else {
318 code.append(symb);
319 }
320 break;
321 case 'P':
322 if (isNextChar(local, n, 'H')) {
323
324 code.append('F');
325 } else {
326 code.append(symb);
327 }
328 break;
329 case 'Q':
330 code.append('K');
331 break;
332 case 'S':
333 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
334 code.append('X');
335 } else {
336 code.append('S');
337 }
338 break;
339 case 'T':
340 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
341 code.append('X');
342 break;
343 }
344 if (regionMatch(local, n, "TCH")) {
345
346 break;
347 }
348
349 if (regionMatch(local, n, "TH")) {
350 code.append('0');
351 } else {
352 code.append('T');
353 }
354 break;
355 case 'V':
356 code.append('F');
357 break;
358 case 'W':
359 case 'Y':
360 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
361 code.append(symb);
362 }
363 break;
364 case 'X':
365 code.append('K');
366 code.append('S');
367 break;
368 case 'Z':
369 code.append('S');
370 break;
371 default:
372
373 break;
374 }
375 }
376 n++;
377 if (code.length() > getMaxCodeLen()) {
378 code.setLength(getMaxCodeLen());
379 }
380 }
381 return code.toString();
382 }
383
384 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
385 boolean matches = false;
386 if (index >= 0 && index + test.length() - 1 < string.length()) {
387 final String substring = string.substring(index, index + test.length());
388 matches = substring.equals(test);
389 }
390 return matches;
391 }
392
393
394
395
396
397
398 public void setMaxCodeLen(final int maxCodeLen) {
399 this.maxCodeLen = maxCodeLen;
400 }
401
402 }