1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53 public class Metaphone implements StringEncoder {
54
55
56
57
58 private static final String VOWELS = "AEIOU";
59
60
61
62
63 private static final String FRONTV = "EIY";
64
65
66
67
68 private static final String VARSON = "CSPTG";
69
70
71
72
73 private int maxCodeLen = 4;
74
75
76
77
78 public Metaphone() {
79 super();
80 }
81
82
83
84
85
86
87
88
89
90
91
92 public String metaphone(final String txt) {
93 boolean hard = false;
94 int txtLength;
95 if (txt == null || (txtLength = txt.length()) == 0) {
96 return "";
97 }
98
99 if (txtLength == 1) {
100 return txt.toUpperCase(java.util.Locale.ENGLISH);
101 }
102
103 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
104
105 final StringBuilder local = new StringBuilder(40);
106 final StringBuilder code = new StringBuilder(10);
107
108 switch(inwd[0]) {
109 case 'K':
110 case 'G':
111 case 'P':
112 if (inwd[1] == 'N') {
113 local.append(inwd, 1, inwd.length - 1);
114 } else {
115 local.append(inwd);
116 }
117 break;
118 case 'A':
119 if (inwd[1] == 'E') {
120 local.append(inwd, 1, inwd.length - 1);
121 } else {
122 local.append(inwd);
123 }
124 break;
125 case 'W':
126 if (inwd[1] == 'R') {
127 local.append(inwd, 1, inwd.length - 1);
128 break;
129 }
130 if (inwd[1] == 'H') {
131 local.append(inwd, 1, inwd.length - 1);
132 local.setCharAt(0, 'W');
133 } else {
134 local.append(inwd);
135 }
136 break;
137 case 'X':
138 inwd[0] = 'S';
139 local.append(inwd);
140 break;
141 default:
142 local.append(inwd);
143 }
144
145 final int wdsz = local.length();
146 int n = 0;
147
148 while (code.length() < this.getMaxCodeLen() &&
149 n < wdsz ) {
150 final char symb = local.charAt(n);
151
152 if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
153 n++;
154 } else {
155 switch(symb) {
156 case 'A':
157 case 'E':
158 case 'I':
159 case 'O':
160 case 'U':
161 if (n == 0) {
162 code.append(symb);
163 }
164 break;
165 case 'B':
166 if ( isPreviousChar(local, n, 'M') &&
167 isLastChar(wdsz, n) ) {
168 break;
169 }
170 code.append(symb);
171 break;
172 case 'C':
173
174 if ( isPreviousChar(local, n, 'S') &&
175 !isLastChar(wdsz, n) &&
176 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
177 break;
178 }
179 if (regionMatch(local, n, "CIA")) {
180 code.append('X');
181 break;
182 }
183 if (!isLastChar(wdsz, n) &&
184 FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
185 code.append('S');
186 break;
187 }
188 if (isPreviousChar(local, n, 'S') &&
189 isNextChar(local, n, 'H') ) {
190 code.append('K');
191 break;
192 }
193 if (isNextChar(local, n, 'H')) {
194 if (n == 0 &&
195 wdsz >= 3 &&
196 isVowel(local,2) ) {
197 code.append('K');
198 } else {
199 code.append('X');
200 }
201 } else {
202 code.append('K');
203 }
204 break;
205 case 'D':
206 if (!isLastChar(wdsz, n + 1) &&
207 isNextChar(local, n, 'G') &&
208 FRONTV.indexOf(local.charAt(n + 2)) >= 0) {
209 code.append('J'); n += 2;
210 } else {
211 code.append('T');
212 }
213 break;
214 case 'G':
215 if (isLastChar(wdsz, n + 1) &&
216 isNextChar(local, n, 'H')) {
217 break;
218 }
219 if (!isLastChar(wdsz, n + 1) &&
220 isNextChar(local,n,'H') &&
221 !isVowel(local,n+2)) {
222 break;
223 }
224 if (n > 0 &&
225 ( regionMatch(local, n, "GN") ||
226 regionMatch(local, n, "GNED") ) ) {
227 break;
228 }
229 if (isPreviousChar(local, n, 'G')) {
230
231 hard = true;
232 } else {
233 hard = false;
234 }
235 if (!isLastChar(wdsz, n) &&
236 FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
237 !hard) {
238 code.append('J');
239 } else {
240 code.append('K');
241 }
242 break;
243 case 'H':
244 if (isLastChar(wdsz, n)) {
245 break;
246 }
247 if (n > 0 &&
248 VARSON.indexOf(local.charAt(n - 1)) >= 0) {
249 break;
250 }
251 if (isVowel(local,n+1)) {
252 code.append('H');
253 }
254 break;
255 case 'F':
256 case 'J':
257 case 'L':
258 case 'M':
259 case 'N':
260 case 'R':
261 code.append(symb);
262 break;
263 case 'K':
264 if (n > 0) {
265 if (!isPreviousChar(local, n, 'C')) {
266 code.append(symb);
267 }
268 } else {
269 code.append(symb);
270 }
271 break;
272 case 'P':
273 if (isNextChar(local,n,'H')) {
274
275 code.append('F');
276 } else {
277 code.append(symb);
278 }
279 break;
280 case 'Q':
281 code.append('K');
282 break;
283 case 'S':
284 if (regionMatch(local,n,"SH") ||
285 regionMatch(local,n,"SIO") ||
286 regionMatch(local,n,"SIA")) {
287 code.append('X');
288 } else {
289 code.append('S');
290 }
291 break;
292 case 'T':
293 if (regionMatch(local,n,"TIA") ||
294 regionMatch(local,n,"TIO")) {
295 code.append('X');
296 break;
297 }
298 if (regionMatch(local,n,"TCH")) {
299
300 break;
301 }
302
303 if (regionMatch(local,n,"TH")) {
304 code.append('0');
305 } else {
306 code.append('T');
307 }
308 break;
309 case 'V':
310 code.append('F'); break;
311 case 'W':
312 case 'Y':
313 if (!isLastChar(wdsz,n) &&
314 isVowel(local,n+1)) {
315 code.append(symb);
316 }
317 break;
318 case 'X':
319 code.append('K');
320 code.append('S');
321 break;
322 case 'Z':
323 code.append('S');
324 break;
325 default:
326
327 break;
328 }
329 n++;
330 }
331 if (code.length() > this.getMaxCodeLen()) {
332 code.setLength(this.getMaxCodeLen());
333 }
334 }
335 return code.toString();
336 }
337
338 private boolean isVowel(final StringBuilder string, final int index) {
339 return VOWELS.indexOf(string.charAt(index)) >= 0;
340 }
341
342 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
343 boolean matches = false;
344 if( index > 0 &&
345 index < string.length() ) {
346 matches = string.charAt(index - 1) == c;
347 }
348 return matches;
349 }
350
351 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
352 boolean matches = false;
353 if( index >= 0 &&
354 index < string.length() - 1 ) {
355 matches = string.charAt(index + 1) == c;
356 }
357 return matches;
358 }
359
360 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
361 boolean matches = false;
362 if( index >= 0 &&
363 index + test.length() - 1 < string.length() ) {
364 final String substring = string.substring( index, index + test.length());
365 matches = substring.equals( test );
366 }
367 return matches;
368 }
369
370 private boolean isLastChar(final int wdsz, final int n) {
371 return n + 1 == wdsz;
372 }
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387 @Override
388 public Object encode(final Object obj) throws EncoderException {
389 if (!(obj instanceof String)) {
390 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
391 }
392 return metaphone((String) obj);
393 }
394
395
396
397
398
399
400
401 @Override
402 public String encode(final String str) {
403 return metaphone(str);
404 }
405
406
407
408
409
410
411
412
413
414 public boolean isMetaphoneEqual(final String str1, final String str2) {
415 return metaphone(str1).equals(metaphone(str2));
416 }
417
418
419
420
421
422 public int getMaxCodeLen() { return this.maxCodeLen; }
423
424
425
426
427
428 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
429
430 }