1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53 public class Metaphone implements StringEncoder {
54
55
56
57
58 private static final String VOWELS = "AEIOU";
59
60
61
62
63 private static final String FRONTV = "EIY";
64
65
66
67
68 private static final String VARSON = "CSPTG";
69
70
71
72
73 private int maxCodeLen = 4;
74
75
76
77
78 public Metaphone() {
79 super();
80 }
81
82
83
84
85
86
87
88
89
90
91
92 public String metaphone(final String txt) {
93 boolean hard = false;
94 if (txt == null || txt.length() == 0) {
95 return "";
96 }
97
98 if (txt.length() == 1) {
99 return txt.toUpperCase(java.util.Locale.ENGLISH);
100 }
101
102 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
103
104 final StringBuilder local = new StringBuilder(40);
105 final StringBuilder code = new StringBuilder(10);
106
107 switch(inwd[0]) {
108 case 'K':
109 case 'G':
110 case 'P':
111 if (inwd[1] == 'N') {
112 local.append(inwd, 1, inwd.length - 1);
113 } else {
114 local.append(inwd);
115 }
116 break;
117 case 'A':
118 if (inwd[1] == 'E') {
119 local.append(inwd, 1, inwd.length - 1);
120 } else {
121 local.append(inwd);
122 }
123 break;
124 case 'W':
125 if (inwd[1] == 'R') {
126 local.append(inwd, 1, inwd.length - 1);
127 break;
128 }
129 if (inwd[1] == 'H') {
130 local.append(inwd, 1, inwd.length - 1);
131 local.setCharAt(0, 'W');
132 } else {
133 local.append(inwd);
134 }
135 break;
136 case 'X':
137 inwd[0] = 'S';
138 local.append(inwd);
139 break;
140 default:
141 local.append(inwd);
142 }
143
144 final int wdsz = local.length();
145 int n = 0;
146
147 while (code.length() < this.getMaxCodeLen() &&
148 n < wdsz ) {
149 final char symb = local.charAt(n);
150
151 if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
152 n++;
153 } else {
154 switch(symb) {
155 case 'A':
156 case 'E':
157 case 'I':
158 case 'O':
159 case 'U':
160 if (n == 0) {
161 code.append(symb);
162 }
163 break;
164 case 'B':
165 if ( isPreviousChar(local, n, 'M') &&
166 isLastChar(wdsz, n) ) {
167 break;
168 }
169 code.append(symb);
170 break;
171 case 'C':
172
173 if ( isPreviousChar(local, n, 'S') &&
174 !isLastChar(wdsz, n) &&
175 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
176 break;
177 }
178 if (regionMatch(local, n, "CIA")) {
179 code.append('X');
180 break;
181 }
182 if (!isLastChar(wdsz, n) &&
183 FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
184 code.append('S');
185 break;
186 }
187 if (isPreviousChar(local, n, 'S') &&
188 isNextChar(local, n, 'H') ) {
189 code.append('K');
190 break;
191 }
192 if (isNextChar(local, n, 'H')) {
193 if (n == 0 &&
194 wdsz >= 3 &&
195 isVowel(local,2) ) {
196 code.append('K');
197 } else {
198 code.append('X');
199 }
200 } else {
201 code.append('K');
202 }
203 break;
204 case 'D':
205 if (!isLastChar(wdsz, n + 1) &&
206 isNextChar(local, n, 'G') &&
207 FRONTV.indexOf(local.charAt(n + 2)) >= 0) {
208 code.append('J'); n += 2;
209 } else {
210 code.append('T');
211 }
212 break;
213 case 'G':
214 if (isLastChar(wdsz, n + 1) &&
215 isNextChar(local, n, 'H')) {
216 break;
217 }
218 if (!isLastChar(wdsz, n + 1) &&
219 isNextChar(local,n,'H') &&
220 !isVowel(local,n+2)) {
221 break;
222 }
223 if (n > 0 &&
224 ( regionMatch(local, n, "GN") ||
225 regionMatch(local, n, "GNED") ) ) {
226 break;
227 }
228 if (isPreviousChar(local, n, 'G')) {
229
230 hard = true;
231 } else {
232 hard = false;
233 }
234 if (!isLastChar(wdsz, n) &&
235 FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
236 !hard) {
237 code.append('J');
238 } else {
239 code.append('K');
240 }
241 break;
242 case 'H':
243 if (isLastChar(wdsz, n)) {
244 break;
245 }
246 if (n > 0 &&
247 VARSON.indexOf(local.charAt(n - 1)) >= 0) {
248 break;
249 }
250 if (isVowel(local,n+1)) {
251 code.append('H');
252 }
253 break;
254 case 'F':
255 case 'J':
256 case 'L':
257 case 'M':
258 case 'N':
259 case 'R':
260 code.append(symb);
261 break;
262 case 'K':
263 if (n > 0) {
264 if (!isPreviousChar(local, n, 'C')) {
265 code.append(symb);
266 }
267 } else {
268 code.append(symb);
269 }
270 break;
271 case 'P':
272 if (isNextChar(local,n,'H')) {
273
274 code.append('F');
275 } else {
276 code.append(symb);
277 }
278 break;
279 case 'Q':
280 code.append('K');
281 break;
282 case 'S':
283 if (regionMatch(local,n,"SH") ||
284 regionMatch(local,n,"SIO") ||
285 regionMatch(local,n,"SIA")) {
286 code.append('X');
287 } else {
288 code.append('S');
289 }
290 break;
291 case 'T':
292 if (regionMatch(local,n,"TIA") ||
293 regionMatch(local,n,"TIO")) {
294 code.append('X');
295 break;
296 }
297 if (regionMatch(local,n,"TCH")) {
298
299 break;
300 }
301
302 if (regionMatch(local,n,"TH")) {
303 code.append('0');
304 } else {
305 code.append('T');
306 }
307 break;
308 case 'V':
309 code.append('F'); break;
310 case 'W':
311 case 'Y':
312 if (!isLastChar(wdsz,n) &&
313 isVowel(local,n+1)) {
314 code.append(symb);
315 }
316 break;
317 case 'X':
318 code.append('K');
319 code.append('S');
320 break;
321 case 'Z':
322 code.append('S');
323 break;
324 default:
325
326 break;
327 }
328 n++;
329 }
330 if (code.length() > this.getMaxCodeLen()) {
331 code.setLength(this.getMaxCodeLen());
332 }
333 }
334 return code.toString();
335 }
336
337 private boolean isVowel(final StringBuilder string, final int index) {
338 return VOWELS.indexOf(string.charAt(index)) >= 0;
339 }
340
341 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
342 boolean matches = false;
343 if( index > 0 &&
344 index < string.length() ) {
345 matches = string.charAt(index - 1) == c;
346 }
347 return matches;
348 }
349
350 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
351 boolean matches = false;
352 if( index >= 0 &&
353 index < string.length() - 1 ) {
354 matches = string.charAt(index + 1) == c;
355 }
356 return matches;
357 }
358
359 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
360 boolean matches = false;
361 if( index >= 0 &&
362 index + test.length() - 1 < string.length() ) {
363 final String substring = string.substring( index, index + test.length());
364 matches = substring.equals( test );
365 }
366 return matches;
367 }
368
369 private boolean isLastChar(final int wdsz, final int n) {
370 return n + 1 == wdsz;
371 }
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386 @Override
387 public Object encode(final Object obj) throws EncoderException {
388 if (!(obj instanceof String)) {
389 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
390 }
391 return metaphone((String) obj);
392 }
393
394
395
396
397
398
399
400 @Override
401 public String encode(final String str) {
402 return metaphone(str);
403 }
404
405
406
407
408
409
410
411
412
413 public boolean isMetaphoneEqual(final String str1, final String str2) {
414 return metaphone(str1).equals(metaphone(str2));
415 }
416
417
418
419
420
421 public int getMaxCodeLen() { return this.maxCodeLen; }
422
423
424
425
426
427 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
428
429 }