001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language;
019
020 import org.apache.commons.codec.EncoderException;
021 import org.apache.commons.codec.StringEncoder;
022
023 /**
024 * Encodes a string into a double metaphone value. This Implementation is based on the algorithm by <CITE>Lawrence
025 * Philips</CITE>.
026 * <p>
027 * This class is conditionally thread-safe. The instance field {@link #maxCodeLen} is mutable
028 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is
029 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
030 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
031 *
032 * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
033 * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
034 *
035 * @version $Id: DoubleMetaphone.DoubleMetaphoneResult.html 889935 2013-12-11 05:05:13Z ggregory $
036 */
037 public class DoubleMetaphone implements StringEncoder {
038
039 /**
040 * "Vowels" to test for
041 */
042 private static final String VOWELS = "AEIOUY";
043
044 /**
045 * Prefixes when present which are not pronounced
046 */
047 private static final String[] SILENT_START =
048 { "GN", "KN", "PN", "WR", "PS" };
049 private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
050 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
051 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
052 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
053 private static final String[] L_T_K_S_N_M_B_Z =
054 { "L", "T", "K", "S", "N", "M", "B", "Z" };
055
056 /**
057 * Maximum length of an encoding, default is 4
058 */
059 private int maxCodeLen = 4;
060
061 /**
062 * Creates an instance of this DoubleMetaphone encoder
063 */
064 public DoubleMetaphone() {
065 super();
066 }
067
068 /**
069 * Encode a value with Double Metaphone.
070 *
071 * @param value String to encode
072 * @return an encoded string
073 */
074 public String doubleMetaphone(String value) {
075 return doubleMetaphone(value, false);
076 }
077
078 /**
079 * Encode a value with Double Metaphone, optionally using the alternate encoding.
080 *
081 * @param value String to encode
082 * @param alternate use alternate encode
083 * @return an encoded string
084 */
085 public String doubleMetaphone(String value, boolean alternate) {
086 value = cleanInput(value);
087 if (value == null) {
088 return null;
089 }
090
091 boolean slavoGermanic = isSlavoGermanic(value);
092 int index = isSilentStart(value) ? 1 : 0;
093
094 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
095
096 while (!result.isComplete() && index <= value.length() - 1) {
097 switch (value.charAt(index)) {
098 case 'A':
099 case 'E':
100 case 'I':
101 case 'O':
102 case 'U':
103 case 'Y':
104 index = handleAEIOUY(result, index);
105 break;
106 case 'B':
107 result.append('P');
108 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
109 break;
110 case '\u00C7':
111 // A C with a Cedilla
112 result.append('S');
113 index++;
114 break;
115 case 'C':
116 index = handleC(value, result, index);
117 break;
118 case 'D':
119 index = handleD(value, result, index);
120 break;
121 case 'F':
122 result.append('F');
123 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
124 break;
125 case 'G':
126 index = handleG(value, result, index, slavoGermanic);
127 break;
128 case 'H':
129 index = handleH(value, result, index);
130 break;
131 case 'J':
132 index = handleJ(value, result, index, slavoGermanic);
133 break;
134 case 'K':
135 result.append('K');
136 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
137 break;
138 case 'L':
139 index = handleL(value, result, index);
140 break;
141 case 'M':
142 result.append('M');
143 index = conditionM0(value, index) ? index + 2 : index + 1;
144 break;
145 case 'N':
146 result.append('N');
147 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
148 break;
149 case '\u00D1':
150 // N with a tilde (spanish ene)
151 result.append('N');
152 index++;
153 break;
154 case 'P':
155 index = handleP(value, result, index);
156 break;
157 case 'Q':
158 result.append('K');
159 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
160 break;
161 case 'R':
162 index = handleR(value, result, index, slavoGermanic);
163 break;
164 case 'S':
165 index = handleS(value, result, index, slavoGermanic);
166 break;
167 case 'T':
168 index = handleT(value, result, index);
169 break;
170 case 'V':
171 result.append('F');
172 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
173 break;
174 case 'W':
175 index = handleW(value, result, index);
176 break;
177 case 'X':
178 index = handleX(value, result, index);
179 break;
180 case 'Z':
181 index = handleZ(value, result, index, slavoGermanic);
182 break;
183 default:
184 index++;
185 break;
186 }
187 }
188
189 return alternate ? result.getAlternate() : result.getPrimary();
190 }
191
192 /**
193 * Encode the value using DoubleMetaphone. It will only work if
194 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
195 *
196 * @param obj Object to encode (should be of type String)
197 * @return An encoded Object (will be of type String)
198 * @throws EncoderException encode parameter is not of type String
199 */
200 @Override
201 public Object encode(Object obj) throws EncoderException {
202 if (!(obj instanceof String)) {
203 throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
204 }
205 return doubleMetaphone((String) obj);
206 }
207
208 /**
209 * Encode the value using DoubleMetaphone.
210 *
211 * @param value String to encode
212 * @return An encoded String
213 */
214 @Override
215 public String encode(String value) {
216 return doubleMetaphone(value);
217 }
218
219 /**
220 * Check if the Double Metaphone values of two <code>String</code> values
221 * are equal.
222 *
223 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
224 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
225 * @return {@code true} if the encoded <code>String</code>s are equal;
226 * {@code false} otherwise.
227 * @see #isDoubleMetaphoneEqual(String,String,boolean)
228 */
229 public boolean isDoubleMetaphoneEqual(String value1, String value2) {
230 return isDoubleMetaphoneEqual(value1, value2, false);
231 }
232
233 /**
234 * Check if the Double Metaphone values of two <code>String</code> values
235 * are equal, optionally using the alternate value.
236 *
237 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
238 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
239 * @param alternate use the alternate value if {@code true}.
240 * @return {@code true} if the encoded <code>String</code>s are equal;
241 * {@code false} otherwise.
242 */
243 public boolean isDoubleMetaphoneEqual(String value1, String value2, boolean alternate) {
244 return doubleMetaphone(value1, alternate).equals(doubleMetaphone(value2, alternate));
245 }
246
247 /**
248 * Returns the maxCodeLen.
249 * @return int
250 */
251 public int getMaxCodeLen() {
252 return this.maxCodeLen;
253 }
254
255 /**
256 * Sets the maxCodeLen.
257 * @param maxCodeLen The maxCodeLen to set
258 */
259 public void setMaxCodeLen(int maxCodeLen) {
260 this.maxCodeLen = maxCodeLen;
261 }
262
263 //-- BEGIN HANDLERS --//
264
265 /**
266 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
267 */
268 private int handleAEIOUY(DoubleMetaphoneResult result, int index) {
269 if (index == 0) {
270 result.append('A');
271 }
272 return index + 1;
273 }
274
275 /**
276 * Handles 'C' cases.
277 */
278 private int handleC(String value, DoubleMetaphoneResult result, int index) {
279 if (conditionC0(value, index)) { // very confusing, moved out
280 result.append('K');
281 index += 2;
282 } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
283 result.append('S');
284 index += 2;
285 } else if (contains(value, index, 2, "CH")) {
286 index = handleCH(value, result, index);
287 } else if (contains(value, index, 2, "CZ") &&
288 !contains(value, index - 2, 4, "WICZ")) {
289 //-- "Czerny" --//
290 result.append('S', 'X');
291 index += 2;
292 } else if (contains(value, index + 1, 3, "CIA")) {
293 //-- "focaccia" --//
294 result.append('X');
295 index += 3;
296 } else if (contains(value, index, 2, "CC") &&
297 !(index == 1 && charAt(value, 0) == 'M')) {
298 //-- double "cc" but not "McClelland" --//
299 return handleCC(value, result, index);
300 } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
301 result.append('K');
302 index += 2;
303 } else if (contains(value, index, 2, "CI", "CE", "CY")) {
304 //-- Italian vs. English --//
305 if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
306 result.append('S', 'X');
307 } else {
308 result.append('S');
309 }
310 index += 2;
311 } else {
312 result.append('K');
313 if (contains(value, index + 1, 2, " C", " Q", " G")) {
314 //-- Mac Caffrey, Mac Gregor --//
315 index += 3;
316 } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
317 !contains(value, index + 1, 2, "CE", "CI")) {
318 index += 2;
319 } else {
320 index++;
321 }
322 }
323
324 return index;
325 }
326
327 /**
328 * Handles 'CC' cases.
329 */
330 private int handleCC(String value, DoubleMetaphoneResult result, int index) {
331 if (contains(value, index + 2, 1, "I", "E", "H") &&
332 !contains(value, index + 2, 2, "HU")) {
333 //-- "bellocchio" but not "bacchus" --//
334 if ((index == 1 && charAt(value, index - 1) == 'A') ||
335 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
336 //-- "accident", "accede", "succeed" --//
337 result.append("KS");
338 } else {
339 //-- "bacci", "bertucci", other Italian --//
340 result.append('X');
341 }
342 index += 3;
343 } else { // Pierce's rule
344 result.append('K');
345 index += 2;
346 }
347
348 return index;
349 }
350
351 /**
352 * Handles 'CH' cases.
353 */
354 private int handleCH(String value, DoubleMetaphoneResult result, int index) {
355 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
356 result.append('K', 'X');
357 return index + 2;
358 } else if (conditionCH0(value, index)) {
359 //-- Greek roots ("chemistry", "chorus", etc.) --//
360 result.append('K');
361 return index + 2;
362 } else if (conditionCH1(value, index)) {
363 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
364 result.append('K');
365 return index + 2;
366 } else {
367 if (index > 0) {
368 if (contains(value, 0, 2, "MC")) {
369 result.append('K');
370 } else {
371 result.append('X', 'K');
372 }
373 } else {
374 result.append('X');
375 }
376 return index + 2;
377 }
378 }
379
380 /**
381 * Handles 'D' cases.
382 */
383 private int handleD(String value, DoubleMetaphoneResult result, int index) {
384 if (contains(value, index, 2, "DG")) {
385 //-- "Edge" --//
386 if (contains(value, index + 2, 1, "I", "E", "Y")) {
387 result.append('J');
388 index += 3;
389 //-- "Edgar" --//
390 } else {
391 result.append("TK");
392 index += 2;
393 }
394 } else if (contains(value, index, 2, "DT", "DD")) {
395 result.append('T');
396 index += 2;
397 } else {
398 result.append('T');
399 index++;
400 }
401 return index;
402 }
403
404 /**
405 * Handles 'G' cases.
406 */
407 private int handleG(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
408 if (charAt(value, index + 1) == 'H') {
409 index = handleGH(value, result, index);
410 } else if (charAt(value, index + 1) == 'N') {
411 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
412 result.append("KN", "N");
413 } else if (!contains(value, index + 2, 2, "EY") &&
414 charAt(value, index + 1) != 'Y' && !slavoGermanic) {
415 result.append("N", "KN");
416 } else {
417 result.append("KN");
418 }
419 index = index + 2;
420 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
421 result.append("KL", "L");
422 index += 2;
423 } else if (index == 0 &&
424 (charAt(value, index + 1) == 'Y' ||
425 contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
426 //-- -ges-, -gep-, -gel-, -gie- at beginning --//
427 result.append('K', 'J');
428 index += 2;
429 } else if ((contains(value, index + 1, 2, "ER") ||
430 charAt(value, index + 1) == 'Y') &&
431 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
432 !contains(value, index - 1, 1, "E", "I") &&
433 !contains(value, index - 1, 3, "RGY", "OGY")) {
434 //-- -ger-, -gy- --//
435 result.append('K', 'J');
436 index += 2;
437 } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
438 contains(value, index - 1, 4, "AGGI", "OGGI")) {
439 //-- Italian "biaggi" --//
440 if (contains(value, 0 ,4, "VAN ", "VON ") ||
441 contains(value, 0, 3, "SCH") ||
442 contains(value, index + 1, 2, "ET")) {
443 //-- obvious germanic --//
444 result.append('K');
445 } else if (contains(value, index + 1, 3, "IER")) {
446 result.append('J');
447 } else {
448 result.append('J', 'K');
449 }
450 index += 2;
451 } else if (charAt(value, index + 1) == 'G') {
452 index += 2;
453 result.append('K');
454 } else {
455 index++;
456 result.append('K');
457 }
458 return index;
459 }
460
461 /**
462 * Handles 'GH' cases.
463 */
464 private int handleGH(String value, DoubleMetaphoneResult result, int index) {
465 if (index > 0 && !isVowel(charAt(value, index - 1))) {
466 result.append('K');
467 index += 2;
468 } else if (index == 0) {
469 if (charAt(value, index + 2) == 'I') {
470 result.append('J');
471 } else {
472 result.append('K');
473 }
474 index += 2;
475 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
476 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
477 (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
478 //-- Parker's rule (with some further refinements) - "hugh"
479 index += 2;
480 } else {
481 if (index > 2 && charAt(value, index - 1) == 'U' &&
482 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
483 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
484 result.append('F');
485 } else if (index > 0 && charAt(value, index - 1) != 'I') {
486 result.append('K');
487 }
488 index += 2;
489 }
490 return index;
491 }
492
493 /**
494 * Handles 'H' cases.
495 */
496 private int handleH(String value, DoubleMetaphoneResult result, int index) {
497 //-- only keep if first & before vowel or between 2 vowels --//
498 if ((index == 0 || isVowel(charAt(value, index - 1))) &&
499 isVowel(charAt(value, index + 1))) {
500 result.append('H');
501 index += 2;
502 //-- also takes car of "HH" --//
503 } else {
504 index++;
505 }
506 return index;
507 }
508
509 /**
510 * Handles 'J' cases.
511 */
512 private int handleJ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
513 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
514 //-- obvious Spanish, "Jose", "San Jacinto" --//
515 if ((index == 0 && (charAt(value, index + 4) == ' ') ||
516 value.length() == 4) || contains(value, 0, 4, "SAN ")) {
517 result.append('H');
518 } else {
519 result.append('J', 'H');
520 }
521 index++;
522 } else {
523 if (index == 0 && !contains(value, index, 4, "JOSE")) {
524 result.append('J', 'A');
525 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
526 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
527 result.append('J', 'H');
528 } else if (index == value.length() - 1) {
529 result.append('J', ' ');
530 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
531 !contains(value, index - 1, 1, "S", "K", "L")) {
532 result.append('J');
533 }
534
535 if (charAt(value, index + 1) == 'J') {
536 index += 2;
537 } else {
538 index++;
539 }
540 }
541 return index;
542 }
543
544 /**
545 * Handles 'L' cases.
546 */
547 private int handleL(String value, DoubleMetaphoneResult result, int index) {
548 if (charAt(value, index + 1) == 'L') {
549 if (conditionL0(value, index)) {
550 result.appendPrimary('L');
551 } else {
552 result.append('L');
553 }
554 index += 2;
555 } else {
556 index++;
557 result.append('L');
558 }
559 return index;
560 }
561
562 /**
563 * Handles 'P' cases.
564 */
565 private int handleP(String value, DoubleMetaphoneResult result, int index) {
566 if (charAt(value, index + 1) == 'H') {
567 result.append('F');
568 index += 2;
569 } else {
570 result.append('P');
571 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
572 }
573 return index;
574 }
575
576 /**
577 * Handles 'R' cases.
578 */
579 private int handleR(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
580 if (index == value.length() - 1 && !slavoGermanic &&
581 contains(value, index - 2, 2, "IE") &&
582 !contains(value, index - 4, 2, "ME", "MA")) {
583 result.appendAlternate('R');
584 } else {
585 result.append('R');
586 }
587 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
588 }
589
590 /**
591 * Handles 'S' cases.
592 */
593 private int handleS(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
594 if (contains(value, index - 1, 3, "ISL", "YSL")) {
595 //-- special cases "island", "isle", "carlisle", "carlysle" --//
596 index++;
597 } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
598 //-- special case "sugar-" --//
599 result.append('X', 'S');
600 index++;
601 } else if (contains(value, index, 2, "SH")) {
602 if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
603 //-- germanic --//
604 result.append('S');
605 } else {
606 result.append('X');
607 }
608 index += 2;
609 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
610 //-- Italian and Armenian --//
611 if (slavoGermanic) {
612 result.append('S');
613 } else {
614 result.append('S', 'X');
615 }
616 index += 3;
617 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) ||
618 contains(value, index + 1, 1, "Z")) {
619 //-- german & anglicisations, e.g. "smith" match "schmidt" //
620 // "snider" match "schneider" --//
621 //-- also, -sz- in slavic language altho in hungarian it //
622 // is pronounced "s" --//
623 result.append('S', 'X');
624 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
625 } else if (contains(value, index, 2, "SC")) {
626 index = handleSC(value, result, index);
627 } else {
628 if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
629 //-- french e.g. "resnais", "artois" --//
630 result.appendAlternate('S');
631 } else {
632 result.append('S');
633 }
634 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
635 }
636 return index;
637 }
638
639 /**
640 * Handles 'SC' cases.
641 */
642 private int handleSC(String value, DoubleMetaphoneResult result, int index) {
643 if (charAt(value, index + 2) == 'H') {
644 //-- Schlesinger's rule --//
645 if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
646 //-- Dutch origin, e.g. "school", "schooner" --//
647 if (contains(value, index + 3, 2, "ER", "EN")) {
648 //-- "schermerhorn", "schenker" --//
649 result.append("X", "SK");
650 } else {
651 result.append("SK");
652 }
653 } else {
654 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
655 result.append('X', 'S');
656 } else {
657 result.append('X');
658 }
659 }
660 } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
661 result.append('S');
662 } else {
663 result.append("SK");
664 }
665 return index + 3;
666 }
667
668 /**
669 * Handles 'T' cases.
670 */
671 private int handleT(String value, DoubleMetaphoneResult result, int index) {
672 if (contains(value, index, 4, "TION")) {
673 result.append('X');
674 index += 3;
675 } else if (contains(value, index, 3, "TIA", "TCH")) {
676 result.append('X');
677 index += 3;
678 } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
679 if (contains(value, index + 2, 2, "OM", "AM") ||
680 //-- special case "thomas", "thames" or germanic --//
681 contains(value, 0, 4, "VAN ", "VON ") ||
682 contains(value, 0, 3, "SCH")) {
683 result.append('T');
684 } else {
685 result.append('0', 'T');
686 }
687 index += 2;
688 } else {
689 result.append('T');
690 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
691 }
692 return index;
693 }
694
695 /**
696 * Handles 'W' cases.
697 */
698 private int handleW(String value, DoubleMetaphoneResult result, int index) {
699 if (contains(value, index, 2, "WR")) {
700 //-- can also be in middle of word --//
701 result.append('R');
702 index += 2;
703 } else {
704 if (index == 0 && (isVowel(charAt(value, index + 1)) ||
705 contains(value, index, 2, "WH"))) {
706 if (isVowel(charAt(value, index + 1))) {
707 //-- Wasserman should match Vasserman --//
708 result.append('A', 'F');
709 } else {
710 //-- need Uomo to match Womo --//
711 result.append('A');
712 }
713 index++;
714 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
715 contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
716 contains(value, 0, 3, "SCH")) {
717 //-- Arnow should match Arnoff --//
718 result.appendAlternate('F');
719 index++;
720 } else if (contains(value, index, 4, "WICZ", "WITZ")) {
721 //-- Polish e.g. "filipowicz" --//
722 result.append("TS", "FX");
723 index += 4;
724 } else {
725 index++;
726 }
727 }
728 return index;
729 }
730
731 /**
732 * Handles 'X' cases.
733 */
734 private int handleX(String value, DoubleMetaphoneResult result, int index) {
735 if (index == 0) {
736 result.append('S');
737 index++;
738 } else {
739 if (!((index == value.length() - 1) &&
740 (contains(value, index - 3, 3, "IAU", "EAU") ||
741 contains(value, index - 2, 2, "AU", "OU")))) {
742 //-- French e.g. breaux --//
743 result.append("KS");
744 }
745 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
746 }
747 return index;
748 }
749
750 /**
751 * Handles 'Z' cases.
752 */
753 private int handleZ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
754 if (charAt(value, index + 1) == 'H') {
755 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
756 result.append('J');
757 index += 2;
758 } else {
759 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
760 (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
761 result.append("S", "TS");
762 } else {
763 result.append('S');
764 }
765 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
766 }
767 return index;
768 }
769
770 //-- BEGIN CONDITIONS --//
771
772 /**
773 * Complex condition 0 for 'C'.
774 */
775 private boolean conditionC0(String value, int index) {
776 if (contains(value, index, 4, "CHIA")) {
777 return true;
778 } else if (index <= 1) {
779 return false;
780 } else if (isVowel(charAt(value, index - 2))) {
781 return false;
782 } else if (!contains(value, index - 1, 3, "ACH")) {
783 return false;
784 } else {
785 char c = charAt(value, index + 2);
786 return (c != 'I' && c != 'E') ||
787 contains(value, index - 2, 6, "BACHER", "MACHER");
788 }
789 }
790
791 /**
792 * Complex condition 0 for 'CH'.
793 */
794 private boolean conditionCH0(String value, int index) {
795 if (index != 0) {
796 return false;
797 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
798 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
799 return false;
800 } else if (contains(value, 0, 5, "CHORE")) {
801 return false;
802 } else {
803 return true;
804 }
805 }
806
807 /**
808 * Complex condition 1 for 'CH'.
809 */
810 private boolean conditionCH1(String value, int index) {
811 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) ||
812 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
813 contains(value, index + 2, 1, "T", "S") ||
814 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
815 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
816 }
817
818 /**
819 * Complex condition 0 for 'L'.
820 */
821 private boolean conditionL0(String value, int index) {
822 if (index == value.length() - 3 &&
823 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
824 return true;
825 } else if ((contains(value, value.length() - 2, 2, "AS", "OS") ||
826 contains(value, value.length() - 1, 1, "A", "O")) &&
827 contains(value, index - 1, 4, "ALLE")) {
828 return true;
829 } else {
830 return false;
831 }
832 }
833
834 /**
835 * Complex condition 0 for 'M'.
836 */
837 private boolean conditionM0(String value, int index) {
838 if (charAt(value, index + 1) == 'M') {
839 return true;
840 }
841 return contains(value, index - 1, 3, "UMB") &&
842 ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER"));
843 }
844
845 //-- BEGIN HELPER FUNCTIONS --//
846
847 /**
848 * Determines whether or not a value is of slavo-germanic orgin. A value is
849 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
850 */
851 private boolean isSlavoGermanic(String value) {
852 return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
853 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
854 }
855
856 /**
857 * Determines whether or not a character is a vowel or not
858 */
859 private boolean isVowel(char ch) {
860 return VOWELS.indexOf(ch) != -1;
861 }
862
863 /**
864 * Determines whether or not the value starts with a silent letter. It will
865 * return {@code true} if the value starts with any of 'GN', 'KN',
866 * 'PN', 'WR' or 'PS'.
867 */
868 private boolean isSilentStart(String value) {
869 boolean result = false;
870 for (String element : SILENT_START) {
871 if (value.startsWith(element)) {
872 result = true;
873 break;
874 }
875 }
876 return result;
877 }
878
879 /**
880 * Cleans the input.
881 */
882 private String cleanInput(String input) {
883 if (input == null) {
884 return null;
885 }
886 input = input.trim();
887 if (input.length() == 0) {
888 return null;
889 }
890 return input.toUpperCase(java.util.Locale.ENGLISH);
891 }
892
893 /**
894 * Gets the character at index <code>index</code> if available, otherwise
895 * it returns <code>Character.MIN_VALUE</code> so that there is some sort
896 * of a default.
897 */
898 protected char charAt(String value, int index) {
899 if (index < 0 || index >= value.length()) {
900 return Character.MIN_VALUE;
901 }
902 return value.charAt(index);
903 }
904
905 /**
906 * Shortcut method with 1 criteria.
907 */
908 private static boolean contains(String value, int start, int length, String criteria) {
909 return contains(value, start, length, new String[] { criteria });
910 }
911
912 /**
913 * Shortcut method with 2 criteria.
914 */
915 private static boolean contains(String value, int start, int length,
916 String criteria1, String criteria2) {
917 return contains(value, start, length, new String[] { criteria1, criteria2 });
918 }
919
920 /**
921 * Shortcut method with 3 criteria.
922 */
923 private static boolean contains(String value, int start, int length,
924 String criteria1, String criteria2, String criteria3) {
925 return contains(value, start, length, new String[] { criteria1, criteria2, criteria3 });
926 }
927
928 /**
929 * Shortcut method with 4 criteria.
930 */
931 private static boolean contains(String value, int start, int length,
932 String criteria1, String criteria2,
933 String criteria3, String criteria4) {
934 return contains(value, start, length,
935 new String[] { criteria1, criteria2, criteria3, criteria4 });
936 }
937
938 /**
939 * Shortcut method with 5 criteria.
940 */
941 private static boolean contains(String value, int start, int length,
942 String criteria1, String criteria2,
943 String criteria3, String criteria4,
944 String criteria5) {
945 return contains(value, start, length,
946 new String[] { criteria1, criteria2, criteria3,
947 criteria4, criteria5 });
948 }
949
950 /**
951 * Shortcut method with 6 criteria.
952 */
953 private static boolean contains(String value, int start, int length,
954 String criteria1, String criteria2,
955 String criteria3, String criteria4,
956 String criteria5, String criteria6) {
957 return contains(value, start, length,
958 new String[] { criteria1, criteria2, criteria3,
959 criteria4, criteria5, criteria6 });
960 }
961
962 /**
963 * Determines whether <code>value</code> contains any of the criteria starting at index <code>start</code> and
964 * matching up to length <code>length</code>.
965 */
966 protected static boolean contains(String value, int start, int length,
967 String[] criteria) {
968 boolean result = false;
969 if (start >= 0 && start + length <= value.length()) {
970 String target = value.substring(start, start + length);
971
972 for (String element : criteria) {
973 if (target.equals(element)) {
974 result = true;
975 break;
976 }
977 }
978 }
979 return result;
980 }
981
982 //-- BEGIN INNER CLASSES --//
983
984 /**
985 * Inner class for storing results, since there is the optional alternate encoding.
986 */
987 public class DoubleMetaphoneResult {
988
989 private final StringBuilder primary = new StringBuilder(getMaxCodeLen());
990 private final StringBuilder alternate = new StringBuilder(getMaxCodeLen());
991 private final int maxLength;
992
993 public DoubleMetaphoneResult(int maxLength) {
994 this.maxLength = maxLength;
995 }
996
997 public void append(char value) {
998 appendPrimary(value);
999 appendAlternate(value);
1000 }
1001
1002 public void append(char primary, char alternate) {
1003 appendPrimary(primary);
1004 appendAlternate(alternate);
1005 }
1006
1007 public void appendPrimary(char value) {
1008 if (this.primary.length() < this.maxLength) {
1009 this.primary.append(value);
1010 }
1011 }
1012
1013 public void appendAlternate(char value) {
1014 if (this.alternate.length() < this.maxLength) {
1015 this.alternate.append(value);
1016 }
1017 }
1018
1019 public void append(String value) {
1020 appendPrimary(value);
1021 appendAlternate(value);
1022 }
1023
1024 public void append(String primary, String alternate) {
1025 appendPrimary(primary);
1026 appendAlternate(alternate);
1027 }
1028
1029 public void appendPrimary(String value) {
1030 int addChars = this.maxLength - this.primary.length();
1031 if (value.length() <= addChars) {
1032 this.primary.append(value);
1033 } else {
1034 this.primary.append(value.substring(0, addChars));
1035 }
1036 }
1037
1038 public void appendAlternate(String value) {
1039 int addChars = this.maxLength - this.alternate.length();
1040 if (value.length() <= addChars) {
1041 this.alternate.append(value);
1042 } else {
1043 this.alternate.append(value.substring(0, addChars));
1044 }
1045 }
1046
1047 public String getPrimary() {
1048 return this.primary.toString();
1049 }
1050
1051 public String getAlternate() {
1052 return this.alternate.toString();
1053 }
1054
1055 public boolean isComplete() {
1056 return this.primary.length() >= this.maxLength &&
1057 this.alternate.length() >= this.maxLength;
1058 }
1059 }
1060 }