001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022import org.apache.commons.codec.binary.StringUtils; 023 024/** 025 * Encodes a string into a double metaphone value. This Implementation is based on the algorithm by <CITE>Lawrence 026 * Philips</CITE>. 027 * <p> 028 * This class is conditionally thread-safe. The instance field for the maximum code length is mutable 029 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is 030 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication 031 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup. 032 * </p> 033 * 034 * @see <a href="https://drdobbs.com/the-double-metaphone-search-algorithm/184401251?pgno=2">Dr. Dobbs Original Article</a> 035 * @see <a href="https://en.wikipedia.org/wiki/Metaphone">Wikipedia Metaphone</a> 036 */ 037public class DoubleMetaphone implements StringEncoder { 038 039 /** 040 * Inner class for storing results, since there is the optional alternate encoding. 041 */ 042 public class DoubleMetaphoneResult { 043 044 private final StringBuilder primary = new StringBuilder(getMaxCodeLen()); 045 private final StringBuilder alternate = new StringBuilder(getMaxCodeLen()); 046 private final int maxLength; 047 048 public DoubleMetaphoneResult(final int maxLength) { 049 this.maxLength = maxLength; 050 } 051 052 public void append(final char value) { 053 appendPrimary(value); 054 appendAlternate(value); 055 } 056 057 public void append(final char primary, final char alternate) { 058 appendPrimary(primary); 059 appendAlternate(alternate); 060 } 061 062 public void append(final String value) { 063 appendPrimary(value); 064 appendAlternate(value); 065 } 066 067 public void append(final String primary, final String alternate) { 068 appendPrimary(primary); 069 appendAlternate(alternate); 070 } 071 072 public void appendAlternate(final char value) { 073 if (this.alternate.length() < this.maxLength) { 074 this.alternate.append(value); 075 } 076 } 077 078 public void appendAlternate(final String value) { 079 final int addChars = this.maxLength - this.alternate.length(); 080 if (value.length() <= addChars) { 081 this.alternate.append(value); 082 } else { 083 this.alternate.append(value, 0, addChars); 084 } 085 } 086 087 public void appendPrimary(final char value) { 088 if (this.primary.length() < this.maxLength) { 089 this.primary.append(value); 090 } 091 } 092 093 public void appendPrimary(final String value) { 094 final int addChars = this.maxLength - this.primary.length(); 095 if (value.length() <= addChars) { 096 this.primary.append(value); 097 } else { 098 this.primary.append(value, 0, addChars); 099 } 100 } 101 102 public String getAlternate() { 103 return this.alternate.toString(); 104 } 105 106 public String getPrimary() { 107 return this.primary.toString(); 108 } 109 110 public boolean isComplete() { 111 return this.primary.length() >= this.maxLength && 112 this.alternate.length() >= this.maxLength; 113 } 114 } 115 116 /** 117 * "Vowels" to test for 118 */ 119 private static final String VOWELS = "AEIOUY"; 120 /** 121 * Prefixes when present which are not pronounced 122 */ 123 private static final String[] SILENT_START = 124 { "GN", "KN", "PN", "WR", "PS" }; 125 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 126 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 127 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 128 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 129 130 private static final String[] L_T_K_S_N_M_B_Z = 131 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 132 133 /* 134 * Determines whether {@code value} contains any of the criteria starting at index {@code start} and 135 * matching up to length {@code length}. 136 */ 137 protected static boolean contains(final String value, final int start, final int length, 138 final String... criteria) { 139 boolean result = false; 140 if (start >= 0 && start + length <= value.length()) { 141 final String target = value.substring(start, start + length); 142 143 for (final String element : criteria) { 144 if (target.equals(element)) { 145 result = true; 146 break; 147 } 148 } 149 } 150 return result; 151 } 152 153 /** 154 * Maximum length of an encoding, default is 4 155 */ 156 private int maxCodeLen = 4; 157 158 /* 159 * Gets the character at index {@code index} if available, otherwise 160 * it returns {@code Character.MIN_VALUE} so that there is some sort 161 * of default. 162 */ 163 protected char charAt(final String value, final int index) { 164 if (index < 0 || index >= value.length()) { 165 return Character.MIN_VALUE; 166 } 167 return value.charAt(index); 168 } 169 170 /** 171 * Cleans the input. 172 */ 173 private String cleanInput(String input) { 174 if (input == null) { 175 return null; 176 } 177 input = input.trim(); 178 if (input.isEmpty()) { 179 return null; 180 } 181 return input.toUpperCase(java.util.Locale.ENGLISH); 182 } 183 184 /** 185 * Complex condition 0 for 'C'. 186 */ 187 private boolean conditionC0(final String value, final int index) { 188 if (contains(value, index, 4, "CHIA")) { 189 return true; 190 } 191 if (index <= 1) { 192 return false; 193 } 194 if (isVowel(charAt(value, index - 2))) { 195 return false; 196 } 197 if (!contains(value, index - 1, 3, "ACH")) { 198 return false; 199 } 200 final char c = charAt(value, index + 2); 201 return c != 'I' && c != 'E' || 202 contains(value, index - 2, 6, "BACHER", "MACHER"); 203 } 204 205 /** 206 * Complex condition 0 for 'CH'. 207 */ 208 private boolean conditionCH0(final String value, final int index) { 209 if (index != 0) { 210 return false; 211 } 212 if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 213 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 214 return false; 215 } 216 return !contains(value, 0, 5, "CHORE"); 217 } 218 219 /** 220 * Complex condition 1 for 'CH'. 221 */ 222 private boolean conditionCH1(final String value, final int index) { 223 return contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH") || 224 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 225 contains(value, index + 2, 1, "T", "S") || 226 (contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 227 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1); 228 } 229 230 /** 231 * Complex condition 0 for 'L'. 232 */ 233 private boolean conditionL0(final String value, final int index) { 234 if (index == value.length() - 3 && 235 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 236 return true; 237 } 238 return (contains(value, value.length() - 2, 2, "AS", "OS") || 239 contains(value, value.length() - 1, 1, "A", "O")) && 240 contains(value, index - 1, 4, "ALLE"); 241 } 242 243 //-- BEGIN HANDLERS --// 244 245 /** 246 * Complex condition 0 for 'M'. 247 */ 248 private boolean conditionM0(final String value, final int index) { 249 if (charAt(value, index + 1) == 'M') { 250 return true; 251 } 252 return contains(value, index - 1, 3, "UMB") && 253 (index + 1 == value.length() - 1 || contains(value, index + 2, 2, "ER")); 254 } 255 256 /** 257 * Encode a value with Double Metaphone. 258 * 259 * @param value String to encode 260 * @return an encoded string 261 */ 262 public String doubleMetaphone(final String value) { 263 return doubleMetaphone(value, false); 264 } 265 266 /** 267 * Encode a value with Double Metaphone, optionally using the alternate encoding. 268 * 269 * @param value String to encode 270 * @param alternate use alternate encode 271 * @return an encoded string 272 */ 273 public String doubleMetaphone(String value, final boolean alternate) { 274 value = cleanInput(value); 275 if (value == null) { 276 return null; 277 } 278 279 final boolean slavoGermanic = isSlavoGermanic(value); 280 int index = isSilentStart(value) ? 1 : 0; 281 282 final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 283 284 while (!result.isComplete() && index <= value.length() - 1) { 285 switch (value.charAt(index)) { 286 case 'A': 287 case 'E': 288 case 'I': 289 case 'O': 290 case 'U': 291 case 'Y': 292 index = handleAEIOUY(result, index); 293 break; 294 case 'B': 295 result.append('P'); 296 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 297 break; 298 case '\u00C7': 299 // A C with a Cedilla 300 result.append('S'); 301 index++; 302 break; 303 case 'C': 304 index = handleC(value, result, index); 305 break; 306 case 'D': 307 index = handleD(value, result, index); 308 break; 309 case 'F': 310 result.append('F'); 311 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 312 break; 313 case 'G': 314 index = handleG(value, result, index, slavoGermanic); 315 break; 316 case 'H': 317 index = handleH(value, result, index); 318 break; 319 case 'J': 320 index = handleJ(value, result, index, slavoGermanic); 321 break; 322 case 'K': 323 result.append('K'); 324 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 325 break; 326 case 'L': 327 index = handleL(value, result, index); 328 break; 329 case 'M': 330 result.append('M'); 331 index = conditionM0(value, index) ? index + 2 : index + 1; 332 break; 333 case 'N': 334 result.append('N'); 335 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 336 break; 337 case '\u00D1': 338 // N with a tilde (spanish ene) 339 result.append('N'); 340 index++; 341 break; 342 case 'P': 343 index = handleP(value, result, index); 344 break; 345 case 'Q': 346 result.append('K'); 347 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 348 break; 349 case 'R': 350 index = handleR(value, result, index, slavoGermanic); 351 break; 352 case 'S': 353 index = handleS(value, result, index, slavoGermanic); 354 break; 355 case 'T': 356 index = handleT(value, result, index); 357 break; 358 case 'V': 359 result.append('F'); 360 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 361 break; 362 case 'W': 363 index = handleW(value, result, index); 364 break; 365 case 'X': 366 index = handleX(value, result, index); 367 break; 368 case 'Z': 369 index = handleZ(value, result, index, slavoGermanic); 370 break; 371 default: 372 index++; 373 break; 374 } 375 } 376 377 return alternate ? result.getAlternate() : result.getPrimary(); 378 } 379 380 /** 381 * Encode the value using DoubleMetaphone. It will only work if 382 * {@code obj} is a {@code String} (like {@code Metaphone}). 383 * 384 * @param obj Object to encode (should be of type String) 385 * @return An encoded Object (will be of type String) 386 * @throws EncoderException encode parameter is not of type String 387 */ 388 @Override 389 public Object encode(final Object obj) throws EncoderException { 390 if (!(obj instanceof String)) { 391 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 392 } 393 return doubleMetaphone((String) obj); 394 } 395 396 /** 397 * Encode the value using DoubleMetaphone. 398 * 399 * @param value String to encode 400 * @return An encoded String 401 */ 402 @Override 403 public String encode(final String value) { 404 return doubleMetaphone(value); 405 } 406 407 /** 408 * Returns the maxCodeLen. 409 * @return int 410 */ 411 public int getMaxCodeLen() { 412 return this.maxCodeLen; 413 } 414 415 /** 416 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases. 417 */ 418 private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) { 419 if (index == 0) { 420 result.append('A'); 421 } 422 return index + 1; 423 } 424 425 /** 426 * Handles 'C' cases. 427 */ 428 private int handleC(final String value, final DoubleMetaphoneResult result, int index) { 429 if (conditionC0(value, index)) { // very confusing, moved out 430 result.append('K'); 431 index += 2; 432 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 433 result.append('S'); 434 index += 2; 435 } else if (contains(value, index, 2, "CH")) { 436 index = handleCH(value, result, index); 437 } else if (contains(value, index, 2, "CZ") && 438 !contains(value, index - 2, 4, "WICZ")) { 439 //-- "Czerny" --// 440 result.append('S', 'X'); 441 index += 2; 442 } else if (contains(value, index + 1, 3, "CIA")) { 443 //-- "focaccia" --// 444 result.append('X'); 445 index += 3; 446 } else if (contains(value, index, 2, "CC") && 447 !(index == 1 && charAt(value, 0) == 'M')) { 448 //-- double "cc" but not "McClelland" --// 449 return handleCC(value, result, index); 450 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 451 result.append('K'); 452 index += 2; 453 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 454 //-- Italian vs. English --// 455 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 456 result.append('S', 'X'); 457 } else { 458 result.append('S'); 459 } 460 index += 2; 461 } else { 462 result.append('K'); 463 if (contains(value, index + 1, 2, " C", " Q", " G")) { 464 //-- Mac Caffrey, Mac Gregor --// 465 index += 3; 466 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 467 !contains(value, index + 1, 2, "CE", "CI")) { 468 index += 2; 469 } else { 470 index++; 471 } 472 } 473 474 return index; 475 } 476 477 /** 478 * Handles 'CC' cases. 479 */ 480 private int handleCC(final String value, final DoubleMetaphoneResult result, int index) { 481 if (contains(value, index + 2, 1, "I", "E", "H") && 482 !contains(value, index + 2, 2, "HU")) { 483 //-- "bellocchio" but not "bacchus" --// 484 if (index == 1 && charAt(value, index - 1) == 'A' || 485 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 486 //-- "accident", "accede", "succeed" --// 487 result.append("KS"); 488 } else { 489 //-- "bacci", "bertucci", other Italian --// 490 result.append('X'); 491 } 492 index += 3; 493 } else { // Pierce's rule 494 result.append('K'); 495 index += 2; 496 } 497 498 return index; 499 } 500 501 /** 502 * Handles 'CH' cases. 503 */ 504 private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) { 505 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 506 result.append('K', 'X'); 507 return index + 2; 508 } 509 if (conditionCH0(value, index)) { 510 //-- Greek roots ("chemistry", "chorus", etc.) --// 511 result.append('K'); 512 return index + 2; 513 } 514 if (conditionCH1(value, index)) { 515 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 516 result.append('K'); 517 return index + 2; 518 } 519 if (index > 0) { 520 if (contains(value, 0, 2, "MC")) { 521 result.append('K'); 522 } else { 523 result.append('X', 'K'); 524 } 525 } else { 526 result.append('X'); 527 } 528 return index + 2; 529 } 530 531 /** 532 * Handles 'D' cases. 533 */ 534 private int handleD(final String value, final DoubleMetaphoneResult result, int index) { 535 if (contains(value, index, 2, "DG")) { 536 //-- "Edge" --// 537 if (contains(value, index + 2, 1, "I", "E", "Y")) { 538 result.append('J'); 539 index += 3; 540 //-- "Edgar" --// 541 } else { 542 result.append("TK"); 543 index += 2; 544 } 545 } else if (contains(value, index, 2, "DT", "DD")) { 546 result.append('T'); 547 index += 2; 548 } else { 549 result.append('T'); 550 index++; 551 } 552 return index; 553 } 554 555 /** 556 * Handles 'G' cases. 557 */ 558 private int handleG(final String value, final DoubleMetaphoneResult result, int index, 559 final boolean slavoGermanic) { 560 if (charAt(value, index + 1) == 'H') { 561 index = handleGH(value, result, index); 562 } else if (charAt(value, index + 1) == 'N') { 563 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 564 result.append("KN", "N"); 565 } else if (!contains(value, index + 2, 2, "EY") && 566 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 567 result.append("N", "KN"); 568 } else { 569 result.append("KN"); 570 } 571 index += 2; 572 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 573 result.append("KL", "L"); 574 index += 2; 575 } else if (index == 0 && 576 (charAt(value, index + 1) == 'Y' || 577 contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 578 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 579 result.append('K', 'J'); 580 index += 2; 581 } else if ((contains(value, index + 1, 2, "ER") || 582 charAt(value, index + 1) == 'Y') && 583 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 584 !contains(value, index - 1, 1, "E", "I") && 585 !contains(value, index - 1, 3, "RGY", "OGY")) { 586 //-- -ger-, -gy- --// 587 result.append('K', 'J'); 588 index += 2; 589 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 590 contains(value, index - 1, 4, "AGGI", "OGGI")) { 591 //-- Italian "biaggi" --// 592 if (contains(value, 0, 4, "VAN ", "VON ") || 593 contains(value, 0, 3, "SCH") || 594 contains(value, index + 1, 2, "ET")) { 595 //-- obvious germanic --// 596 result.append('K'); 597 } else if (contains(value, index + 1, 3, "IER")) { 598 result.append('J'); 599 } else { 600 result.append('J', 'K'); 601 } 602 index += 2; 603 } else { 604 if (charAt(value, index + 1) == 'G') { 605 index += 2; 606 } else { 607 index++; 608 } 609 result.append('K'); 610 } 611 return index; 612 } 613 614 /** 615 * Handles 'GH' cases. 616 */ 617 private int handleGH(final String value, final DoubleMetaphoneResult result, int index) { 618 if (index > 0 && !isVowel(charAt(value, index - 1))) { 619 result.append('K'); 620 index += 2; 621 } else if (index == 0) { 622 if (charAt(value, index + 2) == 'I') { 623 result.append('J'); 624 } else { 625 result.append('K'); 626 } 627 index += 2; 628 } else if (index > 1 && contains(value, index - 2, 1, "B", "H", "D") || 629 index > 2 && contains(value, index - 3, 1, "B", "H", "D") || 630 index > 3 && contains(value, index - 4, 1, "B", "H")) { 631 //-- Parker's rule (with some further refinements) - "hugh" 632 index += 2; 633 } else { 634 if (index > 2 && charAt(value, index - 1) == 'U' && 635 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 636 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 637 result.append('F'); 638 } else if (index > 0 && charAt(value, index - 1) != 'I') { 639 result.append('K'); 640 } 641 index += 2; 642 } 643 return index; 644 } 645 646 /** 647 * Handles 'H' cases. 648 */ 649 private int handleH(final String value, final DoubleMetaphoneResult result, int index) { 650 //-- only keep if first & before vowel or between 2 vowels --// 651 if ((index == 0 || isVowel(charAt(value, index - 1))) && 652 isVowel(charAt(value, index + 1))) { 653 result.append('H'); 654 index += 2; 655 //-- also takes car of "HH" --// 656 } else { 657 index++; 658 } 659 return index; 660 } 661 662 /** 663 * Handles 'J' cases. 664 */ 665 private int handleJ(final String value, final DoubleMetaphoneResult result, int index, 666 final boolean slavoGermanic) { 667 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 668 //-- obvious Spanish, "Jose", "San Jacinto" --// 669 if (index == 0 && charAt(value, index + 4) == ' ' || 670 value.length() == 4 || contains(value, 0, 4, "SAN ")) { 671 result.append('H'); 672 } else { 673 result.append('J', 'H'); 674 } 675 index++; 676 } else { 677 if (index == 0 && !contains(value, index, 4, "JOSE")) { 678 result.append('J', 'A'); 679 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 680 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 681 result.append('J', 'H'); 682 } else if (index == value.length() - 1) { 683 result.append('J', ' '); 684 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && 685 !contains(value, index - 1, 1, "S", "K", "L")) { 686 result.append('J'); 687 } 688 689 if (charAt(value, index + 1) == 'J') { 690 index += 2; 691 } else { 692 index++; 693 } 694 } 695 return index; 696 } 697 698 /** 699 * Handles 'L' cases. 700 */ 701 private int handleL(final String value, final DoubleMetaphoneResult result, int index) { 702 if (charAt(value, index + 1) == 'L') { 703 if (conditionL0(value, index)) { 704 result.appendPrimary('L'); 705 } else { 706 result.append('L'); 707 } 708 index += 2; 709 } else { 710 index++; 711 result.append('L'); 712 } 713 return index; 714 } 715 716 /** 717 * Handles 'P' cases. 718 */ 719 private int handleP(final String value, final DoubleMetaphoneResult result, int index) { 720 if (charAt(value, index + 1) == 'H') { 721 result.append('F'); 722 index += 2; 723 } else { 724 result.append('P'); 725 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 726 } 727 return index; 728 } 729 730 /** 731 * Handles 'R' cases. 732 */ 733 private int handleR(final String value, final DoubleMetaphoneResult result, final int index, 734 final boolean slavoGermanic) { 735 if (index == value.length() - 1 && !slavoGermanic && 736 contains(value, index - 2, 2, "IE") && 737 !contains(value, index - 4, 2, "ME", "MA")) { 738 result.appendAlternate('R'); 739 } else { 740 result.append('R'); 741 } 742 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 743 } 744 745 //-- BEGIN CONDITIONS --// 746 747 /** 748 * Handles 'S' cases. 749 */ 750 private int handleS(final String value, final DoubleMetaphoneResult result, int index, 751 final boolean slavoGermanic) { 752 if (contains(value, index - 1, 3, "ISL", "YSL")) { 753 //-- special cases "island", "isle", "carlisle", "carlysle" --// 754 index++; 755 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 756 //-- special case "sugar-" --// 757 result.append('X', 'S'); 758 index++; 759 } else if (contains(value, index, 2, "SH")) { 760 if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { 761 //-- germanic --// 762 result.append('S'); 763 } else { 764 result.append('X'); 765 } 766 index += 2; 767 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 768 //-- Italian and Armenian --// 769 if (slavoGermanic) { 770 result.append('S'); 771 } else { 772 result.append('S', 'X'); 773 } 774 index += 3; 775 } else if (index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W") || 776 contains(value, index + 1, 1, "Z")) { 777 //-- german & anglicisations, e.g. "smith" match "schmidt" // 778 // "snider" match "schneider" --// 779 //-- also, -sz- in slavic language although in hungarian it // 780 // is pronounced "s" --// 781 result.append('S', 'X'); 782 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 783 } else if (contains(value, index, 2, "SC")) { 784 index = handleSC(value, result, index); 785 } else { 786 if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) { 787 //-- french e.g. "resnais", "artois" --// 788 result.appendAlternate('S'); 789 } else { 790 result.append('S'); 791 } 792 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 793 } 794 return index; 795 } 796 797 /** 798 * Handles 'SC' cases. 799 */ 800 private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) { 801 if (charAt(value, index + 2) == 'H') { 802 //-- Schlesinger's rule --// 803 if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 804 //-- Dutch origin, e.g. "school", "schooner" --// 805 if (contains(value, index + 3, 2, "ER", "EN")) { 806 //-- "schermerhorn", "schenker" --// 807 result.append("X", "SK"); 808 } else { 809 result.append("SK"); 810 } 811 } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 812 result.append('X', 'S'); 813 } else { 814 result.append('X'); 815 } 816 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 817 result.append('S'); 818 } else { 819 result.append("SK"); 820 } 821 return index + 3; 822 } 823 824 /** 825 * Handles 'T' cases. 826 */ 827 private int handleT(final String value, final DoubleMetaphoneResult result, int index) { 828 if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) { 829 result.append('X'); 830 index += 3; 831 } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) { 832 if (contains(value, index + 2, 2, "OM", "AM") || 833 //-- special case "thomas", "thames" or germanic --// 834 contains(value, 0, 4, "VAN ", "VON ") || 835 contains(value, 0, 3, "SCH")) { 836 result.append('T'); 837 } else { 838 result.append('0', 'T'); 839 } 840 index += 2; 841 } else { 842 result.append('T'); 843 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 844 } 845 return index; 846 } 847 848 /** 849 * Handles 'W' cases. 850 */ 851 private int handleW(final String value, final DoubleMetaphoneResult result, int index) { 852 if (contains(value, index, 2, "WR")) { 853 //-- can also be in middle of word --// 854 result.append('R'); 855 index += 2; 856 } else if (index == 0 && (isVowel(charAt(value, index + 1)) || 857 contains(value, index, 2, "WH"))) { 858 if (isVowel(charAt(value, index + 1))) { 859 //-- Wasserman should match Vasserman --// 860 result.append('A', 'F'); 861 } else { 862 //-- need Uomo to match Womo --// 863 result.append('A'); 864 } 865 index++; 866 } else if (index == value.length() - 1 && isVowel(charAt(value, index - 1)) || 867 contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 868 contains(value, 0, 3, "SCH")) { 869 //-- Arnow should match Arnoff --// 870 result.appendAlternate('F'); 871 index++; 872 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 873 //-- Polish e.g. "filipowicz" --// 874 result.append("TS", "FX"); 875 index += 4; 876 } else { 877 index++; 878 } 879 return index; 880 } 881 882 /** 883 * Handles 'X' cases. 884 */ 885 private int handleX(final String value, final DoubleMetaphoneResult result, int index) { 886 if (index == 0) { 887 result.append('S'); 888 index++; 889 } else { 890 if (!(index == value.length() - 1 && 891 (contains(value, index - 3, 3, "IAU", "EAU") || 892 contains(value, index - 2, 2, "AU", "OU")))) { 893 //-- French e.g. breaux --// 894 result.append("KS"); 895 } 896 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 897 } 898 return index; 899 } 900 901 //-- BEGIN HELPER FUNCTIONS --// 902 903 /** 904 * Handles 'Z' cases. 905 */ 906 private int handleZ(final String value, final DoubleMetaphoneResult result, int index, 907 final boolean slavoGermanic) { 908 if (charAt(value, index + 1) == 'H') { 909 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 910 result.append('J'); 911 index += 2; 912 } else { 913 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || 914 slavoGermanic && index > 0 && charAt(value, index - 1) != 'T') { 915 result.append("S", "TS"); 916 } else { 917 result.append('S'); 918 } 919 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 920 } 921 return index; 922 } 923 924 /** 925 * Check if the Double Metaphone values of two {@code String} values 926 * are equal. 927 * 928 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 929 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 930 * @return {@code true} if the encoded {@code String}s are equal; 931 * {@code false} otherwise. 932 * @see #isDoubleMetaphoneEqual(String,String,boolean) 933 */ 934 public boolean isDoubleMetaphoneEqual(final String value1, final String value2) { 935 return isDoubleMetaphoneEqual(value1, value2, false); 936 } 937 938 /** 939 * Check if the Double Metaphone values of two {@code String} values 940 * are equal, optionally using the alternate value. 941 * 942 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 943 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 944 * @param alternate use the alternate value if {@code true}. 945 * @return {@code true} if the encoded {@code String}s are equal; 946 * {@code false} otherwise. 947 */ 948 public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) { 949 return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate)); 950 } 951 952 /** 953 * Determines whether or not the value starts with a silent letter. It will 954 * return {@code true} if the value starts with any of 'GN', 'KN', 955 * 'PN', 'WR' or 'PS'. 956 */ 957 private boolean isSilentStart(final String value) { 958 boolean result = false; 959 for (final String element : SILENT_START) { 960 if (value.startsWith(element)) { 961 result = true; 962 break; 963 } 964 } 965 return result; 966 } 967 968 /** 969 * Determines whether or not a value is of slavo-germanic origin. A value is 970 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 971 */ 972 private boolean isSlavoGermanic(final String value) { 973 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 974 value.contains("CZ") || value.contains("WITZ"); 975 } 976 977 /** 978 * Determines whether or not a character is a vowel or not 979 */ 980 private boolean isVowel(final char ch) { 981 return VOWELS.indexOf(ch) != -1; 982 } 983 984 //-- BEGIN INNER CLASSES --// 985 986 /** 987 * Sets the maxCodeLen. 988 * @param maxCodeLen The maxCodeLen to set 989 */ 990 public void setMaxCodeLen(final int maxCodeLen) { 991 this.maxCodeLen = maxCodeLen; 992 } 993}