/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.Locale;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.binary.StringUtils;

/**
 * Encodes a string into a Double Metaphone value. This implementation is based on the algorithm by <CITE>Lawrence
 * Philips</CITE>.
 * <p>
 * This class is conditionally thread-safe. The instance field for the maximum code length is mutable
 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is
 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
 * </p>
 *
 * @see <a href="https://drdobbs.com/the-double-metaphone-search-algorithm/184401251?pgno=2">Dr. Dobbs Original Article</a>
 * @see <a href="https://en.wikipedia.org/wiki/Metaphone">Wikipedia Metaphone</a>
 * @see <a href="http://aspell.net/metaphone/dmetaph.cpp">Double Metaphone C++ source by Lawrence Philips</a>
 */
public class DoubleMetaphone implements StringEncoder {

    /**
     * Stores results, since there is the optional alternate encoding.
     * <p>
     * Both the primary and the alternate code are truncated to the maximum length given at construction time; anything
     * appended beyond that limit is silently dropped. A limit of zero or less therefore yields empty codes.
     * </p>
     * <p>
     * The class stays a (non-static) inner class only so that existing {@code encoder.new DoubleMetaphoneResult(n)}
     * call sites keep compiling; it does not read any state of the enclosing encoder.
     * </p>
     */
    public class DoubleMetaphoneResult {

        private final StringBuilder primary;
        private final StringBuilder alternate;
        private final int maxLength;

        /**
         * Constructs a new instance.
         *
         * @param maxLength The maximum length of the primary and of the alternate code.
         */
        public DoubleMetaphoneResult(final int maxLength) {
            this.maxLength = maxLength;
            // Pre-size from the limit this result actually enforces, but clamp it: the limit is caller supplied and
            // may be negative (StringBuilder would throw NegativeArraySizeException) or very large (needless or
            // impossible allocation). The builders grow on demand, so the clamp never restricts the result.
            final int capacity = Math.min(Math.max(maxLength, 0), MAX_INITIAL_CAPACITY);
            this.primary = new StringBuilder(capacity);
            this.alternate = new StringBuilder(capacity);
        }

        /**
         * Appends the given value as primary and alternative.
         *
         * @param value The value to append.
         */
        public void append(final char value) {
            appendPrimary(value);
            appendAlternate(value);
        }

        /**
         * Appends the given primary and alternative values.
         *
         * @param primary   The primary value.
         * @param alternate The alternate value.
         */
        public void append(final char primary, final char alternate) {
            appendPrimary(primary);
            appendAlternate(alternate);
        }

        /**
         * Appends the given value as primary and alternative.
         *
         * @param value The value to append.
         */
        public void append(final String value) {
            appendPrimary(value);
            appendAlternate(value);
        }

        /**
         * Appends the given primary and alternative values.
         *
         * @param primary   The primary value.
         * @param alternate The alternate value.
         */
        public void append(final String primary, final String alternate) {
            appendPrimary(primary);
            appendAlternate(alternate);
        }

        /**
         * Appends the given value as alternative, unless the alternate code already has the maximum length.
         *
         * @param value The value to append.
         */
        public void appendAlternate(final char value) {
            if (alternate.length() < maxLength) {
                alternate.append(value);
            }
        }

        /**
         * Appends the given value as alternative, keeping only as many leading characters as still fit.
         *
         * @param value The value to append.
         */
        public void appendAlternate(final String value) {
            appendLimited(alternate, value);
        }

        /**
         * Appends the given value as primary, unless the primary code already has the maximum length.
         *
         * @param value The value to append.
         */
        public void appendPrimary(final char value) {
            if (primary.length() < maxLength) {
                primary.append(value);
            }
        }

        /**
         * Appends the given value as primary, keeping only as many leading characters as still fit.
         *
         * @param value The value to append.
         */
        public void appendPrimary(final String value) {
            appendLimited(primary, value);
        }

        /**
         * Appends as much of {@code value} to {@code target} as the maximum length allows.
         *
         * @param target The buffer to append to.
         * @param value  The value to append.
         */
        private void appendLimited(final StringBuilder target, final String value) {
            final int room = maxLength - target.length();
            if (value.length() <= room) {
                target.append(value);
            } else if (room > 0) {
                target.append(value, 0, room);
            }
            // room <= 0: nothing fits. Mirrors the char overloads, which ignore input once the limit is reached,
            // instead of calling append(value, 0, room) with a negative end index.
        }

        /**
         * Gets the alternate string.
         *
         * @return the alternate string.
         */
        public String getAlternate() {
            return alternate.toString();
        }

        /**
         * Gets the primary string.
         *
         * @return the primary string.
         */
        public String getPrimary() {
            return primary.toString();
        }

        /**
         * Tests whether this result is complete, that is, whether both codes have reached the maximum length.
         *
         * @return whether this result is complete.
         */
        public boolean isComplete() {
            return primary.length() >= maxLength && alternate.length() >= maxLength;
        }
    }

    /**
     * "Vowels" to test.
     */
    private static final String VOWELS = "AEIOUY";

    /**
     * Prefixes when present which are not pronounced.
     */
    private static final String[] SILENT_START = { "GN", "KN", "PN", "WR", "PS" };

    private static final String[] L_R_N_M_B_H_F_V_W_SPACE = { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
    private static final String[] L_T_K_S_N_M_B_Z = { "L", "T", "K", "S", "N", "M", "B", "Z" };

    /**
     * Upper bound for the initial capacity of the result buffers; they grow on demand beyond it.
     */
    private static final int MAX_INITIAL_CAPACITY = 16;

    /**
     * Tests whether {@code value} contains any of the {@code criteria} starting at index {@code start} and matching up to length {@code length}.
     * <p>
     * If the requested window does not lie completely inside {@code value}, nothing matches.
     * </p>
     *
     * @param value    The value to test.
     * @param start    Where in {@code value} to start testing.
     * @param length   How many to test.
     * @param criteria The search criteria.
     * @return Whether there was a match.
     */
    protected static boolean contains(final String value, final int start, final int length, final String... criteria) {
        boolean result = false;
        if (start >= 0 && start + length <= value.length()) {
            final String target = value.substring(start, start + length);
            for (final String element : criteria) {
                if (target.equals(element)) {
                    result = true;
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Maximum length of an encoding, default is 4.
     */
    private int maxCodeLen = 4;

    /**
     * Constructs a new instance.
     */
    public DoubleMetaphone() {
        // empty
    }

    /**
     * Gets the character at index {@code index} if available, or {@link Character#MIN_VALUE} if out of bounds.
     *
     * @param value The String to query.
     * @param index A string index.
     * @return The character at the index or {@link Character#MIN_VALUE} if out of bounds.
     */
    protected char charAt(final String value, final int index) {
        if (index < 0 || index >= value.length()) {
            return Character.MIN_VALUE;
        }
        return value.charAt(index);
    }

    /**
     * Cleans the input: trims it and converts it to upper case.
     *
     * @param input The raw input, may be {@code null}.
     * @return The trimmed, upper-cased input, or {@code null} if the input is {@code null} or empty after trimming.
     */
    private String cleanInput(String input) {
        if (input == null) {
            return null;
        }
        input = input.trim();
        if (input.isEmpty()) {
            return null;
        }
        return input.toUpperCase(Locale.ENGLISH);
    }

    /**
     * Complex condition 0 for 'C'.
     *
     * @param value The cleaned input.
     * @param index The index of the 'C' being examined.
     * @return Whether the condition holds.
     */
    private boolean conditionC0(final String value, final int index) {
        if (contains(value, index, 4, "CHIA")) {
            return true;
        }
        if (index <= 1) {
            return false;
        }
        if (isVowel(charAt(value, index - 2))) {
            return false;
        }
        if (!contains(value, index - 1, 3, "ACH")) {
            return false;
        }
        final char c = charAt(value, index + 2);
        return c != 'I' && c != 'E' ||
                contains(value, index - 2, 6, "BACHER", "MACHER");
    }

    /**
     * Complex condition 0 for 'CH'.
     *
     * @param value The cleaned input.
     * @param index The index of the 'C' being examined.
     * @return Whether the condition holds.
     */
    private boolean conditionCH0(final String value, final int index) {
        if (index != 0) {
            return false;
        }
        if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
                !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
            return false;
        }
        return !contains(value, 0, 5, "CHORE");
    }

    /**
     * Complex condition 1 for 'CH'.
     *
     * @param value The cleaned input.
     * @param index The index of the 'C' being examined.
     * @return Whether the condition holds.
     */
    private boolean conditionCH1(final String value, final int index) {
        return contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH") ||
                contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
                contains(value, index + 2, 1, "T", "S") ||
                (contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
                (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1);
    }

    /**
     * Complex condition 0 for 'L'.
     *
     * @param value The cleaned input.
     * @param index The index of the first 'L' of the "LL" being examined.
     * @return Whether the condition holds.
     */
    private boolean conditionL0(final String value, final int index) {
        if (index == value.length() - 3 &&
                contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
            return true;
        }
        return (contains(value, value.length() - 2, 2, "AS", "OS") ||
                contains(value, value.length() - 1, 1, "A", "O")) &&
                contains(value, index - 1, 4, "ALLE");
    }

    /**
     * Complex condition 0 for 'M'.
     *
     * @param value The cleaned input.
     * @param index The index of the 'M' being examined.
     * @return Whether the character after the 'M' is to be skipped as well.
     */
    private boolean conditionM0(final String value, final int index) {
        if (charAt(value, index + 1) == 'M') {
            return true;
        }
        return contains(value, index - 1, 3, "UMB") &&
                (index + 1 == value.length() - 1 || contains(value, index + 2, 2, "ER"));
    }

    /**
     * Encodes a value with Double Metaphone.
     *
     * @param value String to encode.
     * @return an encoded string, or {@code null} if {@code value} is {@code null} or empty after trimming.
     */
    public String doubleMetaphone(final String value) {
        return doubleMetaphone(value, false);
    }

    /**
     * Encodes a value with Double Metaphone, optionally using the alternate encoding.
     *
     * @param value     String to encode.
     * @param alternate use alternate encode.
     * @return an encoded string, or {@code null} if {@code value} is {@code null} or empty after trimming.
     */
    public String doubleMetaphone(String value, final boolean alternate) {
        value = cleanInput(value);
        if (value == null) {
            return null;
        }

        final boolean slavoGermanic = isSlavoGermanic(value);
        // A silent leading letter is skipped outright.
        int index = isSilentStart(value) ? 1 : 0;

        final DoubleMetaphoneResult result = new DoubleMetaphoneResult(getMaxCodeLen());

        // Each case consumes one or more characters and returns/sets the index of the next one to examine.
        while (!result.isComplete() && index <= value.length() - 1) {
            switch (value.charAt(index)) {
            case 'A':
            case 'E':
            case 'I':
            case 'O':
            case 'U':
            case 'Y':
                index = handleAEIOUY(result, index);
                break;
            case 'B':
                result.append('P');
                index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
                break;
            case '\u00C7':
                // C with a Cedilla
                result.append('S');
                index++;
                break;
            case 'C':
                index = handleC(value, result, index);
                break;
            case 'D':
                index = handleD(value, result, index);
                break;
            case 'F':
                result.append('F');
                index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
                break;
            case 'G':
                index = handleG(value, result, index, slavoGermanic);
                break;
            case 'H':
                index = handleH(value, result, index);
                break;
            case 'J':
                index = handleJ(value, result, index, slavoGermanic);
                break;
            case 'K':
                result.append('K');
                index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
                break;
            case 'L':
                index = handleL(value, result, index);
                break;
            case 'M':
                result.append('M');
                index = conditionM0(value, index) ? index + 2 : index + 1;
                break;
            case 'N':
                result.append('N');
                index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
                break;
            case '\u00D1':
                // N with a tilde (Spanish ene)
                result.append('N');
                index++;
                break;
            case 'P':
                index = handleP(value, result, index);
                break;
            case 'Q':
                result.append('K');
                index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
                break;
            case 'R':
                index = handleR(value, result, index, slavoGermanic);
                break;
            case 'S':
                index = handleS(value, result, index, slavoGermanic);
                break;
            case 'T':
                index = handleT(value, result, index);
                break;
            case 'V':
                result.append('F');
                index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
                break;
            case 'W':
                index = handleW(value, result, index);
                break;
            case 'X':
                index = handleX(value, result, index);
                break;
            case 'Z':
                index = handleZ(value, result, index, slavoGermanic);
                break;
            default:
                // Any other character contributes nothing to the code.
                index++;
                break;
            }
        }

        return alternate ? result.getAlternate() : result.getPrimary();
    }

    /**
     * Encodes the value using DoubleMetaphone. It will only work if
     * {@code obj} is a {@code String} (like {@code Metaphone}).
     *
     * @param obj Object to encode (should be of type String).
     * @return An encoded Object (will be of type String).
     * @throws EncoderException encode parameter is not of type String.
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (!(obj instanceof String)) {
            throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
        }
        return doubleMetaphone((String) obj);
    }

    /**
     * Encodes the value using DoubleMetaphone.
     *
     * @param value String to encode.
     * @return An encoded String.
     */
    @Override
    public String encode(final String value) {
        return doubleMetaphone(value);
    }

    /**
     * Gets the maxCodeLen.
     *
     * @return the maxCodeLen.
     */
    public int getMaxCodeLen() {
        return maxCodeLen;
    }

    /**
     * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases: a vowel is encoded (as 'A') only at the start of the value.
     *
     * @param result The result to append to.
     * @param index  The index of the vowel.
     * @return The index of the next character to examine.
     */
    private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) {
        if (index == 0) {
            result.append('A');
        }
        return index + 1;
    }

    /**
     * Handles 'C' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'C'.
     * @return The index of the next character to examine.
     */
    private int handleC(final String value, final DoubleMetaphoneResult result, int index) {
        if (conditionC0(value, index)) { // very confusing, moved out
            result.append('K');
            index += 2;
        } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
            result.append('S');
            index += 2;
        } else if (contains(value, index, 2, "CH")) {
            index = handleCH(value, result, index);
        } else if (contains(value, index, 2, "CZ") &&
                !contains(value, index - 2, 4, "WICZ")) {
            //-- "Czerny" --//
            result.append('S', 'X');
            index += 2;
        } else if (contains(value, index + 1, 3, "CIA")) {
            //-- "focaccia" --//
            result.append('X');
            index += 3;
        } else if (contains(value, index, 2, "CC") &&
                !(index == 1 && charAt(value, 0) == 'M')) {
            //-- double "cc" but not "McClelland" --//
            return handleCC(value, result, index);
        } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
            result.append('K');
            index += 2;
        } else if (contains(value, index, 2, "CI", "CE", "CY")) {
            //-- Italian vs. English --//
            if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
                result.append('S', 'X');
            } else {
                result.append('S');
            }
            index += 2;
        } else {
            result.append('K');
            if (contains(value, index + 1, 2, " C", " Q", " G")) {
                //-- Mac Caffrey, Mac Gregor --//
                index += 3;
            } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
                    !contains(value, index + 1, 2, "CE", "CI")) {
                index += 2;
            } else {
                index++;
            }
        }

        return index;
    }

    /**
     * Handles 'CC' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the first 'C'.
     * @return The index of the next character to examine.
     */
    private int handleCC(final String value, final DoubleMetaphoneResult result, int index) {
        if (contains(value, index + 2, 1, "I", "E", "H") &&
                !contains(value, index + 2, 2, "HU")) {
            //-- "bellocchio" but not "bacchus" --//
            if (index == 1 && charAt(value, index - 1) == 'A' ||
                    contains(value, index - 1, 5, "UCCEE", "UCCES")) {
                //-- "accident", "accede", "succeed" --//
                result.append("KS");
            } else {
                //-- "bacci", "bertucci", other Italian --//
                result.append('X');
            }
            index += 3;
        } else { // Pierce's rule
            result.append('K');
            index += 2;
        }

        return index;
    }

    /**
     * Handles 'CH' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'C'.
     * @return The index of the next character to examine.
     */
    private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) {
        if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
            result.append('K', 'X');
            return index + 2;
        }
        if (conditionCH0(value, index)) {
            //-- Greek roots ("chemistry", "chorus", etc.) --//
            result.append('K');
            return index + 2;
        }
        if (conditionCH1(value, index)) {
            //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
            result.append('K');
            return index + 2;
        }
        if (index > 0) {
            if (contains(value, 0, 2, "MC")) {
                result.append('K');
            } else {
                result.append('X', 'K');
            }
        } else {
            result.append('X');
        }
        return index + 2;
    }

    /**
     * Handles 'D' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'D'.
     * @return The index of the next character to examine.
     */
    private int handleD(final String value, final DoubleMetaphoneResult result, int index) {
        if (contains(value, index, 2, "DG")) {
            if (contains(value, index + 2, 1, "I", "E", "Y")) {
                //-- "Edge" --//
                result.append('J');
                index += 3;
            } else {
                //-- "Edgar" --//
                result.append("TK");
                index += 2;
            }
        } else if (contains(value, index, 2, "DT", "DD")) {
            result.append('T');
            index += 2;
        } else {
            result.append('T');
            index++;
        }
        return index;
    }

    /**
     * Handles 'G' cases.
     *
     * @param value         The cleaned input.
     * @param result        The result to append to.
     * @param index         The index of the 'G'.
     * @param slavoGermanic Whether the value was classified as Slavo-Germanic.
     * @return The index of the next character to examine.
     */
    private int handleG(final String value, final DoubleMetaphoneResult result, int index, final boolean slavoGermanic) {
        if (charAt(value, index + 1) == 'H') {
            index = handleGH(value, result, index);
        } else if (charAt(value, index + 1) == 'N') {
            if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
                result.append("KN", "N");
            } else if (!contains(value, index + 2, 2, "EY") &&
                    // NOTE: always true here, because this branch is only entered when that character is 'N'.
                    // Retained as written so that the produced encodings do not change.
                    charAt(value, index + 1) != 'Y' && !slavoGermanic) {
                result.append("N", "KN");
            } else {
                result.append("KN");
            }
            index += 2;
        } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
            result.append("KL", "L");
            index += 2;
        } else if (index == 0 &&
                (charAt(value, index + 1) == 'Y' ||
                        contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
            //-- -ges-, -gep-, -gel-, -gie- at beginning --//
            result.append('K', 'J');
            index += 2;
        } else if ((contains(value, index + 1, 2, "ER") ||
                charAt(value, index + 1) == 'Y') &&
                !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
                !contains(value, index - 1, 1, "E", "I") &&
                !contains(value, index - 1, 3, "RGY", "OGY")) {
            //-- -ger-, -gy- --//
            result.append('K', 'J');
            index += 2;
        } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
                contains(value, index - 1, 4, "AGGI", "OGGI")) {
            //-- Italian "biaggi" --//
            if (contains(value, 0, 4, "VAN ", "VON ") ||
                    contains(value, 0, 3, "SCH") ||
                    contains(value, index + 1, 2, "ET")) {
                //-- obvious germanic --//
                result.append('K');
            } else if (contains(value, index + 1, 3, "IER")) {
                result.append('J');
            } else {
                result.append('J', 'K');
            }
            index += 2;
        } else {
            if (charAt(value, index + 1) == 'G') {
                index += 2;
            } else {
                index++;
            }
            result.append('K');
        }
        return index;
    }

    /**
     * Handles 'GH' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'G'.
     * @return The index of the next character to examine.
     */
    private int handleGH(final String value, final DoubleMetaphoneResult result, int index) {
        if (index > 0 && !isVowel(charAt(value, index - 1))) {
            result.append('K');
            index += 2;
        } else if (index == 0) {
            if (charAt(value, index + 2) == 'I') {
                result.append('J');
            } else {
                result.append('K');
            }
            index += 2;
        } else if (index > 1 && contains(value, index - 2, 1, "B", "H", "D") ||
                index > 2 && contains(value, index - 3, 1, "B", "H", "D") ||
                index > 3 && contains(value, index - 4, 1, "B", "H")) {
            //-- Parker's rule (with some further refinements) - "hugh"
            index += 2;
        } else {
            if (index > 2 && charAt(value, index - 1) == 'U' &&
                    contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
                //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
                result.append('F');
            } else if (index > 0 && charAt(value, index - 1) != 'I') {
                result.append('K');
            }
            index += 2;
        }
        return index;
    }

    /**
     * Handles 'H' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'H'.
     * @return The index of the next character to examine.
     */
    private int handleH(final String value, final DoubleMetaphoneResult result, int index) {
        //-- only keep if first & before vowel or between 2 vowels --//
        if ((index == 0 || isVowel(charAt(value, index - 1))) &&
                isVowel(charAt(value, index + 1))) {
            result.append('H');
            index += 2;
        } else {
            //-- also takes care of "HH" --//
            index++;
        }
        return index;
    }

    /**
     * Handles 'J' cases.
     *
     * @param value         The cleaned input.
     * @param result        The result to append to.
     * @param index         The index of the 'J'.
     * @param slavoGermanic Whether the value was classified as Slavo-Germanic.
     * @return The index of the next character to examine.
     */
    private int handleJ(final String value, final DoubleMetaphoneResult result, int index, final boolean slavoGermanic) {
        if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
            //-- obvious Spanish, "Jose", "San Jacinto" --//
            if (index == 0 && charAt(value, index + 4) == ' ' ||
                    value.length() == 4 || contains(value, 0, 4, "SAN ")) {
                result.append('H');
            } else {
                result.append('J', 'H');
            }
            index++;
        } else {
            if (index == 0 && !contains(value, index, 4, "JOSE")) {
                result.append('J', 'A');
            } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
                    (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
                result.append('J', 'H');
            } else if (index == value.length() - 1) {
                result.append('J', ' ');
            } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
                    !contains(value, index - 1, 1, "S", "K", "L")) {
                result.append('J');
            }

            if (charAt(value, index + 1) == 'J') {
                index += 2;
            } else {
                index++;
            }
        }
        return index;
    }

    /**
     * Handles 'L' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'L'.
     * @return The index of the next character to examine.
     */
    private int handleL(final String value, final DoubleMetaphoneResult result, int index) {
        if (charAt(value, index + 1) == 'L') {
            if (conditionL0(value, index)) {
                // Only the primary code receives the 'L' in this case.
                result.appendPrimary('L');
            } else {
                result.append('L');
            }
            index += 2;
        } else {
            index++;
            result.append('L');
        }
        return index;
    }

    /**
     * Handles 'P' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'P'.
     * @return The index of the next character to examine.
     */
    private int handleP(final String value, final DoubleMetaphoneResult result, int index) {
        if (charAt(value, index + 1) == 'H') {
            result.append('F');
            index += 2;
        } else {
            result.append('P');
            index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'R' cases.
     *
     * @param value         The cleaned input.
     * @param result        The result to append to.
     * @param index         The index of the 'R'.
     * @param slavoGermanic Whether the value was classified as Slavo-Germanic.
     * @return The index of the next character to examine.
     */
    private int handleR(final String value, final DoubleMetaphoneResult result, final int index, final boolean slavoGermanic) {
        if (index == value.length() - 1 && !slavoGermanic &&
                contains(value, index - 2, 2, "IE") &&
                !contains(value, index - 4, 2, "ME", "MA")) {
            // Only the alternate code receives the 'R' in this case.
            result.appendAlternate('R');
        } else {
            result.append('R');
        }
        return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
    }

    /**
     * Handles 'S' cases.
     *
     * @param value         The cleaned input.
     * @param result        The result to append to.
     * @param index         The index of the 'S'.
     * @param slavoGermanic Whether the value was classified as Slavo-Germanic.
     * @return The index of the next character to examine.
     */
    private int handleS(final String value, final DoubleMetaphoneResult result, int index, final boolean slavoGermanic) {
        if (contains(value, index - 1, 3, "ISL", "YSL")) {
            //-- special cases "island", "isle", "carlisle", "carlysle" --//
            index++;
        } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
            //-- special case "sugar-" --//
            result.append('X', 'S');
            index++;
        } else if (contains(value, index, 2, "SH")) {
            if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
                //-- germanic --//
                result.append('S');
            } else {
                result.append('X');
            }
            index += 2;
        } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
            //-- Italian and Armenian --//
            if (slavoGermanic) {
                result.append('S');
            } else {
                result.append('S', 'X');
            }
            index += 3;
        } else if (index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W") ||
                contains(value, index + 1, 1, "Z")) {
            //-- german & anglicisations, for example "smith" match "schmidt" //
            // "snider" match "schneider" --//
            //-- also, -sz- in slavic language although in hungarian it //
            // is pronounced "s" --//
            result.append('S', 'X');
            index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
        } else if (contains(value, index, 2, "SC")) {
            index = handleSC(value, result, index);
        } else {
            if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
                //-- french for example "resnais", "artois" --//
                result.appendAlternate('S');
            } else {
                result.append('S');
            }
            index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'SC' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'S'.
     * @return The index of the next character to examine.
     */
    private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) {
        if (charAt(value, index + 2) == 'H') {
            //-- Schlesinger's rule --//
            if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
                //-- Dutch origin, for example "school", "schooner" --//
                if (contains(value, index + 3, 2, "ER", "EN")) {
                    //-- "schermerhorn", "schenker" --//
                    result.append("X", "SK");
                } else {
                    result.append("SK");
                }
            } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
                result.append('X', 'S');
            } else {
                result.append('X');
            }
        } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
            result.append('S');
        } else {
            result.append("SK");
        }
        return index + 3;
    }

    /**
     * Handles 'T' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'T'.
     * @return The index of the next character to examine.
     */
    private int handleT(final String value, final DoubleMetaphoneResult result, int index) {
        if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) {
            result.append('X');
            index += 3;
        } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
            if (contains(value, index + 2, 2, "OM", "AM") ||
                    //-- special case "thomas", "thames" or germanic --//
                    contains(value, 0, 4, "VAN ", "VON ") ||
                    contains(value, 0, 3, "SCH")) {
                result.append('T');
            } else {
                result.append('0', 'T');
            }
            index += 2;
        } else {
            result.append('T');
            index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'W' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'W'.
     * @return The index of the next character to examine.
     */
    private int handleW(final String value, final DoubleMetaphoneResult result, int index) {
        if (contains(value, index, 2, "WR")) {
            //-- can also be in middle of word --//
            result.append('R');
            index += 2;
        } else if (index == 0 && (isVowel(charAt(value, index + 1)) ||
                contains(value, index, 2, "WH"))) {
            if (isVowel(charAt(value, index + 1))) {
                //-- Wasserman should match Vasserman --//
                result.append('A', 'F');
            } else {
                //-- need Uomo to match Womo --//
                result.append('A');
            }
            index++;
        } else if (index == value.length() - 1 && isVowel(charAt(value, index - 1)) ||
                contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
                contains(value, 0, 3, "SCH")) {
            //-- Arnow should match Arnoff --//
            result.appendAlternate('F');
            index++;
        } else if (contains(value, index, 4, "WICZ", "WITZ")) {
            //-- Polish for example "filipowicz" --//
            result.append("TS", "FX");
            index += 4;
        } else {
            index++;
        }
        return index;
    }

    /**
     * Handles 'X' cases.
     *
     * @param value  The cleaned input.
     * @param result The result to append to.
     * @param index  The index of the 'X'.
     * @return The index of the next character to examine.
     */
    private int handleX(final String value, final DoubleMetaphoneResult result, int index) {
        if (index == 0) {
            result.append('S');
            index++;
        } else {
            if (!(index == value.length() - 1 &&
                    (contains(value, index - 3, 3, "IAU", "EAU") ||
                            contains(value, index - 2, 2, "AU", "OU")))) {
                //-- French for example breaux --//
                result.append("KS");
            }
            index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'Z' cases.
     *
     * @param value         The cleaned input.
     * @param result        The result to append to.
     * @param index         The index of the 'Z'.
     * @param slavoGermanic Whether the value was classified as Slavo-Germanic.
     * @return The index of the next character to examine.
     */
    private int handleZ(final String value, final DoubleMetaphoneResult result, int index, final boolean slavoGermanic) {
        if (charAt(value, index + 1) == 'H') {
            //-- Chinese pinyin for example "zhao" or Angelina "Zhang" --//
            result.append('J');
            index += 2;
        } else {
            if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
                    slavoGermanic && index > 0 && charAt(value, index - 1) != 'T') {
                result.append("S", "TS");
            } else {
                result.append('S');
            }
            index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Tests whether the Double Metaphone values of two {@code String} values
     * are equal.
     *
     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
     * @return {@code true} if the encoded {@code String}s are equal;
     *         {@code false} otherwise.
     * @see #isDoubleMetaphoneEqual(String,String,boolean)
     */
    public boolean isDoubleMetaphoneEqual(final String value1, final String value2) {
        return isDoubleMetaphoneEqual(value1, value2, false);
    }

    /**
     * Tests whether the Double Metaphone values of two {@code String} values
     * are equal, optionally using the alternate value.
     *
     * @param value1    The left-hand side of the encoded {@link String#equals(Object)}.
     * @param value2    The right-hand side of the encoded {@link String#equals(Object)}.
     * @param alternate use the alternate value if {@code true}.
     * @return {@code true} if the encoded {@code String}s are equal;
     *         {@code false} otherwise.
     */
    public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) {
        return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate));
    }

    /**
     * Tests whether or not the value starts with a silent letter. It will
     * return {@code true} if the value starts with any of 'GN', 'KN',
     * 'PN', 'WR' or 'PS'.
     *
     * @param value The cleaned input.
     * @return Whether the value starts with one of the silent prefixes.
     */
    private boolean isSilentStart(final String value) {
        for (final String element : SILENT_START) {
            if (value.startsWith(element)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether or not a value is of slavo-germanic origin. A value is
     * of Slavo-Germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'.
     *
     * @param value The cleaned input.
     * @return Whether the value is classified as Slavo-Germanic.
     */
    private boolean isSlavoGermanic(final String value) {
        return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
                value.contains("CZ") || value.contains("WITZ");
    }

    /**
     * Tests whether or not a character is a vowel or not.
     *
     * @param ch The character to test.
     * @return Whether {@code ch} is one of 'A', 'E', 'I', 'O', 'U' or 'Y'.
     */
    private boolean isVowel(final char ch) {
        return VOWELS.indexOf(ch) != -1;
    }

    /**
     * Sets the maxCodeLen.
     * <p>
     * The value is not validated. With a value of zero or less, encoding a non-empty input yields an empty string.
     * </p>
     *
     * @param maxCodeLen The maxCodeLen to set.
     */
    public void setMaxCodeLen(final int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }
}