001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language; 019 020 import org.apache.commons.codec.EncoderException; 021 import org.apache.commons.codec.StringEncoder; 022 023 /** 024 * Encodes a string into a double metaphone value. This Implementation is based on the algorithm by <CITE>Lawrence 025 * Philips</CITE>. 026 * <p> 027 * This class is conditionally thread-safe. The instance field {@link #maxCodeLen} is mutable 028 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is 029 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication 030 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup. 031 * 032 * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a> 033 * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a> 034 * 035 * @version $Id: DoubleMetaphone.DoubleMetaphoneResult.html 889935 2013-12-11 05:05:13Z ggregory $ 036 */ 037 public class DoubleMetaphone implements StringEncoder { 038 039 /** 040 * "Vowels" to test for 041 */ 042 private static final String VOWELS = "AEIOUY"; 043 044 /** 045 * Prefixes when present which are not pronounced 046 */ 047 private static final String[] SILENT_START = 048 { "GN", "KN", "PN", "WR", "PS" }; 049 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 050 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 051 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 052 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 053 private static final String[] L_T_K_S_N_M_B_Z = 054 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 055 056 /** 057 * Maximum length of an encoding, default is 4 058 */ 059 private int maxCodeLen = 4; 060 061 /** 062 * Creates an instance of this DoubleMetaphone encoder 063 */ 064 public DoubleMetaphone() { 065 super(); 066 } 067 068 /** 069 * Encode a value with Double Metaphone. 070 * 071 * @param value String to encode 072 * @return an encoded string 073 */ 074 public String doubleMetaphone(String value) { 075 return doubleMetaphone(value, false); 076 } 077 078 /** 079 * Encode a value with Double Metaphone, optionally using the alternate encoding. 080 * 081 * @param value String to encode 082 * @param alternate use alternate encode 083 * @return an encoded string 084 */ 085 public String doubleMetaphone(String value, boolean alternate) { 086 value = cleanInput(value); 087 if (value == null) { 088 return null; 089 } 090 091 boolean slavoGermanic = isSlavoGermanic(value); 092 int index = isSilentStart(value) ? 1 : 0; 093 094 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 095 096 while (!result.isComplete() && index <= value.length() - 1) { 097 switch (value.charAt(index)) { 098 case 'A': 099 case 'E': 100 case 'I': 101 case 'O': 102 case 'U': 103 case 'Y': 104 index = handleAEIOUY(result, index); 105 break; 106 case 'B': 107 result.append('P'); 108 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 109 break; 110 case '\u00C7': 111 // A C with a Cedilla 112 result.append('S'); 113 index++; 114 break; 115 case 'C': 116 index = handleC(value, result, index); 117 break; 118 case 'D': 119 index = handleD(value, result, index); 120 break; 121 case 'F': 122 result.append('F'); 123 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 124 break; 125 case 'G': 126 index = handleG(value, result, index, slavoGermanic); 127 break; 128 case 'H': 129 index = handleH(value, result, index); 130 break; 131 case 'J': 132 index = handleJ(value, result, index, slavoGermanic); 133 break; 134 case 'K': 135 result.append('K'); 136 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 137 break; 138 case 'L': 139 index = handleL(value, result, index); 140 break; 141 case 'M': 142 result.append('M'); 143 index = conditionM0(value, index) ? index + 2 : index + 1; 144 break; 145 case 'N': 146 result.append('N'); 147 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 148 break; 149 case '\u00D1': 150 // N with a tilde (spanish ene) 151 result.append('N'); 152 index++; 153 break; 154 case 'P': 155 index = handleP(value, result, index); 156 break; 157 case 'Q': 158 result.append('K'); 159 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 160 break; 161 case 'R': 162 index = handleR(value, result, index, slavoGermanic); 163 break; 164 case 'S': 165 index = handleS(value, result, index, slavoGermanic); 166 break; 167 case 'T': 168 index = handleT(value, result, index); 169 break; 170 case 'V': 171 result.append('F'); 172 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 173 break; 174 case 'W': 175 index = handleW(value, result, index); 176 break; 177 case 'X': 178 index = handleX(value, result, index); 179 break; 180 case 'Z': 181 index = handleZ(value, result, index, slavoGermanic); 182 break; 183 default: 184 index++; 185 break; 186 } 187 } 188 189 return alternate ? result.getAlternate() : result.getPrimary(); 190 } 191 192 /** 193 * Encode the value using DoubleMetaphone. It will only work if 194 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). 195 * 196 * @param obj Object to encode (should be of type String) 197 * @return An encoded Object (will be of type String) 198 * @throws EncoderException encode parameter is not of type String 199 */ 200 @Override 201 public Object encode(Object obj) throws EncoderException { 202 if (!(obj instanceof String)) { 203 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 204 } 205 return doubleMetaphone((String) obj); 206 } 207 208 /** 209 * Encode the value using DoubleMetaphone. 210 * 211 * @param value String to encode 212 * @return An encoded String 213 */ 214 @Override 215 public String encode(String value) { 216 return doubleMetaphone(value); 217 } 218 219 /** 220 * Check if the Double Metaphone values of two <code>String</code> values 221 * are equal. 222 * 223 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 224 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 225 * @return {@code true} if the encoded <code>String</code>s are equal; 226 * {@code false} otherwise. 227 * @see #isDoubleMetaphoneEqual(String,String,boolean) 228 */ 229 public boolean isDoubleMetaphoneEqual(String value1, String value2) { 230 return isDoubleMetaphoneEqual(value1, value2, false); 231 } 232 233 /** 234 * Check if the Double Metaphone values of two <code>String</code> values 235 * are equal, optionally using the alternate value. 236 * 237 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 238 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 239 * @param alternate use the alternate value if {@code true}. 240 * @return {@code true} if the encoded <code>String</code>s are equal; 241 * {@code false} otherwise. 242 */ 243 public boolean isDoubleMetaphoneEqual(String value1, String value2, boolean alternate) { 244 return doubleMetaphone(value1, alternate).equals(doubleMetaphone(value2, alternate)); 245 } 246 247 /** 248 * Returns the maxCodeLen. 249 * @return int 250 */ 251 public int getMaxCodeLen() { 252 return this.maxCodeLen; 253 } 254 255 /** 256 * Sets the maxCodeLen. 257 * @param maxCodeLen The maxCodeLen to set 258 */ 259 public void setMaxCodeLen(int maxCodeLen) { 260 this.maxCodeLen = maxCodeLen; 261 } 262 263 //-- BEGIN HANDLERS --// 264 265 /** 266 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases. 267 */ 268 private int handleAEIOUY(DoubleMetaphoneResult result, int index) { 269 if (index == 0) { 270 result.append('A'); 271 } 272 return index + 1; 273 } 274 275 /** 276 * Handles 'C' cases. 277 */ 278 private int handleC(String value, DoubleMetaphoneResult result, int index) { 279 if (conditionC0(value, index)) { // very confusing, moved out 280 result.append('K'); 281 index += 2; 282 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 283 result.append('S'); 284 index += 2; 285 } else if (contains(value, index, 2, "CH")) { 286 index = handleCH(value, result, index); 287 } else if (contains(value, index, 2, "CZ") && 288 !contains(value, index - 2, 4, "WICZ")) { 289 //-- "Czerny" --// 290 result.append('S', 'X'); 291 index += 2; 292 } else if (contains(value, index + 1, 3, "CIA")) { 293 //-- "focaccia" --// 294 result.append('X'); 295 index += 3; 296 } else if (contains(value, index, 2, "CC") && 297 !(index == 1 && charAt(value, 0) == 'M')) { 298 //-- double "cc" but not "McClelland" --// 299 return handleCC(value, result, index); 300 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 301 result.append('K'); 302 index += 2; 303 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 304 //-- Italian vs. English --// 305 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 306 result.append('S', 'X'); 307 } else { 308 result.append('S'); 309 } 310 index += 2; 311 } else { 312 result.append('K'); 313 if (contains(value, index + 1, 2, " C", " Q", " G")) { 314 //-- Mac Caffrey, Mac Gregor --// 315 index += 3; 316 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 317 !contains(value, index + 1, 2, "CE", "CI")) { 318 index += 2; 319 } else { 320 index++; 321 } 322 } 323 324 return index; 325 } 326 327 /** 328 * Handles 'CC' cases. 329 */ 330 private int handleCC(String value, DoubleMetaphoneResult result, int index) { 331 if (contains(value, index + 2, 1, "I", "E", "H") && 332 !contains(value, index + 2, 2, "HU")) { 333 //-- "bellocchio" but not "bacchus" --// 334 if ((index == 1 && charAt(value, index - 1) == 'A') || 335 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 336 //-- "accident", "accede", "succeed" --// 337 result.append("KS"); 338 } else { 339 //-- "bacci", "bertucci", other Italian --// 340 result.append('X'); 341 } 342 index += 3; 343 } else { // Pierce's rule 344 result.append('K'); 345 index += 2; 346 } 347 348 return index; 349 } 350 351 /** 352 * Handles 'CH' cases. 353 */ 354 private int handleCH(String value, DoubleMetaphoneResult result, int index) { 355 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 356 result.append('K', 'X'); 357 return index + 2; 358 } else if (conditionCH0(value, index)) { 359 //-- Greek roots ("chemistry", "chorus", etc.) --// 360 result.append('K'); 361 return index + 2; 362 } else if (conditionCH1(value, index)) { 363 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 364 result.append('K'); 365 return index + 2; 366 } else { 367 if (index > 0) { 368 if (contains(value, 0, 2, "MC")) { 369 result.append('K'); 370 } else { 371 result.append('X', 'K'); 372 } 373 } else { 374 result.append('X'); 375 } 376 return index + 2; 377 } 378 } 379 380 /** 381 * Handles 'D' cases. 382 */ 383 private int handleD(String value, DoubleMetaphoneResult result, int index) { 384 if (contains(value, index, 2, "DG")) { 385 //-- "Edge" --// 386 if (contains(value, index + 2, 1, "I", "E", "Y")) { 387 result.append('J'); 388 index += 3; 389 //-- "Edgar" --// 390 } else { 391 result.append("TK"); 392 index += 2; 393 } 394 } else if (contains(value, index, 2, "DT", "DD")) { 395 result.append('T'); 396 index += 2; 397 } else { 398 result.append('T'); 399 index++; 400 } 401 return index; 402 } 403 404 /** 405 * Handles 'G' cases. 406 */ 407 private int handleG(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) { 408 if (charAt(value, index + 1) == 'H') { 409 index = handleGH(value, result, index); 410 } else if (charAt(value, index + 1) == 'N') { 411 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 412 result.append("KN", "N"); 413 } else if (!contains(value, index + 2, 2, "EY") && 414 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 415 result.append("N", "KN"); 416 } else { 417 result.append("KN"); 418 } 419 index = index + 2; 420 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 421 result.append("KL", "L"); 422 index += 2; 423 } else if (index == 0 && 424 (charAt(value, index + 1) == 'Y' || 425 contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 426 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 427 result.append('K', 'J'); 428 index += 2; 429 } else if ((contains(value, index + 1, 2, "ER") || 430 charAt(value, index + 1) == 'Y') && 431 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 432 !contains(value, index - 1, 1, "E", "I") && 433 !contains(value, index - 1, 3, "RGY", "OGY")) { 434 //-- -ger-, -gy- --// 435 result.append('K', 'J'); 436 index += 2; 437 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 438 contains(value, index - 1, 4, "AGGI", "OGGI")) { 439 //-- Italian "biaggi" --// 440 if (contains(value, 0 ,4, "VAN ", "VON ") || 441 contains(value, 0, 3, "SCH") || 442 contains(value, index + 1, 2, "ET")) { 443 //-- obvious germanic --// 444 result.append('K'); 445 } else if (contains(value, index + 1, 3, "IER")) { 446 result.append('J'); 447 } else { 448 result.append('J', 'K'); 449 } 450 index += 2; 451 } else if (charAt(value, index + 1) == 'G') { 452 index += 2; 453 result.append('K'); 454 } else { 455 index++; 456 result.append('K'); 457 } 458 return index; 459 } 460 461 /** 462 * Handles 'GH' cases. 463 */ 464 private int handleGH(String value, DoubleMetaphoneResult result, int index) { 465 if (index > 0 && !isVowel(charAt(value, index - 1))) { 466 result.append('K'); 467 index += 2; 468 } else if (index == 0) { 469 if (charAt(value, index + 2) == 'I') { 470 result.append('J'); 471 } else { 472 result.append('K'); 473 } 474 index += 2; 475 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 476 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 477 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 478 //-- Parker's rule (with some further refinements) - "hugh" 479 index += 2; 480 } else { 481 if (index > 2 && charAt(value, index - 1) == 'U' && 482 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 483 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 484 result.append('F'); 485 } else if (index > 0 && charAt(value, index - 1) != 'I') { 486 result.append('K'); 487 } 488 index += 2; 489 } 490 return index; 491 } 492 493 /** 494 * Handles 'H' cases. 495 */ 496 private int handleH(String value, DoubleMetaphoneResult result, int index) { 497 //-- only keep if first & before vowel or between 2 vowels --// 498 if ((index == 0 || isVowel(charAt(value, index - 1))) && 499 isVowel(charAt(value, index + 1))) { 500 result.append('H'); 501 index += 2; 502 //-- also takes car of "HH" --// 503 } else { 504 index++; 505 } 506 return index; 507 } 508 509 /** 510 * Handles 'J' cases. 511 */ 512 private int handleJ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) { 513 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 514 //-- obvious Spanish, "Jose", "San Jacinto" --// 515 if ((index == 0 && (charAt(value, index + 4) == ' ') || 516 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 517 result.append('H'); 518 } else { 519 result.append('J', 'H'); 520 } 521 index++; 522 } else { 523 if (index == 0 && !contains(value, index, 4, "JOSE")) { 524 result.append('J', 'A'); 525 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 526 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 527 result.append('J', 'H'); 528 } else if (index == value.length() - 1) { 529 result.append('J', ' '); 530 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && 531 !contains(value, index - 1, 1, "S", "K", "L")) { 532 result.append('J'); 533 } 534 535 if (charAt(value, index + 1) == 'J') { 536 index += 2; 537 } else { 538 index++; 539 } 540 } 541 return index; 542 } 543 544 /** 545 * Handles 'L' cases. 546 */ 547 private int handleL(String value, DoubleMetaphoneResult result, int index) { 548 if (charAt(value, index + 1) == 'L') { 549 if (conditionL0(value, index)) { 550 result.appendPrimary('L'); 551 } else { 552 result.append('L'); 553 } 554 index += 2; 555 } else { 556 index++; 557 result.append('L'); 558 } 559 return index; 560 } 561 562 /** 563 * Handles 'P' cases. 564 */ 565 private int handleP(String value, DoubleMetaphoneResult result, int index) { 566 if (charAt(value, index + 1) == 'H') { 567 result.append('F'); 568 index += 2; 569 } else { 570 result.append('P'); 571 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 572 } 573 return index; 574 } 575 576 /** 577 * Handles 'R' cases. 578 */ 579 private int handleR(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) { 580 if (index == value.length() - 1 && !slavoGermanic && 581 contains(value, index - 2, 2, "IE") && 582 !contains(value, index - 4, 2, "ME", "MA")) { 583 result.appendAlternate('R'); 584 } else { 585 result.append('R'); 586 } 587 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 588 } 589 590 /** 591 * Handles 'S' cases. 592 */ 593 private int handleS(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) { 594 if (contains(value, index - 1, 3, "ISL", "YSL")) { 595 //-- special cases "island", "isle", "carlisle", "carlysle" --// 596 index++; 597 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 598 //-- special case "sugar-" --// 599 result.append('X', 'S'); 600 index++; 601 } else if (contains(value, index, 2, "SH")) { 602 if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { 603 //-- germanic --// 604 result.append('S'); 605 } else { 606 result.append('X'); 607 } 608 index += 2; 609 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 610 //-- Italian and Armenian --// 611 if (slavoGermanic) { 612 result.append('S'); 613 } else { 614 result.append('S', 'X'); 615 } 616 index += 3; 617 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || 618 contains(value, index + 1, 1, "Z")) { 619 //-- german & anglicisations, e.g. "smith" match "schmidt" // 620 // "snider" match "schneider" --// 621 //-- also, -sz- in slavic language altho in hungarian it // 622 // is pronounced "s" --// 623 result.append('S', 'X'); 624 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 625 } else if (contains(value, index, 2, "SC")) { 626 index = handleSC(value, result, index); 627 } else { 628 if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) { 629 //-- french e.g. "resnais", "artois" --// 630 result.appendAlternate('S'); 631 } else { 632 result.append('S'); 633 } 634 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 635 } 636 return index; 637 } 638 639 /** 640 * Handles 'SC' cases. 641 */ 642 private int handleSC(String value, DoubleMetaphoneResult result, int index) { 643 if (charAt(value, index + 2) == 'H') { 644 //-- Schlesinger's rule --// 645 if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 646 //-- Dutch origin, e.g. "school", "schooner" --// 647 if (contains(value, index + 3, 2, "ER", "EN")) { 648 //-- "schermerhorn", "schenker" --// 649 result.append("X", "SK"); 650 } else { 651 result.append("SK"); 652 } 653 } else { 654 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 655 result.append('X', 'S'); 656 } else { 657 result.append('X'); 658 } 659 } 660 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 661 result.append('S'); 662 } else { 663 result.append("SK"); 664 } 665 return index + 3; 666 } 667 668 /** 669 * Handles 'T' cases. 670 */ 671 private int handleT(String value, DoubleMetaphoneResult result, int index) { 672 if (contains(value, index, 4, "TION")) { 673 result.append('X'); 674 index += 3; 675 } else if (contains(value, index, 3, "TIA", "TCH")) { 676 result.append('X'); 677 index += 3; 678 } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) { 679 if (contains(value, index + 2, 2, "OM", "AM") || 680 //-- special case "thomas", "thames" or germanic --// 681 contains(value, 0, 4, "VAN ", "VON ") || 682 contains(value, 0, 3, "SCH")) { 683 result.append('T'); 684 } else { 685 result.append('0', 'T'); 686 } 687 index += 2; 688 } else { 689 result.append('T'); 690 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 691 } 692 return index; 693 } 694 695 /** 696 * Handles 'W' cases. 697 */ 698 private int handleW(String value, DoubleMetaphoneResult result, int index) { 699 if (contains(value, index, 2, "WR")) { 700 //-- can also be in middle of word --// 701 result.append('R'); 702 index += 2; 703 } else { 704 if (index == 0 && (isVowel(charAt(value, index + 1)) || 705 contains(value, index, 2, "WH"))) { 706 if (isVowel(charAt(value, index + 1))) { 707 //-- Wasserman should match Vasserman --// 708 result.append('A', 'F'); 709 } else { 710 //-- need Uomo to match Womo --// 711 result.append('A'); 712 } 713 index++; 714 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 715 contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 716 contains(value, 0, 3, "SCH")) { 717 //-- Arnow should match Arnoff --// 718 result.appendAlternate('F'); 719 index++; 720 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 721 //-- Polish e.g. "filipowicz" --// 722 result.append("TS", "FX"); 723 index += 4; 724 } else { 725 index++; 726 } 727 } 728 return index; 729 } 730 731 /** 732 * Handles 'X' cases. 733 */ 734 private int handleX(String value, DoubleMetaphoneResult result, int index) { 735 if (index == 0) { 736 result.append('S'); 737 index++; 738 } else { 739 if (!((index == value.length() - 1) && 740 (contains(value, index - 3, 3, "IAU", "EAU") || 741 contains(value, index - 2, 2, "AU", "OU")))) { 742 //-- French e.g. breaux --// 743 result.append("KS"); 744 } 745 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 746 } 747 return index; 748 } 749 750 /** 751 * Handles 'Z' cases. 752 */ 753 private int handleZ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) { 754 if (charAt(value, index + 1) == 'H') { 755 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 756 result.append('J'); 757 index += 2; 758 } else { 759 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || 760 (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 761 result.append("S", "TS"); 762 } else { 763 result.append('S'); 764 } 765 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 766 } 767 return index; 768 } 769 770 //-- BEGIN CONDITIONS --// 771 772 /** 773 * Complex condition 0 for 'C'. 774 */ 775 private boolean conditionC0(String value, int index) { 776 if (contains(value, index, 4, "CHIA")) { 777 return true; 778 } else if (index <= 1) { 779 return false; 780 } else if (isVowel(charAt(value, index - 2))) { 781 return false; 782 } else if (!contains(value, index - 1, 3, "ACH")) { 783 return false; 784 } else { 785 char c = charAt(value, index + 2); 786 return (c != 'I' && c != 'E') || 787 contains(value, index - 2, 6, "BACHER", "MACHER"); 788 } 789 } 790 791 /** 792 * Complex condition 0 for 'CH'. 793 */ 794 private boolean conditionCH0(String value, int index) { 795 if (index != 0) { 796 return false; 797 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 798 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 799 return false; 800 } else if (contains(value, 0, 5, "CHORE")) { 801 return false; 802 } else { 803 return true; 804 } 805 } 806 807 /** 808 * Complex condition 1 for 'CH'. 809 */ 810 private boolean conditionCH1(String value, int index) { 811 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || 812 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 813 contains(value, index + 2, 1, "T", "S") || 814 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 815 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 816 } 817 818 /** 819 * Complex condition 0 for 'L'. 820 */ 821 private boolean conditionL0(String value, int index) { 822 if (index == value.length() - 3 && 823 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 824 return true; 825 } else if ((contains(value, value.length() - 2, 2, "AS", "OS") || 826 contains(value, value.length() - 1, 1, "A", "O")) && 827 contains(value, index - 1, 4, "ALLE")) { 828 return true; 829 } else { 830 return false; 831 } 832 } 833 834 /** 835 * Complex condition 0 for 'M'. 836 */ 837 private boolean conditionM0(String value, int index) { 838 if (charAt(value, index + 1) == 'M') { 839 return true; 840 } 841 return contains(value, index - 1, 3, "UMB") && 842 ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER")); 843 } 844 845 //-- BEGIN HELPER FUNCTIONS --// 846 847 /** 848 * Determines whether or not a value is of slavo-germanic orgin. A value is 849 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 850 */ 851 private boolean isSlavoGermanic(String value) { 852 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 853 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; 854 } 855 856 /** 857 * Determines whether or not a character is a vowel or not 858 */ 859 private boolean isVowel(char ch) { 860 return VOWELS.indexOf(ch) != -1; 861 } 862 863 /** 864 * Determines whether or not the value starts with a silent letter. It will 865 * return {@code true} if the value starts with any of 'GN', 'KN', 866 * 'PN', 'WR' or 'PS'. 867 */ 868 private boolean isSilentStart(String value) { 869 boolean result = false; 870 for (String element : SILENT_START) { 871 if (value.startsWith(element)) { 872 result = true; 873 break; 874 } 875 } 876 return result; 877 } 878 879 /** 880 * Cleans the input. 881 */ 882 private String cleanInput(String input) { 883 if (input == null) { 884 return null; 885 } 886 input = input.trim(); 887 if (input.length() == 0) { 888 return null; 889 } 890 return input.toUpperCase(java.util.Locale.ENGLISH); 891 } 892 893 /** 894 * Gets the character at index <code>index</code> if available, otherwise 895 * it returns <code>Character.MIN_VALUE</code> so that there is some sort 896 * of a default. 897 */ 898 protected char charAt(String value, int index) { 899 if (index < 0 || index >= value.length()) { 900 return Character.MIN_VALUE; 901 } 902 return value.charAt(index); 903 } 904 905 /** 906 * Shortcut method with 1 criteria. 907 */ 908 private static boolean contains(String value, int start, int length, String criteria) { 909 return contains(value, start, length, new String[] { criteria }); 910 } 911 912 /** 913 * Shortcut method with 2 criteria. 914 */ 915 private static boolean contains(String value, int start, int length, 916 String criteria1, String criteria2) { 917 return contains(value, start, length, new String[] { criteria1, criteria2 }); 918 } 919 920 /** 921 * Shortcut method with 3 criteria. 922 */ 923 private static boolean contains(String value, int start, int length, 924 String criteria1, String criteria2, String criteria3) { 925 return contains(value, start, length, new String[] { criteria1, criteria2, criteria3 }); 926 } 927 928 /** 929 * Shortcut method with 4 criteria. 930 */ 931 private static boolean contains(String value, int start, int length, 932 String criteria1, String criteria2, 933 String criteria3, String criteria4) { 934 return contains(value, start, length, 935 new String[] { criteria1, criteria2, criteria3, criteria4 }); 936 } 937 938 /** 939 * Shortcut method with 5 criteria. 940 */ 941 private static boolean contains(String value, int start, int length, 942 String criteria1, String criteria2, 943 String criteria3, String criteria4, 944 String criteria5) { 945 return contains(value, start, length, 946 new String[] { criteria1, criteria2, criteria3, 947 criteria4, criteria5 }); 948 } 949 950 /** 951 * Shortcut method with 6 criteria. 952 */ 953 private static boolean contains(String value, int start, int length, 954 String criteria1, String criteria2, 955 String criteria3, String criteria4, 956 String criteria5, String criteria6) { 957 return contains(value, start, length, 958 new String[] { criteria1, criteria2, criteria3, 959 criteria4, criteria5, criteria6 }); 960 } 961 962 /** 963 * Determines whether <code>value</code> contains any of the criteria starting at index <code>start</code> and 964 * matching up to length <code>length</code>. 965 */ 966 protected static boolean contains(String value, int start, int length, 967 String[] criteria) { 968 boolean result = false; 969 if (start >= 0 && start + length <= value.length()) { 970 String target = value.substring(start, start + length); 971 972 for (String element : criteria) { 973 if (target.equals(element)) { 974 result = true; 975 break; 976 } 977 } 978 } 979 return result; 980 } 981 982 //-- BEGIN INNER CLASSES --// 983 984 /** 985 * Inner class for storing results, since there is the optional alternate encoding. 986 */ 987 public class DoubleMetaphoneResult { 988 989 private final StringBuilder primary = new StringBuilder(getMaxCodeLen()); 990 private final StringBuilder alternate = new StringBuilder(getMaxCodeLen()); 991 private final int maxLength; 992 993 public DoubleMetaphoneResult(int maxLength) { 994 this.maxLength = maxLength; 995 } 996 997 public void append(char value) { 998 appendPrimary(value); 999 appendAlternate(value); 1000 } 1001 1002 public void append(char primary, char alternate) { 1003 appendPrimary(primary); 1004 appendAlternate(alternate); 1005 } 1006 1007 public void appendPrimary(char value) { 1008 if (this.primary.length() < this.maxLength) { 1009 this.primary.append(value); 1010 } 1011 } 1012 1013 public void appendAlternate(char value) { 1014 if (this.alternate.length() < this.maxLength) { 1015 this.alternate.append(value); 1016 } 1017 } 1018 1019 public void append(String value) { 1020 appendPrimary(value); 1021 appendAlternate(value); 1022 } 1023 1024 public void append(String primary, String alternate) { 1025 appendPrimary(primary); 1026 appendAlternate(alternate); 1027 } 1028 1029 public void appendPrimary(String value) { 1030 int addChars = this.maxLength - this.primary.length(); 1031 if (value.length() <= addChars) { 1032 this.primary.append(value); 1033 } else { 1034 this.primary.append(value.substring(0, addChars)); 1035 } 1036 } 1037 1038 public void appendAlternate(String value) { 1039 int addChars = this.maxLength - this.alternate.length(); 1040 if (value.length() <= addChars) { 1041 this.alternate.append(value); 1042 } else { 1043 this.alternate.append(value.substring(0, addChars)); 1044 } 1045 } 1046 1047 public String getPrimary() { 1048 return this.primary.toString(); 1049 } 1050 1051 public String getAlternate() { 1052 return this.alternate.toString(); 1053 } 1054 1055 public boolean isComplete() { 1056 return this.primary.length() >= this.maxLength && 1057 this.alternate.length() >= this.maxLength; 1058 } 1059 } 1060 }