001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022import org.apache.commons.codec.binary.StringUtils; 023 024/** 025 * Encodes a string into a double metaphone value. This Implementation is based on the algorithm by <CITE>Lawrence 026 * Philips</CITE>. 027 * <p> 028 * This class is conditionally thread-safe. The instance field {@link #maxCodeLen} is mutable 029 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is 030 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication 031 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup. 032 * 033 * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a> 034 * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a> 035 * 036 * @version $Id: DoubleMetaphone.DoubleMetaphoneResult.html 928559 2014-11-10 02:53:54Z ggregory $ 037 */ 038public class DoubleMetaphone implements StringEncoder { 039 040 /** 041 * "Vowels" to test for 042 */ 043 private static final String VOWELS = "AEIOUY"; 044 045 /** 046 * Prefixes when present which are not pronounced 047 */ 048 private static final String[] SILENT_START = 049 { "GN", "KN", "PN", "WR", "PS" }; 050 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 051 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 052 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 053 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 054 private static final String[] L_T_K_S_N_M_B_Z = 055 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 056 057 /** 058 * Maximum length of an encoding, default is 4 059 */ 060 private int maxCodeLen = 4; 061 062 /** 063 * Creates an instance of this DoubleMetaphone encoder 064 */ 065 public DoubleMetaphone() { 066 super(); 067 } 068 069 /** 070 * Encode a value with Double Metaphone. 071 * 072 * @param value String to encode 073 * @return an encoded string 074 */ 075 public String doubleMetaphone(final String value) { 076 return doubleMetaphone(value, false); 077 } 078 079 /** 080 * Encode a value with Double Metaphone, optionally using the alternate encoding. 081 * 082 * @param value String to encode 083 * @param alternate use alternate encode 084 * @return an encoded string 085 */ 086 public String doubleMetaphone(String value, final boolean alternate) { 087 value = cleanInput(value); 088 if (value == null) { 089 return null; 090 } 091 092 final boolean slavoGermanic = isSlavoGermanic(value); 093 int index = isSilentStart(value) ? 1 : 0; 094 095 final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 096 097 while (!result.isComplete() && index <= value.length() - 1) { 098 switch (value.charAt(index)) { 099 case 'A': 100 case 'E': 101 case 'I': 102 case 'O': 103 case 'U': 104 case 'Y': 105 index = handleAEIOUY(result, index); 106 break; 107 case 'B': 108 result.append('P'); 109 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 110 break; 111 case '\u00C7': 112 // A C with a Cedilla 113 result.append('S'); 114 index++; 115 break; 116 case 'C': 117 index = handleC(value, result, index); 118 break; 119 case 'D': 120 index = handleD(value, result, index); 121 break; 122 case 'F': 123 result.append('F'); 124 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 125 break; 126 case 'G': 127 index = handleG(value, result, index, slavoGermanic); 128 break; 129 case 'H': 130 index = handleH(value, result, index); 131 break; 132 case 'J': 133 index = handleJ(value, result, index, slavoGermanic); 134 break; 135 case 'K': 136 result.append('K'); 137 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 138 break; 139 case 'L': 140 index = handleL(value, result, index); 141 break; 142 case 'M': 143 result.append('M'); 144 index = conditionM0(value, index) ? index + 2 : index + 1; 145 break; 146 case 'N': 147 result.append('N'); 148 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 149 break; 150 case '\u00D1': 151 // N with a tilde (spanish ene) 152 result.append('N'); 153 index++; 154 break; 155 case 'P': 156 index = handleP(value, result, index); 157 break; 158 case 'Q': 159 result.append('K'); 160 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 161 break; 162 case 'R': 163 index = handleR(value, result, index, slavoGermanic); 164 break; 165 case 'S': 166 index = handleS(value, result, index, slavoGermanic); 167 break; 168 case 'T': 169 index = handleT(value, result, index); 170 break; 171 case 'V': 172 result.append('F'); 173 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 174 break; 175 case 'W': 176 index = handleW(value, result, index); 177 break; 178 case 'X': 179 index = handleX(value, result, index); 180 break; 181 case 'Z': 182 index = handleZ(value, result, index, slavoGermanic); 183 break; 184 default: 185 index++; 186 break; 187 } 188 } 189 190 return alternate ? result.getAlternate() : result.getPrimary(); 191 } 192 193 /** 194 * Encode the value using DoubleMetaphone. It will only work if 195 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). 196 * 197 * @param obj Object to encode (should be of type String) 198 * @return An encoded Object (will be of type String) 199 * @throws EncoderException encode parameter is not of type String 200 */ 201 @Override 202 public Object encode(final Object obj) throws EncoderException { 203 if (!(obj instanceof String)) { 204 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 205 } 206 return doubleMetaphone((String) obj); 207 } 208 209 /** 210 * Encode the value using DoubleMetaphone. 211 * 212 * @param value String to encode 213 * @return An encoded String 214 */ 215 @Override 216 public String encode(final String value) { 217 return doubleMetaphone(value); 218 } 219 220 /** 221 * Check if the Double Metaphone values of two <code>String</code> values 222 * are equal. 223 * 224 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 225 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 226 * @return <code>true</code> if the encoded <code>String</code>s are equal; 227 * <code>false</code> otherwise. 228 * @see #isDoubleMetaphoneEqual(String,String,boolean) 229 */ 230 public boolean isDoubleMetaphoneEqual(final String value1, final String value2) { 231 return isDoubleMetaphoneEqual(value1, value2, false); 232 } 233 234 /** 235 * Check if the Double Metaphone values of two <code>String</code> values 236 * are equal, optionally using the alternate value. 237 * 238 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 239 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 240 * @param alternate use the alternate value if <code>true</code>. 241 * @return <code>true</code> if the encoded <code>String</code>s are equal; 242 * <code>false</code> otherwise. 243 */ 244 public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) { 245 return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate)); 246 } 247 248 /** 249 * Returns the maxCodeLen. 250 * @return int 251 */ 252 public int getMaxCodeLen() { 253 return this.maxCodeLen; 254 } 255 256 /** 257 * Sets the maxCodeLen. 258 * @param maxCodeLen The maxCodeLen to set 259 */ 260 public void setMaxCodeLen(final int maxCodeLen) { 261 this.maxCodeLen = maxCodeLen; 262 } 263 264 //-- BEGIN HANDLERS --// 265 266 /** 267 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases. 268 */ 269 private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) { 270 if (index == 0) { 271 result.append('A'); 272 } 273 return index + 1; 274 } 275 276 /** 277 * Handles 'C' cases. 278 */ 279 private int handleC(final String value, final DoubleMetaphoneResult result, int index) { 280 if (conditionC0(value, index)) { // very confusing, moved out 281 result.append('K'); 282 index += 2; 283 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 284 result.append('S'); 285 index += 2; 286 } else if (contains(value, index, 2, "CH")) { 287 index = handleCH(value, result, index); 288 } else if (contains(value, index, 2, "CZ") && 289 !contains(value, index - 2, 4, "WICZ")) { 290 //-- "Czerny" --// 291 result.append('S', 'X'); 292 index += 2; 293 } else if (contains(value, index + 1, 3, "CIA")) { 294 //-- "focaccia" --// 295 result.append('X'); 296 index += 3; 297 } else if (contains(value, index, 2, "CC") && 298 !(index == 1 && charAt(value, 0) == 'M')) { 299 //-- double "cc" but not "McClelland" --// 300 return handleCC(value, result, index); 301 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 302 result.append('K'); 303 index += 2; 304 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 305 //-- Italian vs. English --// 306 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 307 result.append('S', 'X'); 308 } else { 309 result.append('S'); 310 } 311 index += 2; 312 } else { 313 result.append('K'); 314 if (contains(value, index + 1, 2, " C", " Q", " G")) { 315 //-- Mac Caffrey, Mac Gregor --// 316 index += 3; 317 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 318 !contains(value, index + 1, 2, "CE", "CI")) { 319 index += 2; 320 } else { 321 index++; 322 } 323 } 324 325 return index; 326 } 327 328 /** 329 * Handles 'CC' cases. 330 */ 331 private int handleCC(final String value, final DoubleMetaphoneResult result, int index) { 332 if (contains(value, index + 2, 1, "I", "E", "H") && 333 !contains(value, index + 2, 2, "HU")) { 334 //-- "bellocchio" but not "bacchus" --// 335 if ((index == 1 && charAt(value, index - 1) == 'A') || 336 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 337 //-- "accident", "accede", "succeed" --// 338 result.append("KS"); 339 } else { 340 //-- "bacci", "bertucci", other Italian --// 341 result.append('X'); 342 } 343 index += 3; 344 } else { // Pierce's rule 345 result.append('K'); 346 index += 2; 347 } 348 349 return index; 350 } 351 352 /** 353 * Handles 'CH' cases. 354 */ 355 private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) { 356 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 357 result.append('K', 'X'); 358 return index + 2; 359 } else if (conditionCH0(value, index)) { 360 //-- Greek roots ("chemistry", "chorus", etc.) --// 361 result.append('K'); 362 return index + 2; 363 } else if (conditionCH1(value, index)) { 364 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 365 result.append('K'); 366 return index + 2; 367 } else { 368 if (index > 0) { 369 if (contains(value, 0, 2, "MC")) { 370 result.append('K'); 371 } else { 372 result.append('X', 'K'); 373 } 374 } else { 375 result.append('X'); 376 } 377 return index + 2; 378 } 379 } 380 381 /** 382 * Handles 'D' cases. 383 */ 384 private int handleD(final String value, final DoubleMetaphoneResult result, int index) { 385 if (contains(value, index, 2, "DG")) { 386 //-- "Edge" --// 387 if (contains(value, index + 2, 1, "I", "E", "Y")) { 388 result.append('J'); 389 index += 3; 390 //-- "Edgar" --// 391 } else { 392 result.append("TK"); 393 index += 2; 394 } 395 } else if (contains(value, index, 2, "DT", "DD")) { 396 result.append('T'); 397 index += 2; 398 } else { 399 result.append('T'); 400 index++; 401 } 402 return index; 403 } 404 405 /** 406 * Handles 'G' cases. 407 */ 408 private int handleG(final String value, final DoubleMetaphoneResult result, int index, 409 final boolean slavoGermanic) { 410 if (charAt(value, index + 1) == 'H') { 411 index = handleGH(value, result, index); 412 } else if (charAt(value, index + 1) == 'N') { 413 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 414 result.append("KN", "N"); 415 } else if (!contains(value, index + 2, 2, "EY") && 416 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 417 result.append("N", "KN"); 418 } else { 419 result.append("KN"); 420 } 421 index = index + 2; 422 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 423 result.append("KL", "L"); 424 index += 2; 425 } else if (index == 0 && 426 (charAt(value, index + 1) == 'Y' || 427 contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 428 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 429 result.append('K', 'J'); 430 index += 2; 431 } else if ((contains(value, index + 1, 2, "ER") || 432 charAt(value, index + 1) == 'Y') && 433 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 434 !contains(value, index - 1, 1, "E", "I") && 435 !contains(value, index - 1, 3, "RGY", "OGY")) { 436 //-- -ger-, -gy- --// 437 result.append('K', 'J'); 438 index += 2; 439 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 440 contains(value, index - 1, 4, "AGGI", "OGGI")) { 441 //-- Italian "biaggi" --// 442 if (contains(value, 0 ,4, "VAN ", "VON ") || 443 contains(value, 0, 3, "SCH") || 444 contains(value, index + 1, 2, "ET")) { 445 //-- obvious germanic --// 446 result.append('K'); 447 } else if (contains(value, index + 1, 3, "IER")) { 448 result.append('J'); 449 } else { 450 result.append('J', 'K'); 451 } 452 index += 2; 453 } else if (charAt(value, index + 1) == 'G') { 454 index += 2; 455 result.append('K'); 456 } else { 457 index++; 458 result.append('K'); 459 } 460 return index; 461 } 462 463 /** 464 * Handles 'GH' cases. 465 */ 466 private int handleGH(final String value, final DoubleMetaphoneResult result, int index) { 467 if (index > 0 && !isVowel(charAt(value, index - 1))) { 468 result.append('K'); 469 index += 2; 470 } else if (index == 0) { 471 if (charAt(value, index + 2) == 'I') { 472 result.append('J'); 473 } else { 474 result.append('K'); 475 } 476 index += 2; 477 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 478 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 479 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 480 //-- Parker's rule (with some further refinements) - "hugh" 481 index += 2; 482 } else { 483 if (index > 2 && charAt(value, index - 1) == 'U' && 484 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 485 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 486 result.append('F'); 487 } else if (index > 0 && charAt(value, index - 1) != 'I') { 488 result.append('K'); 489 } 490 index += 2; 491 } 492 return index; 493 } 494 495 /** 496 * Handles 'H' cases. 497 */ 498 private int handleH(final String value, final DoubleMetaphoneResult result, int index) { 499 //-- only keep if first & before vowel or between 2 vowels --// 500 if ((index == 0 || isVowel(charAt(value, index - 1))) && 501 isVowel(charAt(value, index + 1))) { 502 result.append('H'); 503 index += 2; 504 //-- also takes car of "HH" --// 505 } else { 506 index++; 507 } 508 return index; 509 } 510 511 /** 512 * Handles 'J' cases. 513 */ 514 private int handleJ(final String value, final DoubleMetaphoneResult result, int index, 515 final boolean slavoGermanic) { 516 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 517 //-- obvious Spanish, "Jose", "San Jacinto" --// 518 if ((index == 0 && (charAt(value, index + 4) == ' ') || 519 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 520 result.append('H'); 521 } else { 522 result.append('J', 'H'); 523 } 524 index++; 525 } else { 526 if (index == 0 && !contains(value, index, 4, "JOSE")) { 527 result.append('J', 'A'); 528 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 529 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 530 result.append('J', 'H'); 531 } else if (index == value.length() - 1) { 532 result.append('J', ' '); 533 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && 534 !contains(value, index - 1, 1, "S", "K", "L")) { 535 result.append('J'); 536 } 537 538 if (charAt(value, index + 1) == 'J') { 539 index += 2; 540 } else { 541 index++; 542 } 543 } 544 return index; 545 } 546 547 /** 548 * Handles 'L' cases. 549 */ 550 private int handleL(final String value, final DoubleMetaphoneResult result, int index) { 551 if (charAt(value, index + 1) == 'L') { 552 if (conditionL0(value, index)) { 553 result.appendPrimary('L'); 554 } else { 555 result.append('L'); 556 } 557 index += 2; 558 } else { 559 index++; 560 result.append('L'); 561 } 562 return index; 563 } 564 565 /** 566 * Handles 'P' cases. 567 */ 568 private int handleP(final String value, final DoubleMetaphoneResult result, int index) { 569 if (charAt(value, index + 1) == 'H') { 570 result.append('F'); 571 index += 2; 572 } else { 573 result.append('P'); 574 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 575 } 576 return index; 577 } 578 579 /** 580 * Handles 'R' cases. 581 */ 582 private int handleR(final String value, final DoubleMetaphoneResult result, final int index, 583 final boolean slavoGermanic) { 584 if (index == value.length() - 1 && !slavoGermanic && 585 contains(value, index - 2, 2, "IE") && 586 !contains(value, index - 4, 2, "ME", "MA")) { 587 result.appendAlternate('R'); 588 } else { 589 result.append('R'); 590 } 591 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 592 } 593 594 /** 595 * Handles 'S' cases. 596 */ 597 private int handleS(final String value, final DoubleMetaphoneResult result, int index, 598 final boolean slavoGermanic) { 599 if (contains(value, index - 1, 3, "ISL", "YSL")) { 600 //-- special cases "island", "isle", "carlisle", "carlysle" --// 601 index++; 602 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 603 //-- special case "sugar-" --// 604 result.append('X', 'S'); 605 index++; 606 } else if (contains(value, index, 2, "SH")) { 607 if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { 608 //-- germanic --// 609 result.append('S'); 610 } else { 611 result.append('X'); 612 } 613 index += 2; 614 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 615 //-- Italian and Armenian --// 616 if (slavoGermanic) { 617 result.append('S'); 618 } else { 619 result.append('S', 'X'); 620 } 621 index += 3; 622 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || 623 contains(value, index + 1, 1, "Z")) { 624 //-- german & anglicisations, e.g. "smith" match "schmidt" // 625 // "snider" match "schneider" --// 626 //-- also, -sz- in slavic language although in hungarian it // 627 // is pronounced "s" --// 628 result.append('S', 'X'); 629 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 630 } else if (contains(value, index, 2, "SC")) { 631 index = handleSC(value, result, index); 632 } else { 633 if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) { 634 //-- french e.g. "resnais", "artois" --// 635 result.appendAlternate('S'); 636 } else { 637 result.append('S'); 638 } 639 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 640 } 641 return index; 642 } 643 644 /** 645 * Handles 'SC' cases. 646 */ 647 private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) { 648 if (charAt(value, index + 2) == 'H') { 649 //-- Schlesinger's rule --// 650 if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 651 //-- Dutch origin, e.g. "school", "schooner" --// 652 if (contains(value, index + 3, 2, "ER", "EN")) { 653 //-- "schermerhorn", "schenker" --// 654 result.append("X", "SK"); 655 } else { 656 result.append("SK"); 657 } 658 } else { 659 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 660 result.append('X', 'S'); 661 } else { 662 result.append('X'); 663 } 664 } 665 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 666 result.append('S'); 667 } else { 668 result.append("SK"); 669 } 670 return index + 3; 671 } 672 673 /** 674 * Handles 'T' cases. 675 */ 676 private int handleT(final String value, final DoubleMetaphoneResult result, int index) { 677 if (contains(value, index, 4, "TION")) { 678 result.append('X'); 679 index += 3; 680 } else if (contains(value, index, 3, "TIA", "TCH")) { 681 result.append('X'); 682 index += 3; 683 } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) { 684 if (contains(value, index + 2, 2, "OM", "AM") || 685 //-- special case "thomas", "thames" or germanic --// 686 contains(value, 0, 4, "VAN ", "VON ") || 687 contains(value, 0, 3, "SCH")) { 688 result.append('T'); 689 } else { 690 result.append('0', 'T'); 691 } 692 index += 2; 693 } else { 694 result.append('T'); 695 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 696 } 697 return index; 698 } 699 700 /** 701 * Handles 'W' cases. 702 */ 703 private int handleW(final String value, final DoubleMetaphoneResult result, int index) { 704 if (contains(value, index, 2, "WR")) { 705 //-- can also be in middle of word --// 706 result.append('R'); 707 index += 2; 708 } else { 709 if (index == 0 && (isVowel(charAt(value, index + 1)) || 710 contains(value, index, 2, "WH"))) { 711 if (isVowel(charAt(value, index + 1))) { 712 //-- Wasserman should match Vasserman --// 713 result.append('A', 'F'); 714 } else { 715 //-- need Uomo to match Womo --// 716 result.append('A'); 717 } 718 index++; 719 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 720 contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 721 contains(value, 0, 3, "SCH")) { 722 //-- Arnow should match Arnoff --// 723 result.appendAlternate('F'); 724 index++; 725 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 726 //-- Polish e.g. "filipowicz" --// 727 result.append("TS", "FX"); 728 index += 4; 729 } else { 730 index++; 731 } 732 } 733 return index; 734 } 735 736 /** 737 * Handles 'X' cases. 738 */ 739 private int handleX(final String value, final DoubleMetaphoneResult result, int index) { 740 if (index == 0) { 741 result.append('S'); 742 index++; 743 } else { 744 if (!((index == value.length() - 1) && 745 (contains(value, index - 3, 3, "IAU", "EAU") || 746 contains(value, index - 2, 2, "AU", "OU")))) { 747 //-- French e.g. breaux --// 748 result.append("KS"); 749 } 750 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 751 } 752 return index; 753 } 754 755 /** 756 * Handles 'Z' cases. 757 */ 758 private int handleZ(final String value, final DoubleMetaphoneResult result, int index, 759 final boolean slavoGermanic) { 760 if (charAt(value, index + 1) == 'H') { 761 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 762 result.append('J'); 763 index += 2; 764 } else { 765 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || 766 (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 767 result.append("S", "TS"); 768 } else { 769 result.append('S'); 770 } 771 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 772 } 773 return index; 774 } 775 776 //-- BEGIN CONDITIONS --// 777 778 /** 779 * Complex condition 0 for 'C'. 780 */ 781 private boolean conditionC0(final String value, final int index) { 782 if (contains(value, index, 4, "CHIA")) { 783 return true; 784 } else if (index <= 1) { 785 return false; 786 } else if (isVowel(charAt(value, index - 2))) { 787 return false; 788 } else if (!contains(value, index - 1, 3, "ACH")) { 789 return false; 790 } else { 791 final char c = charAt(value, index + 2); 792 return (c != 'I' && c != 'E') || 793 contains(value, index - 2, 6, "BACHER", "MACHER"); 794 } 795 } 796 797 /** 798 * Complex condition 0 for 'CH'. 799 */ 800 private boolean conditionCH0(final String value, final int index) { 801 if (index != 0) { 802 return false; 803 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 804 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 805 return false; 806 } else if (contains(value, 0, 5, "CHORE")) { 807 return false; 808 } else { 809 return true; 810 } 811 } 812 813 /** 814 * Complex condition 1 for 'CH'. 815 */ 816 private boolean conditionCH1(final String value, final int index) { 817 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || 818 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 819 contains(value, index + 2, 1, "T", "S") || 820 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 821 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 822 } 823 824 /** 825 * Complex condition 0 for 'L'. 826 */ 827 private boolean conditionL0(final String value, final int index) { 828 if (index == value.length() - 3 && 829 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 830 return true; 831 } else if ((contains(value, value.length() - 2, 2, "AS", "OS") || 832 contains(value, value.length() - 1, 1, "A", "O")) && 833 contains(value, index - 1, 4, "ALLE")) { 834 return true; 835 } else { 836 return false; 837 } 838 } 839 840 /** 841 * Complex condition 0 for 'M'. 842 */ 843 private boolean conditionM0(final String value, final int index) { 844 if (charAt(value, index + 1) == 'M') { 845 return true; 846 } 847 return contains(value, index - 1, 3, "UMB") && 848 ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER")); 849 } 850 851 //-- BEGIN HELPER FUNCTIONS --// 852 853 /** 854 * Determines whether or not a value is of slavo-germanic origin. A value is 855 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 856 */ 857 private boolean isSlavoGermanic(final String value) { 858 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 859 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; 860 } 861 862 /** 863 * Determines whether or not a character is a vowel or not 864 */ 865 private boolean isVowel(final char ch) { 866 return VOWELS.indexOf(ch) != -1; 867 } 868 869 /** 870 * Determines whether or not the value starts with a silent letter. It will 871 * return <code>true</code> if the value starts with any of 'GN', 'KN', 872 * 'PN', 'WR' or 'PS'. 873 */ 874 private boolean isSilentStart(final String value) { 875 boolean result = false; 876 for (final String element : SILENT_START) { 877 if (value.startsWith(element)) { 878 result = true; 879 break; 880 } 881 } 882 return result; 883 } 884 885 /** 886 * Cleans the input. 887 */ 888 private String cleanInput(String input) { 889 if (input == null) { 890 return null; 891 } 892 input = input.trim(); 893 if (input.length() == 0) { 894 return null; 895 } 896 return input.toUpperCase(java.util.Locale.ENGLISH); 897 } 898 899 /* 900 * Gets the character at index <code>index</code> if available, otherwise 901 * it returns <code>Character.MIN_VALUE</code> so that there is some sort 902 * of a default. 903 */ 904 protected char charAt(final String value, final int index) { 905 if (index < 0 || index >= value.length()) { 906 return Character.MIN_VALUE; 907 } 908 return value.charAt(index); 909 } 910 911 /* 912 * Determines whether <code>value</code> contains any of the criteria starting at index <code>start</code> and 913 * matching up to length <code>length</code>. 914 */ 915 protected static boolean contains(final String value, final int start, final int length, 916 final String... criteria) { 917 boolean result = false; 918 if (start >= 0 && start + length <= value.length()) { 919 final String target = value.substring(start, start + length); 920 921 for (final String element : criteria) { 922 if (target.equals(element)) { 923 result = true; 924 break; 925 } 926 } 927 } 928 return result; 929 } 930 931 //-- BEGIN INNER CLASSES --// 932 933 /** 934 * Inner class for storing results, since there is the optional alternate encoding. 935 */ 936 public class DoubleMetaphoneResult { 937 938 private final StringBuilder primary = new StringBuilder(getMaxCodeLen()); 939 private final StringBuilder alternate = new StringBuilder(getMaxCodeLen()); 940 private final int maxLength; 941 942 public DoubleMetaphoneResult(final int maxLength) { 943 this.maxLength = maxLength; 944 } 945 946 public void append(final char value) { 947 appendPrimary(value); 948 appendAlternate(value); 949 } 950 951 public void append(final char primary, final char alternate) { 952 appendPrimary(primary); 953 appendAlternate(alternate); 954 } 955 956 public void appendPrimary(final char value) { 957 if (this.primary.length() < this.maxLength) { 958 this.primary.append(value); 959 } 960 } 961 962 public void appendAlternate(final char value) { 963 if (this.alternate.length() < this.maxLength) { 964 this.alternate.append(value); 965 } 966 } 967 968 public void append(final String value) { 969 appendPrimary(value); 970 appendAlternate(value); 971 } 972 973 public void append(final String primary, final String alternate) { 974 appendPrimary(primary); 975 appendAlternate(alternate); 976 } 977 978 public void appendPrimary(final String value) { 979 final int addChars = this.maxLength - this.primary.length(); 980 if (value.length() <= addChars) { 981 this.primary.append(value); 982 } else { 983 this.primary.append(value.substring(0, addChars)); 984 } 985 } 986 987 public void appendAlternate(final String value) { 988 final int addChars = this.maxLength - this.alternate.length(); 989 if (value.length() <= addChars) { 990 this.alternate.append(value); 991 } else { 992 this.alternate.append(value.substring(0, addChars)); 993 } 994 } 995 996 public String getPrimary() { 997 return this.primary.toString(); 998 } 999 1000 public String getAlternate() { 1001 return this.alternate.toString(); 1002 } 1003 1004 public boolean isComplete() { 1005 return this.primary.length() >= this.maxLength && 1006 this.alternate.length() >= this.maxLength; 1007 } 1008 } 1009}