View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  import org.apache.commons.codec.binary.StringUtils;
23  
24  /**
25   * Encodes a string into a double metaphone value. This implementation is based on the algorithm by <CITE>Lawrence
26   * Philips</CITE>.
27   * <p>
28   * This class is conditionally thread-safe. The instance field for the maximum code length is mutable
29   * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is
30   * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
31   * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
32   * </p>
33   *
34   * @see <a href="https://drdobbs.com/184401251?pgno=2">Original Article</a>
35   * @see <a href="https://en.wikipedia.org/wiki/Metaphone">https://en.wikipedia.org/wiki/Metaphone</a>
36   */
37  public class DoubleMetaphone implements StringEncoder {
38  
39      /**
40       * Inner class for storing results, since there is the optional alternate encoding.
41       */
42      public class DoubleMetaphoneResult {
43  
44          private final StringBuilder primary = new StringBuilder(getMaxCodeLen());
45          private final StringBuilder alternate = new StringBuilder(getMaxCodeLen());
46          private final int maxLength;
47  
48          public DoubleMetaphoneResult(final int maxLength) {
49              this.maxLength = maxLength;
50          }
51  
52          public void append(final char value) {
53              appendPrimary(value);
54              appendAlternate(value);
55          }
56  
57          public void append(final char primary, final char alternate) {
58              appendPrimary(primary);
59              appendAlternate(alternate);
60          }
61  
62          public void append(final String value) {
63              appendPrimary(value);
64              appendAlternate(value);
65          }
66  
67          public void append(final String primary, final String alternate) {
68              appendPrimary(primary);
69              appendAlternate(alternate);
70          }
71  
72          public void appendAlternate(final char value) {
73              if (this.alternate.length() < this.maxLength) {
74                  this.alternate.append(value);
75              }
76          }
77  
78          public void appendAlternate(final String value) {
79              final int addChars = this.maxLength - this.alternate.length();
80              if (value.length() <= addChars) {
81                  this.alternate.append(value);
82              } else {
83                  this.alternate.append(value, 0, addChars);
84              }
85          }
86  
87          public void appendPrimary(final char value) {
88              if (this.primary.length() < this.maxLength) {
89                  this.primary.append(value);
90              }
91          }
92  
93          public void appendPrimary(final String value) {
94              final int addChars = this.maxLength - this.primary.length();
95              if (value.length() <= addChars) {
96                  this.primary.append(value);
97              } else {
98                  this.primary.append(value, 0, addChars);
99              }
100         }
101 
102         public String getAlternate() {
103             return this.alternate.toString();
104         }
105 
106         public String getPrimary() {
107             return this.primary.toString();
108         }
109 
110         public boolean isComplete() {
111             return this.primary.length() >= this.maxLength &&
112                    this.alternate.length() >= this.maxLength;
113         }
114     }
115 
116     /**
117      * "Vowels" to test for
118      */
119     private static final String VOWELS = "AEIOUY";
120     /**
121      * Prefixes when present which are not pronounced
122      */
123     private static final String[] SILENT_START =
124         { "GN", "KN", "PN", "WR", "PS" };
125     private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
126         { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
127     private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
128         { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
129 
130     private static final String[] L_T_K_S_N_M_B_Z =
131         { "L", "T", "K", "S", "N", "M", "B", "Z" };
132 
133     /*
134      * Determines whether {@code value} contains any of the criteria starting at index {@code start} and
135      * matching up to length {@code length}.
136      */
137     protected static boolean contains(final String value, final int start, final int length,
138                                       final String... criteria) {
139         boolean result = false;
140         if (start >= 0 && start + length <= value.length()) {
141             final String target = value.substring(start, start + length);
142 
143             for (final String element : criteria) {
144                 if (target.equals(element)) {
145                     result = true;
146                     break;
147                 }
148             }
149         }
150         return result;
151     }
152 
153     /**
154      * Maximum length of an encoding, default is 4
155      */
156     private int maxCodeLen = 4;
157 
158     /*
159      * Gets the character at index {@code index} if available, otherwise
160      * it returns {@code Character.MIN_VALUE} so that there is some sort
161      * of default.
162      */
163     protected char charAt(final String value, final int index) {
164         if (index < 0 || index >= value.length()) {
165             return Character.MIN_VALUE;
166         }
167         return value.charAt(index);
168     }
169 
170     /**
171      * Cleans the input.
172      */
173     private String cleanInput(String input) {
174         if (input == null) {
175             return null;
176         }
177         input = input.trim();
178         if (input.isEmpty()) {
179             return null;
180         }
181         return input.toUpperCase(java.util.Locale.ENGLISH);
182     }
183 
184     /**
185      * Complex condition 0 for 'C'.
186      */
187     private boolean conditionC0(final String value, final int index) {
188         if (contains(value, index, 4, "CHIA")) {
189             return true;
190         }
191         if (index <= 1) {
192             return false;
193         }
194         if (isVowel(charAt(value, index - 2))) {
195             return false;
196         }
197         if (!contains(value, index - 1, 3, "ACH")) {
198             return false;
199         }
200         final char c = charAt(value, index + 2);
201         return c != 'I' && c != 'E' ||
202                 contains(value, index - 2, 6, "BACHER", "MACHER");
203     }
204 
205     /**
206      * Complex condition 0 for 'CH'.
207      */
208     private boolean conditionCH0(final String value, final int index) {
209         if (index != 0) {
210             return false;
211         }
212         if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
213                    !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
214             return false;
215         }
216         return !contains(value, 0, 5, "CHORE");
217     }
218 
219     /**
220      * Complex condition 1 for 'CH'.
221      */
222     private boolean conditionCH1(final String value, final int index) {
223         return contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH") ||
224                 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
225                 contains(value, index + 2, 1, "T", "S") ||
226                 (contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
227                  (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1);
228     }
229 
230     /**
231      * Complex condition 0 for 'L'.
232      */
233     private boolean conditionL0(final String value, final int index) {
234         if (index == value.length() - 3 &&
235             contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
236             return true;
237         }
238         return (contains(value, value.length() - 2, 2, "AS", "OS") ||
239                 contains(value, value.length() - 1, 1, "A", "O")) &&
240                 contains(value, index - 1, 4, "ALLE");
241     }
242 
243     //-- BEGIN HANDLERS --//
244 
245     /**
246      * Complex condition 0 for 'M'.
247      */
248     private boolean conditionM0(final String value, final int index) {
249         if (charAt(value, index + 1) == 'M') {
250             return true;
251         }
252         return contains(value, index - 1, 3, "UMB") &&
253                (index + 1 == value.length() - 1 || contains(value, index + 2, 2, "ER"));
254     }
255 
256     /**
257      * Encode a value with Double Metaphone.
258      *
259      * @param value String to encode
260      * @return an encoded string
261      */
262     public String doubleMetaphone(final String value) {
263         return doubleMetaphone(value, false);
264     }
265 
266     /**
267      * Encode a value with Double Metaphone, optionally using the alternate encoding.
268      *
269      * @param value String to encode
270      * @param alternate use alternate encode
271      * @return an encoded string
272      */
273     public String doubleMetaphone(String value, final boolean alternate) {
274         value = cleanInput(value);
275         if (value == null) {
276             return null;
277         }
278 
279         final boolean slavoGermanic = isSlavoGermanic(value);
280         int index = isSilentStart(value) ? 1 : 0;
281 
282         final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
283 
284         while (!result.isComplete() && index <= value.length() - 1) {
285             switch (value.charAt(index)) {
286             case 'A':
287             case 'E':
288             case 'I':
289             case 'O':
290             case 'U':
291             case 'Y':
292                 index = handleAEIOUY(result, index);
293                 break;
294             case 'B':
295                 result.append('P');
296                 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
297                 break;
298             case '\u00C7':
299                 // A C with a Cedilla
300                 result.append('S');
301                 index++;
302                 break;
303             case 'C':
304                 index = handleC(value, result, index);
305                 break;
306             case 'D':
307                 index = handleD(value, result, index);
308                 break;
309             case 'F':
310                 result.append('F');
311                 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
312                 break;
313             case 'G':
314                 index = handleG(value, result, index, slavoGermanic);
315                 break;
316             case 'H':
317                 index = handleH(value, result, index);
318                 break;
319             case 'J':
320                 index = handleJ(value, result, index, slavoGermanic);
321                 break;
322             case 'K':
323                 result.append('K');
324                 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
325                 break;
326             case 'L':
327                 index = handleL(value, result, index);
328                 break;
329             case 'M':
330                 result.append('M');
331                 index = conditionM0(value, index) ? index + 2 : index + 1;
332                 break;
333             case 'N':
334                 result.append('N');
335                 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
336                 break;
337             case '\u00D1':
338                 // N with a tilde (spanish ene)
339                 result.append('N');
340                 index++;
341                 break;
342             case 'P':
343                 index = handleP(value, result, index);
344                 break;
345             case 'Q':
346                 result.append('K');
347                 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
348                 break;
349             case 'R':
350                 index = handleR(value, result, index, slavoGermanic);
351                 break;
352             case 'S':
353                 index = handleS(value, result, index, slavoGermanic);
354                 break;
355             case 'T':
356                 index = handleT(value, result, index);
357                 break;
358             case 'V':
359                 result.append('F');
360                 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
361                 break;
362             case 'W':
363                 index = handleW(value, result, index);
364                 break;
365             case 'X':
366                 index = handleX(value, result, index);
367                 break;
368             case 'Z':
369                 index = handleZ(value, result, index, slavoGermanic);
370                 break;
371             default:
372                 index++;
373                 break;
374             }
375         }
376 
377         return alternate ? result.getAlternate() : result.getPrimary();
378     }
379 
380     /**
381      * Encode the value using DoubleMetaphone.  It will only work if
382      * {@code obj} is a {@code String} (like {@code Metaphone}).
383      *
384      * @param obj Object to encode (should be of type String)
385      * @return An encoded Object (will be of type String)
386      * @throws EncoderException encode parameter is not of type String
387      */
388     @Override
389     public Object encode(final Object obj) throws EncoderException {
390         if (!(obj instanceof String)) {
391             throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
392         }
393         return doubleMetaphone((String) obj);
394     }
395 
396     /**
397      * Encode the value using DoubleMetaphone.
398      *
399      * @param value String to encode
400      * @return An encoded String
401      */
402     @Override
403     public String encode(final String value) {
404         return doubleMetaphone(value);
405     }
406 
407     /**
408      * Returns the maxCodeLen.
409      * @return int
410      */
411     public int getMaxCodeLen() {
412         return this.maxCodeLen;
413     }
414 
415     /**
416      * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
417      */
418     private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) {
419         if (index == 0) {
420             result.append('A');
421         }
422         return index + 1;
423     }
424 
425     /**
426      * Handles 'C' cases.
427      */
428     private int handleC(final String value, final DoubleMetaphoneResult result, int index) {
429         if (conditionC0(value, index)) {  // very confusing, moved out
430             result.append('K');
431             index += 2;
432         } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
433             result.append('S');
434             index += 2;
435         } else if (contains(value, index, 2, "CH")) {
436             index = handleCH(value, result, index);
437         } else if (contains(value, index, 2, "CZ") &&
438                    !contains(value, index - 2, 4, "WICZ")) {
439             //-- "Czerny" --//
440             result.append('S', 'X');
441             index += 2;
442         } else if (contains(value, index + 1, 3, "CIA")) {
443             //-- "focaccia" --//
444             result.append('X');
445             index += 3;
446         } else if (contains(value, index, 2, "CC") &&
447                    !(index == 1 && charAt(value, 0) == 'M')) {
448             //-- double "cc" but not "McClelland" --//
449             return handleCC(value, result, index);
450         } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
451             result.append('K');
452             index += 2;
453         } else if (contains(value, index, 2, "CI", "CE", "CY")) {
454             //-- Italian vs. English --//
455             if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
456                 result.append('S', 'X');
457             } else {
458                 result.append('S');
459             }
460             index += 2;
461         } else {
462             result.append('K');
463             if (contains(value, index + 1, 2, " C", " Q", " G")) {
464                 //-- Mac Caffrey, Mac Gregor --//
465                 index += 3;
466             } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
467                        !contains(value, index + 1, 2, "CE", "CI")) {
468                 index += 2;
469             } else {
470                 index++;
471             }
472         }
473 
474         return index;
475     }
476 
477     /**
478      * Handles 'CC' cases.
479      */
480     private int handleCC(final String value, final DoubleMetaphoneResult result, int index) {
481         if (contains(value, index + 2, 1, "I", "E", "H") &&
482             !contains(value, index + 2, 2, "HU")) {
483             //-- "bellocchio" but not "bacchus" --//
484             if (index == 1 && charAt(value, index - 1) == 'A' ||
485                 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
486                 //-- "accident", "accede", "succeed" --//
487                 result.append("KS");
488             } else {
489                 //-- "bacci", "bertucci", other Italian --//
490                 result.append('X');
491             }
492             index += 3;
493         } else {    // Pierce's rule
494             result.append('K');
495             index += 2;
496         }
497 
498         return index;
499     }
500 
501     /**
502      * Handles 'CH' cases.
503      */
504     private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) {
505         if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
506             result.append('K', 'X');
507             return index + 2;
508         }
509         if (conditionCH0(value, index)) {
510             //-- Greek roots ("chemistry", "chorus", etc.) --//
511             result.append('K');
512             return index + 2;
513         }
514         if (conditionCH1(value, index)) {
515             //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
516             result.append('K');
517             return index + 2;
518         }
519         if (index > 0) {
520             if (contains(value, 0, 2, "MC")) {
521                 result.append('K');
522             } else {
523                 result.append('X', 'K');
524             }
525         } else {
526             result.append('X');
527         }
528         return index + 2;
529     }
530 
531     /**
532      * Handles 'D' cases.
533      */
534     private int handleD(final String value, final DoubleMetaphoneResult result, int index) {
535         if (contains(value, index, 2, "DG")) {
536             //-- "Edge" --//
537             if (contains(value, index + 2, 1, "I", "E", "Y")) {
538                 result.append('J');
539                 index += 3;
540                 //-- "Edgar" --//
541             } else {
542                 result.append("TK");
543                 index += 2;
544             }
545         } else if (contains(value, index, 2, "DT", "DD")) {
546             result.append('T');
547             index += 2;
548         } else {
549             result.append('T');
550             index++;
551         }
552         return index;
553     }
554 
555     /**
556      * Handles 'G' cases.
557      */
558     private int handleG(final String value, final DoubleMetaphoneResult result, int index,
559                         final boolean slavoGermanic) {
560         if (charAt(value, index + 1) == 'H') {
561             index = handleGH(value, result, index);
562         } else if (charAt(value, index + 1) == 'N') {
563             if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
564                 result.append("KN", "N");
565             } else if (!contains(value, index + 2, 2, "EY") &&
566                        charAt(value, index + 1) != 'Y' && !slavoGermanic) {
567                 result.append("N", "KN");
568             } else {
569                 result.append("KN");
570             }
571             index += 2;
572         } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
573             result.append("KL", "L");
574             index += 2;
575         } else if (index == 0 &&
576                    (charAt(value, index + 1) == 'Y' ||
577                     contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
578             //-- -ges-, -gep-, -gel-, -gie- at beginning --//
579             result.append('K', 'J');
580             index += 2;
581         } else if ((contains(value, index + 1, 2, "ER") ||
582                     charAt(value, index + 1) == 'Y') &&
583                    !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
584                    !contains(value, index - 1, 1, "E", "I") &&
585                    !contains(value, index - 1, 3, "RGY", "OGY")) {
586             //-- -ger-, -gy- --//
587             result.append('K', 'J');
588             index += 2;
589         } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
590                    contains(value, index - 1, 4, "AGGI", "OGGI")) {
591             //-- Italian "biaggi" --//
592             if (contains(value, 0, 4, "VAN ", "VON ") ||
593                 contains(value, 0, 3, "SCH") ||
594                 contains(value, index + 1, 2, "ET")) {
595                 //-- obvious germanic --//
596                 result.append('K');
597             } else if (contains(value, index + 1, 3, "IER")) {
598                 result.append('J');
599             } else {
600                 result.append('J', 'K');
601             }
602             index += 2;
603         } else {
604             if (charAt(value, index + 1) == 'G') {
605                 index += 2;
606             } else {
607                 index++;
608             }
609             result.append('K');
610         }
611         return index;
612     }
613 
614     /**
615      * Handles 'GH' cases.
616      */
617     private int handleGH(final String value, final DoubleMetaphoneResult result, int index) {
618         if (index > 0 && !isVowel(charAt(value, index - 1))) {
619             result.append('K');
620         } else if (index == 0) {
621             if (charAt(value, index + 2) == 'I') {
622                 result.append('J');
623             } else {
624                 result.append('K');
625             }
626             index += 2;
627         } else if (index > 1 && contains(value, index - 2, 1, "B", "H", "D") ||
628                    index > 2 && contains(value, index - 3, 1, "B", "H", "D") ||
629                    index > 3 && contains(value, index - 4, 1, "B", "H")) {
630             //-- Parker's rule (with some further refinements) - "hugh"
631             index += 2;
632         } else {
633             if (index > 2 && charAt(value, index - 1) == 'U' &&
634                 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
635                 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
636                 result.append('F');
637             } else if (index > 0 && charAt(value, index - 1) != 'I') {
638                 result.append('K');
639             }
640             index += 2;
641         }
642         return index;
643     }
644 
645     /**
646      * Handles 'H' cases.
647      */
648     private int handleH(final String value, final DoubleMetaphoneResult result, int index) {
649         //-- only keep if first & before vowel or between 2 vowels --//
650         if ((index == 0 || isVowel(charAt(value, index - 1))) &&
651             isVowel(charAt(value, index + 1))) {
652             result.append('H');
653             index += 2;
654             //-- also takes car of "HH" --//
655         } else {
656             index++;
657         }
658         return index;
659     }
660 
661     /**
662      * Handles 'J' cases.
663      */
664     private int handleJ(final String value, final DoubleMetaphoneResult result, int index,
665                         final boolean slavoGermanic) {
666         if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
667                 //-- obvious Spanish, "Jose", "San Jacinto" --//
668                 if (index == 0 && charAt(value, index + 4) == ' ' ||
669                      value.length() == 4 || contains(value, 0, 4, "SAN ")) {
670                     result.append('H');
671                 } else {
672                     result.append('J', 'H');
673                 }
674                 index++;
675             } else {
676                 if (index == 0 && !contains(value, index, 4, "JOSE")) {
677                     result.append('J', 'A');
678                 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
679                            (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
680                     result.append('J', 'H');
681                 } else if (index == value.length() - 1) {
682                     result.append('J', ' ');
683                 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
684                            !contains(value, index - 1, 1, "S", "K", "L")) {
685                     result.append('J');
686                 }
687 
688                 if (charAt(value, index + 1) == 'J') {
689                     index += 2;
690                 } else {
691                     index++;
692                 }
693             }
694         return index;
695     }
696 
697     /**
698      * Handles 'L' cases.
699      */
700     private int handleL(final String value, final DoubleMetaphoneResult result, int index) {
701         if (charAt(value, index + 1) == 'L') {
702             if (conditionL0(value, index)) {
703                 result.appendPrimary('L');
704             } else {
705                 result.append('L');
706             }
707             index += 2;
708         } else {
709             index++;
710             result.append('L');
711         }
712         return index;
713     }
714 
715     /**
716      * Handles 'P' cases.
717      */
718     private int handleP(final String value, final DoubleMetaphoneResult result, int index) {
719         if (charAt(value, index + 1) == 'H') {
720             result.append('F');
721             index += 2;
722         } else {
723             result.append('P');
724             index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
725         }
726         return index;
727     }
728 
729     /**
730      * Handles 'R' cases.
731      */
732     private int handleR(final String value, final DoubleMetaphoneResult result, final int index,
733                         final boolean slavoGermanic) {
734         if (index == value.length() - 1 && !slavoGermanic &&
735             contains(value, index - 2, 2, "IE") &&
736             !contains(value, index - 4, 2, "ME", "MA")) {
737             result.appendAlternate('R');
738         } else {
739             result.append('R');
740         }
741         return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
742     }
743 
744     //-- BEGIN CONDITIONS --//
745 
746     /**
747      * Handles 'S' cases.
748      */
749     private int handleS(final String value, final DoubleMetaphoneResult result, int index,
750                         final boolean slavoGermanic) {
751         if (contains(value, index - 1, 3, "ISL", "YSL")) {
752             //-- special cases "island", "isle", "carlisle", "carlysle" --//
753             index++;
754         } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
755             //-- special case "sugar-" --//
756             result.append('X', 'S');
757             index++;
758         } else if (contains(value, index, 2, "SH")) {
759             if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
760                 //-- germanic --//
761                 result.append('S');
762             } else {
763                 result.append('X');
764             }
765             index += 2;
766         } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
767             //-- Italian and Armenian --//
768             if (slavoGermanic) {
769                 result.append('S');
770             } else {
771                 result.append('S', 'X');
772             }
773             index += 3;
774         } else if (index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W") ||
775                    contains(value, index + 1, 1, "Z")) {
776             //-- german & anglicisations, e.g. "smith" match "schmidt" //
777             // "snider" match "schneider" --//
778             //-- also, -sz- in slavic language although in hungarian it //
779             //   is pronounced "s" --//
780             result.append('S', 'X');
781             index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
782         } else if (contains(value, index, 2, "SC")) {
783             index = handleSC(value, result, index);
784         } else {
785             if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
786                 //-- french e.g. "resnais", "artois" --//
787                 result.appendAlternate('S');
788             } else {
789                 result.append('S');
790             }
791             index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
792         }
793         return index;
794     }
795 
796     /**
797      * Handles 'SC' cases.
798      */
799     private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) {
800         if (charAt(value, index + 2) == 'H') {
801             //-- Schlesinger's rule --//
802             if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
803                 //-- Dutch origin, e.g. "school", "schooner" --//
804                 if (contains(value, index + 3, 2, "ER", "EN")) {
805                     //-- "schermerhorn", "schenker" --//
806                     result.append("X", "SK");
807                 } else {
808                     result.append("SK");
809                 }
810             } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
811                 result.append('X', 'S');
812             } else {
813                 result.append('X');
814             }
815         } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
816             result.append('S');
817         } else {
818             result.append("SK");
819         }
820         return index + 3;
821     }
822 
823     /**
824      * Handles 'T' cases.
825      */
826     private int handleT(final String value, final DoubleMetaphoneResult result, int index) {
827         if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) {
828             result.append('X');
829             index += 3;
830         } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
831             if (contains(value, index + 2, 2, "OM", "AM") ||
832                 //-- special case "thomas", "thames" or germanic --//
833                 contains(value, 0, 4, "VAN ", "VON ") ||
834                 contains(value, 0, 3, "SCH")) {
835                 result.append('T');
836             } else {
837                 result.append('0', 'T');
838             }
839             index += 2;
840         } else {
841             result.append('T');
842             index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
843         }
844         return index;
845     }
846 
847     /**
848      * Handles 'W' cases.
849      */
850     private int handleW(final String value, final DoubleMetaphoneResult result, int index) {
851         if (contains(value, index, 2, "WR")) {
852             //-- can also be in middle of word --//
853             result.append('R');
854             index += 2;
855         } else if (index == 0 && (isVowel(charAt(value, index + 1)) ||
856                            contains(value, index, 2, "WH"))) {
857             if (isVowel(charAt(value, index + 1))) {
858                 //-- Wasserman should match Vasserman --//
859                 result.append('A', 'F');
860             } else {
861                 //-- need Uomo to match Womo --//
862                 result.append('A');
863             }
864             index++;
865         } else if (index == value.length() - 1 && isVowel(charAt(value, index - 1)) ||
866                    contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
867                    contains(value, 0, 3, "SCH")) {
868             //-- Arnow should match Arnoff --//
869             result.appendAlternate('F');
870             index++;
871         } else if (contains(value, index, 4, "WICZ", "WITZ")) {
872             //-- Polish e.g. "filipowicz" --//
873             result.append("TS", "FX");
874             index += 4;
875         } else {
876             index++;
877         }
878         return index;
879     }
880 
881     /**
882      * Handles 'X' cases.
883      */
884     private int handleX(final String value, final DoubleMetaphoneResult result, int index) {
885         if (index == 0) {
886             result.append('S');
887             index++;
888         } else {
889             if (!(index == value.length() - 1 &&
890                   (contains(value, index - 3, 3, "IAU", "EAU") ||
891                    contains(value, index - 2, 2, "AU", "OU")))) {
892                 //-- French e.g. breaux --//
893                 result.append("KS");
894             }
895             index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
896         }
897         return index;
898     }
899 
    //-- 'Z' handler, comparison methods and private helper predicates --//
901 
902     /**
903      * Handles 'Z' cases.
904      */
905     private int handleZ(final String value, final DoubleMetaphoneResult result, int index,
906                         final boolean slavoGermanic) {
907         if (charAt(value, index + 1) == 'H') {
908             //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
909             result.append('J');
910             index += 2;
911         } else {
912             if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
913                 slavoGermanic && index > 0 && charAt(value, index - 1) != 'T') {
914                 result.append("S", "TS");
915             } else {
916                 result.append('S');
917             }
918             index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
919         }
920         return index;
921     }
922 
923     /**
924      * Check if the Double Metaphone values of two {@code String} values
925      * are equal.
926      *
927      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
928      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
929      * @return {@code true} if the encoded {@code String}s are equal;
930      *          {@code false} otherwise.
931      * @see #isDoubleMetaphoneEqual(String,String,boolean)
932      */
933     public boolean isDoubleMetaphoneEqual(final String value1, final String value2) {
934         return isDoubleMetaphoneEqual(value1, value2, false);
935     }
936 
937     /**
938      * Check if the Double Metaphone values of two {@code String} values
939      * are equal, optionally using the alternate value.
940      *
941      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
942      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
943      * @param alternate use the alternate value if {@code true}.
944      * @return {@code true} if the encoded {@code String}s are equal;
945      *          {@code false} otherwise.
946      */
947     public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) {
948         return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate));
949     }
950 
951     /**
952      * Determines whether or not the value starts with a silent letter.  It will
953      * return {@code true} if the value starts with any of 'GN', 'KN',
954      * 'PN', 'WR' or 'PS'.
955      */
956     private boolean isSilentStart(final String value) {
957         boolean result = false;
958         for (final String element : SILENT_START) {
959             if (value.startsWith(element)) {
960                 result = true;
961                 break;
962             }
963         }
964         return result;
965     }
966 
967     /**
968      * Determines whether or not a value is of slavo-germanic origin. A value is
969      * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
970      */
971     private boolean isSlavoGermanic(final String value) {
972         return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
973                 value.contains("CZ") || value.contains("WITZ");
974     }
975 
976     /**
977      * Determines whether or not a character is a vowel or not
978      */
979     private boolean isVowel(final char ch) {
980         return VOWELS.indexOf(ch) != -1;
981     }
982 
    //-- Configuration setter --//
984 
985     /**
986      * Sets the maxCodeLen.
987      * @param maxCodeLen The maxCodeLen to set
988      */
989     public void setMaxCodeLen(final int maxCodeLen) {
990         this.maxCodeLen = maxCodeLen;
991     }
992 }