001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
 * Encodes a string into a double metaphone value. This implementation is based on the algorithm by <CITE>Lawrence
 * Philips</CITE>.
 * <p>
 * This class is conditionally thread-safe. The instance field {@link #maxCodeLen} is mutable (see
 * {@link #setMaxCodeLen(int)}) but is not volatile, and accesses are not synchronized. If an instance of the class is
 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication
 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
031     *
032     * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
033     * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
034     *
035     * @version $Id: DoubleMetaphone.html 889935 2013-12-11 05:05:13Z ggregory $
036     */
037    public class DoubleMetaphone implements StringEncoder {
038    
    /**
     * "Vowels" to test for
     */
    private static final String VOWELS = "AEIOUY";

    /**
     * Prefixes when present which are not pronounced
     */
    private static final String[] SILENT_START =
        { "GN", "KN", "PN", "WR", "PS" };

    /**
     * Single-character candidates: L, R, N, M, B, H, F, V, W and space.
     * NOTE(review): not referenced by the handlers in this part of the file; presumably consulted by one of
     * the condition helpers defined further down - confirm before changing.
     */
    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
        { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };

    /**
     * Two-letter sequences that {@code handleG} tests immediately after a 'G' at the start of the input;
     * on a match the 'G' is coded 'K' with alternate 'J'.
     */
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
        { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };

    /**
     * Single letters that {@code handleJ} tests immediately after a 'J' when deciding whether a
     * non-initial, non-final 'J' receives a code.
     */
    private static final String[] L_T_K_S_N_M_B_Z =
        { "L", "T", "K", "S", "N", "M", "B", "Z" };

    /**
     * Maximum length of an encoding, default is 4
     */
    private int maxCodeLen = 4;
060    
061        /**
062         * Creates an instance of this DoubleMetaphone encoder
063         */
064        public DoubleMetaphone() {
065            super();
066        }
067    
068        /**
069         * Encode a value with Double Metaphone.
070         *
071         * @param value String to encode
072         * @return an encoded string
073         */
074        public String doubleMetaphone(String value) {
075            return doubleMetaphone(value, false);
076        }
077    
078        /**
079         * Encode a value with Double Metaphone, optionally using the alternate encoding.
080         *
081         * @param value String to encode
082         * @param alternate use alternate encode
083         * @return an encoded string
084         */
085        public String doubleMetaphone(String value, boolean alternate) {
086            value = cleanInput(value);
087            if (value == null) {
088                return null;
089            }
090    
091            boolean slavoGermanic = isSlavoGermanic(value);
092            int index = isSilentStart(value) ? 1 : 0;
093    
094            DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
095    
096            while (!result.isComplete() && index <= value.length() - 1) {
097                switch (value.charAt(index)) {
098                case 'A':
099                case 'E':
100                case 'I':
101                case 'O':
102                case 'U':
103                case 'Y':
104                    index = handleAEIOUY(result, index);
105                    break;
106                case 'B':
107                    result.append('P');
108                    index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
109                    break;
110                case '\u00C7':
111                    // A C with a Cedilla
112                    result.append('S');
113                    index++;
114                    break;
115                case 'C':
116                    index = handleC(value, result, index);
117                    break;
118                case 'D':
119                    index = handleD(value, result, index);
120                    break;
121                case 'F':
122                    result.append('F');
123                    index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
124                    break;
125                case 'G':
126                    index = handleG(value, result, index, slavoGermanic);
127                    break;
128                case 'H':
129                    index = handleH(value, result, index);
130                    break;
131                case 'J':
132                    index = handleJ(value, result, index, slavoGermanic);
133                    break;
134                case 'K':
135                    result.append('K');
136                    index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
137                    break;
138                case 'L':
139                    index = handleL(value, result, index);
140                    break;
141                case 'M':
142                    result.append('M');
143                    index = conditionM0(value, index) ? index + 2 : index + 1;
144                    break;
145                case 'N':
146                    result.append('N');
147                    index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
148                    break;
149                case '\u00D1':
150                    // N with a tilde (spanish ene)
151                    result.append('N');
152                    index++;
153                    break;
154                case 'P':
155                    index = handleP(value, result, index);
156                    break;
157                case 'Q':
158                    result.append('K');
159                    index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
160                    break;
161                case 'R':
162                    index = handleR(value, result, index, slavoGermanic);
163                    break;
164                case 'S':
165                    index = handleS(value, result, index, slavoGermanic);
166                    break;
167                case 'T':
168                    index = handleT(value, result, index);
169                    break;
170                case 'V':
171                    result.append('F');
172                    index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
173                    break;
174                case 'W':
175                    index = handleW(value, result, index);
176                    break;
177                case 'X':
178                    index = handleX(value, result, index);
179                    break;
180                case 'Z':
181                    index = handleZ(value, result, index, slavoGermanic);
182                    break;
183                default:
184                    index++;
185                    break;
186                }
187            }
188    
189            return alternate ? result.getAlternate() : result.getPrimary();
190        }
191    
192        /**
193         * Encode the value using DoubleMetaphone.  It will only work if
194         * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
195         *
196         * @param obj Object to encode (should be of type String)
197         * @return An encoded Object (will be of type String)
198         * @throws EncoderException encode parameter is not of type String
199         */
200        @Override
201        public Object encode(Object obj) throws EncoderException {
202            if (!(obj instanceof String)) {
203                throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
204            }
205            return doubleMetaphone((String) obj);
206        }
207    
208        /**
209         * Encode the value using DoubleMetaphone.
210         *
211         * @param value String to encode
212         * @return An encoded String
213         */
214        @Override
215        public String encode(String value) {
216            return doubleMetaphone(value);
217        }
218    
219        /**
220         * Check if the Double Metaphone values of two <code>String</code> values
221         * are equal.
222         *
223         * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
224         * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
225         * @return {@code true} if the encoded <code>String</code>s are equal;
226         *          {@code false} otherwise.
227         * @see #isDoubleMetaphoneEqual(String,String,boolean)
228         */
229        public boolean isDoubleMetaphoneEqual(String value1, String value2) {
230            return isDoubleMetaphoneEqual(value1, value2, false);
231        }
232    
233        /**
234         * Check if the Double Metaphone values of two <code>String</code> values
235         * are equal, optionally using the alternate value.
236         *
237         * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
238         * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
239         * @param alternate use the alternate value if {@code true}.
240         * @return {@code true} if the encoded <code>String</code>s are equal;
241         *          {@code false} otherwise.
242         */
243        public boolean isDoubleMetaphoneEqual(String value1, String value2, boolean alternate) {
244            return doubleMetaphone(value1, alternate).equals(doubleMetaphone(value2, alternate));
245        }
246    
247        /**
248         * Returns the maxCodeLen.
249         * @return int
250         */
251        public int getMaxCodeLen() {
252            return this.maxCodeLen;
253        }
254    
    /**
     * Sets the maxCodeLen, the maximum length of the codes produced by this encoder.
     * <p>
     * The value is stored as given; no range check is performed here. The backing field is neither
     * volatile nor written under a lock.
     *
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }
262    
263        //-- BEGIN HANDLERS --//
264    
265        /**
266         * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
267         */
268        private int handleAEIOUY(DoubleMetaphoneResult result, int index) {
269            if (index == 0) {
270                result.append('A');
271            }
272            return index + 1;
273        }
274    
275        /**
276         * Handles 'C' cases.
277         */
278        private int handleC(String value, DoubleMetaphoneResult result, int index) {
279            if (conditionC0(value, index)) {  // very confusing, moved out
280                result.append('K');
281                index += 2;
282            } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
283                result.append('S');
284                index += 2;
285            } else if (contains(value, index, 2, "CH")) {
286                index = handleCH(value, result, index);
287            } else if (contains(value, index, 2, "CZ") &&
288                       !contains(value, index - 2, 4, "WICZ")) {
289                //-- "Czerny" --//
290                result.append('S', 'X');
291                index += 2;
292            } else if (contains(value, index + 1, 3, "CIA")) {
293                //-- "focaccia" --//
294                result.append('X');
295                index += 3;
296            } else if (contains(value, index, 2, "CC") &&
297                       !(index == 1 && charAt(value, 0) == 'M')) {
298                //-- double "cc" but not "McClelland" --//
299                return handleCC(value, result, index);
300            } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
301                result.append('K');
302                index += 2;
303            } else if (contains(value, index, 2, "CI", "CE", "CY")) {
304                //-- Italian vs. English --//
305                if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
306                    result.append('S', 'X');
307                } else {
308                    result.append('S');
309                }
310                index += 2;
311            } else {
312                result.append('K');
313                if (contains(value, index + 1, 2, " C", " Q", " G")) {
314                    //-- Mac Caffrey, Mac Gregor --//
315                    index += 3;
316                } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
317                           !contains(value, index + 1, 2, "CE", "CI")) {
318                    index += 2;
319                } else {
320                    index++;
321                }
322            }
323    
324            return index;
325        }
326    
327        /**
328         * Handles 'CC' cases.
329         */
330        private int handleCC(String value, DoubleMetaphoneResult result, int index) {
331            if (contains(value, index + 2, 1, "I", "E", "H") &&
332                !contains(value, index + 2, 2, "HU")) {
333                //-- "bellocchio" but not "bacchus" --//
334                if ((index == 1 && charAt(value, index - 1) == 'A') ||
335                    contains(value, index - 1, 5, "UCCEE", "UCCES")) {
336                    //-- "accident", "accede", "succeed" --//
337                    result.append("KS");
338                } else {
339                    //-- "bacci", "bertucci", other Italian --//
340                    result.append('X');
341                }
342                index += 3;
343            } else {    // Pierce's rule
344                result.append('K');
345                index += 2;
346            }
347    
348            return index;
349        }
350    
351        /**
352         * Handles 'CH' cases.
353         */
354        private int handleCH(String value, DoubleMetaphoneResult result, int index) {
355            if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
356                result.append('K', 'X');
357                return index + 2;
358            } else if (conditionCH0(value, index)) {
359                //-- Greek roots ("chemistry", "chorus", etc.) --//
360                result.append('K');
361                return index + 2;
362            } else if (conditionCH1(value, index)) {
363                //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
364                result.append('K');
365                return index + 2;
366            } else {
367                if (index > 0) {
368                    if (contains(value, 0, 2, "MC")) {
369                        result.append('K');
370                    } else {
371                        result.append('X', 'K');
372                    }
373                } else {
374                    result.append('X');
375                }
376                return index + 2;
377            }
378        }
379    
380        /**
381         * Handles 'D' cases.
382         */
383        private int handleD(String value, DoubleMetaphoneResult result, int index) {
384            if (contains(value, index, 2, "DG")) {
385                //-- "Edge" --//
386                if (contains(value, index + 2, 1, "I", "E", "Y")) {
387                    result.append('J');
388                    index += 3;
389                    //-- "Edgar" --//
390                } else {
391                    result.append("TK");
392                    index += 2;
393                }
394            } else if (contains(value, index, 2, "DT", "DD")) {
395                result.append('T');
396                index += 2;
397            } else {
398                result.append('T');
399                index++;
400            }
401            return index;
402        }
403    
404        /**
405         * Handles 'G' cases.
406         */
407        private int handleG(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
408            if (charAt(value, index + 1) == 'H') {
409                index = handleGH(value, result, index);
410            } else if (charAt(value, index + 1) == 'N') {
411                if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
412                    result.append("KN", "N");
413                } else if (!contains(value, index + 2, 2, "EY") &&
414                           charAt(value, index + 1) != 'Y' && !slavoGermanic) {
415                    result.append("N", "KN");
416                } else {
417                    result.append("KN");
418                }
419                index = index + 2;
420            } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
421                result.append("KL", "L");
422                index += 2;
423            } else if (index == 0 &&
424                       (charAt(value, index + 1) == 'Y' ||
425                        contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
426                //-- -ges-, -gep-, -gel-, -gie- at beginning --//
427                result.append('K', 'J');
428                index += 2;
429            } else if ((contains(value, index + 1, 2, "ER") ||
430                        charAt(value, index + 1) == 'Y') &&
431                       !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
432                       !contains(value, index - 1, 1, "E", "I") &&
433                       !contains(value, index - 1, 3, "RGY", "OGY")) {
434                //-- -ger-, -gy- --//
435                result.append('K', 'J');
436                index += 2;
437            } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
438                       contains(value, index - 1, 4, "AGGI", "OGGI")) {
439                //-- Italian "biaggi" --//
440                if (contains(value, 0 ,4, "VAN ", "VON ") ||
441                    contains(value, 0, 3, "SCH") ||
442                    contains(value, index + 1, 2, "ET")) {
443                    //-- obvious germanic --//
444                    result.append('K');
445                } else if (contains(value, index + 1, 3, "IER")) {
446                    result.append('J');
447                } else {
448                    result.append('J', 'K');
449                }
450                index += 2;
451            } else if (charAt(value, index + 1) == 'G') {
452                index += 2;
453                result.append('K');
454            } else {
455                index++;
456                result.append('K');
457            }
458            return index;
459        }
460    
461        /**
462         * Handles 'GH' cases.
463         */
464        private int handleGH(String value, DoubleMetaphoneResult result, int index) {
465            if (index > 0 && !isVowel(charAt(value, index - 1))) {
466                result.append('K');
467                index += 2;
468            } else if (index == 0) {
469                if (charAt(value, index + 2) == 'I') {
470                    result.append('J');
471                } else {
472                    result.append('K');
473                }
474                index += 2;
475            } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
476                       (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
477                       (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
478                //-- Parker's rule (with some further refinements) - "hugh"
479                index += 2;
480            } else {
481                if (index > 2 && charAt(value, index - 1) == 'U' &&
482                    contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
483                    //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
484                    result.append('F');
485                } else if (index > 0 && charAt(value, index - 1) != 'I') {
486                    result.append('K');
487                }
488                index += 2;
489            }
490            return index;
491        }
492    
493        /**
494         * Handles 'H' cases.
495         */
496        private int handleH(String value, DoubleMetaphoneResult result, int index) {
497            //-- only keep if first & before vowel or between 2 vowels --//
498            if ((index == 0 || isVowel(charAt(value, index - 1))) &&
499                isVowel(charAt(value, index + 1))) {
500                result.append('H');
501                index += 2;
502                //-- also takes car of "HH" --//
503            } else {
504                index++;
505            }
506            return index;
507        }
508    
509        /**
510         * Handles 'J' cases.
511         */
512        private int handleJ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
513            if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
514                    //-- obvious Spanish, "Jose", "San Jacinto" --//
515                    if ((index == 0 && (charAt(value, index + 4) == ' ') ||
516                         value.length() == 4) || contains(value, 0, 4, "SAN ")) {
517                        result.append('H');
518                    } else {
519                        result.append('J', 'H');
520                    }
521                    index++;
522                } else {
523                    if (index == 0 && !contains(value, index, 4, "JOSE")) {
524                        result.append('J', 'A');
525                    } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
526                               (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
527                        result.append('J', 'H');
528                    } else if (index == value.length() - 1) {
529                        result.append('J', ' ');
530                    } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
531                               !contains(value, index - 1, 1, "S", "K", "L")) {
532                        result.append('J');
533                    }
534    
535                    if (charAt(value, index + 1) == 'J') {
536                        index += 2;
537                    } else {
538                        index++;
539                    }
540                }
541            return index;
542        }
543    
544        /**
545         * Handles 'L' cases.
546         */
547        private int handleL(String value, DoubleMetaphoneResult result, int index) {
548            if (charAt(value, index + 1) == 'L') {
549                if (conditionL0(value, index)) {
550                    result.appendPrimary('L');
551                } else {
552                    result.append('L');
553                }
554                index += 2;
555            } else {
556                index++;
557                result.append('L');
558            }
559            return index;
560        }
561    
562        /**
563         * Handles 'P' cases.
564         */
565        private int handleP(String value, DoubleMetaphoneResult result, int index) {
566            if (charAt(value, index + 1) == 'H') {
567                result.append('F');
568                index += 2;
569            } else {
570                result.append('P');
571                index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
572            }
573            return index;
574        }
575    
576        /**
577         * Handles 'R' cases.
578         */
579        private int handleR(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
580            if (index == value.length() - 1 && !slavoGermanic &&
581                contains(value, index - 2, 2, "IE") &&
582                !contains(value, index - 4, 2, "ME", "MA")) {
583                result.appendAlternate('R');
584            } else {
585                result.append('R');
586            }
587            return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
588        }
589    
590        /**
591         * Handles 'S' cases.
592         */
593        private int handleS(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
594            if (contains(value, index - 1, 3, "ISL", "YSL")) {
595                //-- special cases "island", "isle", "carlisle", "carlysle" --//
596                index++;
597            } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
598                //-- special case "sugar-" --//
599                result.append('X', 'S');
600                index++;
601            } else if (contains(value, index, 2, "SH")) {
602                if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
603                    //-- germanic --//
604                    result.append('S');
605                } else {
606                    result.append('X');
607                }
608                index += 2;
609            } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
610                //-- Italian and Armenian --//
611                if (slavoGermanic) {
612                    result.append('S');
613                } else {
614                    result.append('S', 'X');
615                }
616                index += 3;
617            } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) ||
618                       contains(value, index + 1, 1, "Z")) {
619                //-- german & anglicisations, e.g. "smith" match "schmidt" //
620                // "snider" match "schneider" --//
621                //-- also, -sz- in slavic language altho in hungarian it //
622                //   is pronounced "s" --//
623                result.append('S', 'X');
624                index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
625            } else if (contains(value, index, 2, "SC")) {
626                index = handleSC(value, result, index);
627            } else {
628                if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
629                    //-- french e.g. "resnais", "artois" --//
630                    result.appendAlternate('S');
631                } else {
632                    result.append('S');
633                }
634                index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
635            }
636            return index;
637        }
638    
639        /**
640         * Handles 'SC' cases.
641         */
642        private int handleSC(String value, DoubleMetaphoneResult result, int index) {
643            if (charAt(value, index + 2) == 'H') {
644                //-- Schlesinger's rule --//
645                if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
646                    //-- Dutch origin, e.g. "school", "schooner" --//
647                    if (contains(value, index + 3, 2, "ER", "EN")) {
648                        //-- "schermerhorn", "schenker" --//
649                        result.append("X", "SK");
650                    } else {
651                        result.append("SK");
652                    }
653                } else {
654                    if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
655                        result.append('X', 'S');
656                    } else {
657                        result.append('X');
658                    }
659                }
660            } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
661                result.append('S');
662            } else {
663                result.append("SK");
664            }
665            return index + 3;
666        }
667    
668        /**
669         * Handles 'T' cases.
670         */
671        private int handleT(String value, DoubleMetaphoneResult result, int index) {
672            if (contains(value, index, 4, "TION")) {
673                result.append('X');
674                index += 3;
675            } else if (contains(value, index, 3, "TIA", "TCH")) {
676                result.append('X');
677                index += 3;
678            } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
679                if (contains(value, index + 2, 2, "OM", "AM") ||
680                    //-- special case "thomas", "thames" or germanic --//
681                    contains(value, 0, 4, "VAN ", "VON ") ||
682                    contains(value, 0, 3, "SCH")) {
683                    result.append('T');
684                } else {
685                    result.append('0', 'T');
686                }
687                index += 2;
688            } else {
689                result.append('T');
690                index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
691            }
692            return index;
693        }
694    
695        /**
696         * Handles 'W' cases.
697         */
698        private int handleW(String value, DoubleMetaphoneResult result, int index) {
699            if (contains(value, index, 2, "WR")) {
700                //-- can also be in middle of word --//
701                result.append('R');
702                index += 2;
703            } else {
704                if (index == 0 && (isVowel(charAt(value, index + 1)) ||
705                                   contains(value, index, 2, "WH"))) {
706                    if (isVowel(charAt(value, index + 1))) {
707                        //-- Wasserman should match Vasserman --//
708                        result.append('A', 'F');
709                    } else {
710                        //-- need Uomo to match Womo --//
711                        result.append('A');
712                    }
713                    index++;
714                } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
715                           contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
716                           contains(value, 0, 3, "SCH")) {
717                    //-- Arnow should match Arnoff --//
718                    result.appendAlternate('F');
719                    index++;
720                } else if (contains(value, index, 4, "WICZ", "WITZ")) {
721                    //-- Polish e.g. "filipowicz" --//
722                    result.append("TS", "FX");
723                    index += 4;
724                } else {
725                    index++;
726                }
727            }
728            return index;
729        }
730    
731        /**
732         * Handles 'X' cases.
733         */
734        private int handleX(String value, DoubleMetaphoneResult result, int index) {
735            if (index == 0) {
736                result.append('S');
737                index++;
738            } else {
739                if (!((index == value.length() - 1) &&
740                      (contains(value, index - 3, 3, "IAU", "EAU") ||
741                       contains(value, index - 2, 2, "AU", "OU")))) {
742                    //-- French e.g. breaux --//
743                    result.append("KS");
744                }
745                index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
746            }
747            return index;
748        }
749    
750        /**
751         * Handles 'Z' cases.
752         */
753        private int handleZ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic) {
754            if (charAt(value, index + 1) == 'H') {
755                //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
756                result.append('J');
757                index += 2;
758            } else {
759                if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
760                    (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
761                    result.append("S", "TS");
762                } else {
763                    result.append('S');
764                }
765                index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
766            }
767            return index;
768        }
769    
770        //-- BEGIN CONDITIONS --//
771    
772        /**
773         * Complex condition 0 for 'C'.
774         */
775        private boolean conditionC0(String value, int index) {
776            if (contains(value, index, 4, "CHIA")) {
777                return true;
778            } else if (index <= 1) {
779                return false;
780            } else if (isVowel(charAt(value, index - 2))) {
781                return false;
782            } else if (!contains(value, index - 1, 3, "ACH")) {
783                return false;
784            } else {
785                char c = charAt(value, index + 2);
786                return (c != 'I' && c != 'E') ||
787                        contains(value, index - 2, 6, "BACHER", "MACHER");
788            }
789        }
790    
791        /**
792         * Complex condition 0 for 'CH'.
793         */
794        private boolean conditionCH0(String value, int index) {
795            if (index != 0) {
796                return false;
797            } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
798                       !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
799                return false;
800            } else if (contains(value, 0, 5, "CHORE")) {
801                return false;
802            } else {
803                return true;
804            }
805        }
806    
807        /**
808         * Complex condition 1 for 'CH'.
809         */
810        private boolean conditionCH1(String value, int index) {
811            return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) ||
812                    contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
813                    contains(value, index + 2, 1, "T", "S") ||
814                    ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
815                     (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
816        }
817    
818        /**
819         * Complex condition 0 for 'L'.
820         */
821        private boolean conditionL0(String value, int index) {
822            if (index == value.length() - 3 &&
823                contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
824                return true;
825            } else if ((contains(value, value.length() - 2, 2, "AS", "OS") ||
826                        contains(value, value.length() - 1, 1, "A", "O")) &&
827                       contains(value, index - 1, 4, "ALLE")) {
828                return true;
829            } else {
830                return false;
831            }
832        }
833    
834        /**
835         * Complex condition 0 for 'M'.
836         */
837        private boolean conditionM0(String value, int index) {
838            if (charAt(value, index + 1) == 'M') {
839                return true;
840            }
841            return contains(value, index - 1, 3, "UMB") &&
842                   ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER"));
843        }
844    
845        //-- BEGIN HELPER FUNCTIONS --//
846    
847        /**
848         * Determines whether or not a value is of slavo-germanic orgin. A value is
849         * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
850         */
851        private boolean isSlavoGermanic(String value) {
852            return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
853                value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
854        }
855    
856        /**
857         * Determines whether or not a character is a vowel or not
858         */
859        private boolean isVowel(char ch) {
860            return VOWELS.indexOf(ch) != -1;
861        }
862    
863        /**
864         * Determines whether or not the value starts with a silent letter.  It will
865         * return {@code true} if the value starts with any of 'GN', 'KN',
866         * 'PN', 'WR' or 'PS'.
867         */
868        private boolean isSilentStart(String value) {
869            boolean result = false;
870            for (String element : SILENT_START) {
871                if (value.startsWith(element)) {
872                    result = true;
873                    break;
874                }
875            }
876            return result;
877        }
878    
879        /**
880         * Cleans the input.
881         */
882        private String cleanInput(String input) {
883            if (input == null) {
884                return null;
885            }
886            input = input.trim();
887            if (input.length() == 0) {
888                return null;
889            }
890            return input.toUpperCase(java.util.Locale.ENGLISH);
891        }
892    
893        /**
894         * Gets the character at index <code>index</code> if available, otherwise
895         * it returns <code>Character.MIN_VALUE</code> so that there is some sort
896         * of a default.
897         */
898        protected char charAt(String value, int index) {
899            if (index < 0 || index >= value.length()) {
900                return Character.MIN_VALUE;
901            }
902            return value.charAt(index);
903        }
904    
905        /**
906         * Shortcut method with 1 criteria.
907         */
908        private static boolean contains(String value, int start, int length, String criteria) {
909            return contains(value, start, length, new String[] { criteria });
910        }
911    
912        /**
913         * Shortcut method with 2 criteria.
914         */
915        private static boolean contains(String value, int start, int length,
916                                        String criteria1, String criteria2) {
917            return contains(value, start, length, new String[] { criteria1, criteria2 });
918        }
919    
920        /**
921         * Shortcut method with 3 criteria.
922         */
923        private static boolean contains(String value, int start, int length,
924                                        String criteria1, String criteria2, String criteria3) {
925            return contains(value, start, length, new String[] { criteria1, criteria2, criteria3 });
926        }
927    
928        /**
929         * Shortcut method with 4 criteria.
930         */
931        private static boolean contains(String value, int start, int length,
932                                        String criteria1, String criteria2,
933                                        String criteria3, String criteria4) {
934            return contains(value, start, length,
935                            new String[] { criteria1, criteria2, criteria3, criteria4 });
936        }
937    
938        /**
939         * Shortcut method with 5 criteria.
940         */
941        private static boolean contains(String value, int start, int length,
942                                        String criteria1, String criteria2,
943                                        String criteria3, String criteria4,
944                                        String criteria5) {
945            return contains(value, start, length,
946                            new String[] { criteria1, criteria2, criteria3,
947                                           criteria4, criteria5 });
948        }
949    
950        /**
951         * Shortcut method with 6 criteria.
952         */
953        private static boolean contains(String value, int start, int length,
954                                        String criteria1, String criteria2,
955                                        String criteria3, String criteria4,
956                                        String criteria5, String criteria6) {
957            return contains(value, start, length,
958                            new String[] { criteria1, criteria2, criteria3,
959                                           criteria4, criteria5, criteria6 });
960        }
961    
962        /**
963         * Determines whether <code>value</code> contains any of the criteria starting at index <code>start</code> and
964         * matching up to length <code>length</code>.
965         */
966        protected static boolean contains(String value, int start, int length,
967                                          String[] criteria) {
968            boolean result = false;
969            if (start >= 0 && start + length <= value.length()) {
970                String target = value.substring(start, start + length);
971    
972                for (String element : criteria) {
973                    if (target.equals(element)) {
974                        result = true;
975                        break;
976                    }
977                }
978            }
979            return result;
980        }
981    
982        //-- BEGIN INNER CLASSES --//
983    
984        /**
985         * Inner class for storing results, since there is the optional alternate encoding.
986         */
987        public class DoubleMetaphoneResult {
988    
989            private final StringBuilder primary = new StringBuilder(getMaxCodeLen());
990            private final StringBuilder alternate = new StringBuilder(getMaxCodeLen());
991            private final int maxLength;
992    
993            public DoubleMetaphoneResult(int maxLength) {
994                this.maxLength = maxLength;
995            }
996    
997            public void append(char value) {
998                appendPrimary(value);
999                appendAlternate(value);
1000            }
1001    
1002            public void append(char primary, char alternate) {
1003                appendPrimary(primary);
1004                appendAlternate(alternate);
1005            }
1006    
1007            public void appendPrimary(char value) {
1008                if (this.primary.length() < this.maxLength) {
1009                    this.primary.append(value);
1010                }
1011            }
1012    
1013            public void appendAlternate(char value) {
1014                if (this.alternate.length() < this.maxLength) {
1015                    this.alternate.append(value);
1016                }
1017            }
1018    
1019            public void append(String value) {
1020                appendPrimary(value);
1021                appendAlternate(value);
1022            }
1023    
1024            public void append(String primary, String alternate) {
1025                appendPrimary(primary);
1026                appendAlternate(alternate);
1027            }
1028    
1029            public void appendPrimary(String value) {
1030                int addChars = this.maxLength - this.primary.length();
1031                if (value.length() <= addChars) {
1032                    this.primary.append(value);
1033                } else {
1034                    this.primary.append(value.substring(0, addChars));
1035                }
1036            }
1037    
1038            public void appendAlternate(String value) {
1039                int addChars = this.maxLength - this.alternate.length();
1040                if (value.length() <= addChars) {
1041                    this.alternate.append(value);
1042                } else {
1043                    this.alternate.append(value.substring(0, addChars));
1044                }
1045            }
1046    
1047            public String getPrimary() {
1048                return this.primary.toString();
1049            }
1050    
1051            public String getAlternate() {
1052                return this.alternate.toString();
1053            }
1054    
1055            public boolean isComplete() {
1056                return this.primary.length() >= this.maxLength &&
1057                       this.alternate.length() >= this.maxLength;
1058            }
1059        }
1060    }