001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022import org.apache.commons.codec.binary.StringUtils;
023
024/**
 * Encodes a string into a double metaphone value. This implementation is based on the algorithm by <CITE>Lawrence
 * Philips</CITE>.
027 * <p>
 * This class is conditionally thread-safe. The instance field for the maximum code length is mutable (see
 * {@link #setMaxCodeLen(int)}) but is not volatile, and accesses to it are not synchronized. If an instance of the
 * class is shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe
 * publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup.
032 *
033 * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
034 * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
035 *
036 */
037public class DoubleMetaphone implements StringEncoder {
038
039    /**
040     * "Vowels" to test for
041     */
042    private static final String VOWELS = "AEIOUY";
043
    /**
     * Prefixes when present which are not pronounced
     */
    private static final String[] SILENT_START =
        { "GN", "KN", "PN", "WR", "PS" };
    /**
     * Strings tested against the position two past a "CH" by {@code conditionCH1}
     * (the trailing entry is a single space).
     */
    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
        { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
    /**
     * Two-letter strings tested against the position right after a 'G' at index 0 by {@code handleG}.
     */
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
        { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
    /**
     * Single letters tested against the position right after a 'J' by {@code handleJ}.
     */
    private static final String[] L_T_K_S_N_M_B_Z =
        { "L", "T", "K", "S", "N", "M", "B", "Z" };
055
056    /**
057     * Maximum length of an encoding, default is 4
058     */
059    private int maxCodeLen = 4;
060
061    /**
062     * Creates an instance of this DoubleMetaphone encoder
063     */
064    public DoubleMetaphone() {
065        super();
066    }
067
068    /**
069     * Encode a value with Double Metaphone.
070     *
071     * @param value String to encode
072     * @return an encoded string
073     */
074    public String doubleMetaphone(final String value) {
075        return doubleMetaphone(value, false);
076    }
077
078    /**
079     * Encode a value with Double Metaphone, optionally using the alternate encoding.
080     *
081     * @param value String to encode
082     * @param alternate use alternate encode
083     * @return an encoded string
084     */
085    public String doubleMetaphone(String value, final boolean alternate) {
086        value = cleanInput(value);
087        if (value == null) {
088            return null;
089        }
090
091        final boolean slavoGermanic = isSlavoGermanic(value);
092        int index = isSilentStart(value) ? 1 : 0;
093
094        final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
095
096        while (!result.isComplete() && index <= value.length() - 1) {
097            switch (value.charAt(index)) {
098            case 'A':
099            case 'E':
100            case 'I':
101            case 'O':
102            case 'U':
103            case 'Y':
104                index = handleAEIOUY(result, index);
105                break;
106            case 'B':
107                result.append('P');
108                index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
109                break;
110            case '\u00C7':
111                // A C with a Cedilla
112                result.append('S');
113                index++;
114                break;
115            case 'C':
116                index = handleC(value, result, index);
117                break;
118            case 'D':
119                index = handleD(value, result, index);
120                break;
121            case 'F':
122                result.append('F');
123                index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
124                break;
125            case 'G':
126                index = handleG(value, result, index, slavoGermanic);
127                break;
128            case 'H':
129                index = handleH(value, result, index);
130                break;
131            case 'J':
132                index = handleJ(value, result, index, slavoGermanic);
133                break;
134            case 'K':
135                result.append('K');
136                index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
137                break;
138            case 'L':
139                index = handleL(value, result, index);
140                break;
141            case 'M':
142                result.append('M');
143                index = conditionM0(value, index) ? index + 2 : index + 1;
144                break;
145            case 'N':
146                result.append('N');
147                index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
148                break;
149            case '\u00D1':
150                // N with a tilde (spanish ene)
151                result.append('N');
152                index++;
153                break;
154            case 'P':
155                index = handleP(value, result, index);
156                break;
157            case 'Q':
158                result.append('K');
159                index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
160                break;
161            case 'R':
162                index = handleR(value, result, index, slavoGermanic);
163                break;
164            case 'S':
165                index = handleS(value, result, index, slavoGermanic);
166                break;
167            case 'T':
168                index = handleT(value, result, index);
169                break;
170            case 'V':
171                result.append('F');
172                index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
173                break;
174            case 'W':
175                index = handleW(value, result, index);
176                break;
177            case 'X':
178                index = handleX(value, result, index);
179                break;
180            case 'Z':
181                index = handleZ(value, result, index, slavoGermanic);
182                break;
183            default:
184                index++;
185                break;
186            }
187        }
188
189        return alternate ? result.getAlternate() : result.getPrimary();
190    }
191
192    /**
193     * Encode the value using DoubleMetaphone.  It will only work if
194     * {@code obj} is a {@code String} (like {@code Metaphone}).
195     *
196     * @param obj Object to encode (should be of type String)
197     * @return An encoded Object (will be of type String)
198     * @throws EncoderException encode parameter is not of type String
199     */
200    @Override
201    public Object encode(final Object obj) throws EncoderException {
202        if (!(obj instanceof String)) {
203            throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
204        }
205        return doubleMetaphone((String) obj);
206    }
207
208    /**
209     * Encode the value using DoubleMetaphone.
210     *
211     * @param value String to encode
212     * @return An encoded String
213     */
214    @Override
215    public String encode(final String value) {
216        return doubleMetaphone(value);
217    }
218
219    /**
220     * Check if the Double Metaphone values of two {@code String} values
221     * are equal.
222     *
223     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
224     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
225     * @return {@code true} if the encoded {@code String}s are equal;
226     *          {@code false} otherwise.
227     * @see #isDoubleMetaphoneEqual(String,String,boolean)
228     */
229    public boolean isDoubleMetaphoneEqual(final String value1, final String value2) {
230        return isDoubleMetaphoneEqual(value1, value2, false);
231    }
232
233    /**
234     * Check if the Double Metaphone values of two {@code String} values
235     * are equal, optionally using the alternate value.
236     *
237     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
238     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
239     * @param alternate use the alternate value if {@code true}.
240     * @return {@code true} if the encoded {@code String}s are equal;
241     *          {@code false} otherwise.
242     */
243    public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) {
244        return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate));
245    }
246
247    /**
248     * Returns the maxCodeLen.
249     * @return int
250     */
251    public int getMaxCodeLen() {
252        return this.maxCodeLen;
253    }
254
    /**
     * Sets the maxCodeLen, i.e. the maximum length of an encoding produced by this instance.
     * <p>
     * The value is stored as given; no range check is performed here. The write is a plain,
     * unsynchronized field assignment.
     *
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(final int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }
262
263    //-- BEGIN HANDLERS --//
264
265    /**
266     * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
267     */
268    private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) {
269        if (index == 0) {
270            result.append('A');
271        }
272        return index + 1;
273    }
274
275    /**
276     * Handles 'C' cases.
277     */
278    private int handleC(final String value, final DoubleMetaphoneResult result, int index) {
279        if (conditionC0(value, index)) {  // very confusing, moved out
280            result.append('K');
281            index += 2;
282        } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
283            result.append('S');
284            index += 2;
285        } else if (contains(value, index, 2, "CH")) {
286            index = handleCH(value, result, index);
287        } else if (contains(value, index, 2, "CZ") &&
288                   !contains(value, index - 2, 4, "WICZ")) {
289            //-- "Czerny" --//
290            result.append('S', 'X');
291            index += 2;
292        } else if (contains(value, index + 1, 3, "CIA")) {
293            //-- "focaccia" --//
294            result.append('X');
295            index += 3;
296        } else if (contains(value, index, 2, "CC") &&
297                   !(index == 1 && charAt(value, 0) == 'M')) {
298            //-- double "cc" but not "McClelland" --//
299            return handleCC(value, result, index);
300        } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
301            result.append('K');
302            index += 2;
303        } else if (contains(value, index, 2, "CI", "CE", "CY")) {
304            //-- Italian vs. English --//
305            if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
306                result.append('S', 'X');
307            } else {
308                result.append('S');
309            }
310            index += 2;
311        } else {
312            result.append('K');
313            if (contains(value, index + 1, 2, " C", " Q", " G")) {
314                //-- Mac Caffrey, Mac Gregor --//
315                index += 3;
316            } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
317                       !contains(value, index + 1, 2, "CE", "CI")) {
318                index += 2;
319            } else {
320                index++;
321            }
322        }
323
324        return index;
325    }
326
327    /**
328     * Handles 'CC' cases.
329     */
330    private int handleCC(final String value, final DoubleMetaphoneResult result, int index) {
331        if (contains(value, index + 2, 1, "I", "E", "H") &&
332            !contains(value, index + 2, 2, "HU")) {
333            //-- "bellocchio" but not "bacchus" --//
334            if ((index == 1 && charAt(value, index - 1) == 'A') ||
335                contains(value, index - 1, 5, "UCCEE", "UCCES")) {
336                //-- "accident", "accede", "succeed" --//
337                result.append("KS");
338            } else {
339                //-- "bacci", "bertucci", other Italian --//
340                result.append('X');
341            }
342            index += 3;
343        } else {    // Pierce's rule
344            result.append('K');
345            index += 2;
346        }
347
348        return index;
349    }
350
351    /**
352     * Handles 'CH' cases.
353     */
354    private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) {
355        if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
356            result.append('K', 'X');
357            return index + 2;
358        } else if (conditionCH0(value, index)) {
359            //-- Greek roots ("chemistry", "chorus", etc.) --//
360            result.append('K');
361            return index + 2;
362        } else if (conditionCH1(value, index)) {
363            //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
364            result.append('K');
365            return index + 2;
366        } else {
367            if (index > 0) {
368                if (contains(value, 0, 2, "MC")) {
369                    result.append('K');
370                } else {
371                    result.append('X', 'K');
372                }
373            } else {
374                result.append('X');
375            }
376            return index + 2;
377        }
378    }
379
380    /**
381     * Handles 'D' cases.
382     */
383    private int handleD(final String value, final DoubleMetaphoneResult result, int index) {
384        if (contains(value, index, 2, "DG")) {
385            //-- "Edge" --//
386            if (contains(value, index + 2, 1, "I", "E", "Y")) {
387                result.append('J');
388                index += 3;
389                //-- "Edgar" --//
390            } else {
391                result.append("TK");
392                index += 2;
393            }
394        } else if (contains(value, index, 2, "DT", "DD")) {
395            result.append('T');
396            index += 2;
397        } else {
398            result.append('T');
399            index++;
400        }
401        return index;
402    }
403
404    /**
405     * Handles 'G' cases.
406     */
407    private int handleG(final String value, final DoubleMetaphoneResult result, int index,
408                        final boolean slavoGermanic) {
409        if (charAt(value, index + 1) == 'H') {
410            index = handleGH(value, result, index);
411        } else if (charAt(value, index + 1) == 'N') {
412            if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
413                result.append("KN", "N");
414            } else if (!contains(value, index + 2, 2, "EY") &&
415                       charAt(value, index + 1) != 'Y' && !slavoGermanic) {
416                result.append("N", "KN");
417            } else {
418                result.append("KN");
419            }
420            index = index + 2;
421        } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
422            result.append("KL", "L");
423            index += 2;
424        } else if (index == 0 &&
425                   (charAt(value, index + 1) == 'Y' ||
426                    contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
427            //-- -ges-, -gep-, -gel-, -gie- at beginning --//
428            result.append('K', 'J');
429            index += 2;
430        } else if ((contains(value, index + 1, 2, "ER") ||
431                    charAt(value, index + 1) == 'Y') &&
432                   !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
433                   !contains(value, index - 1, 1, "E", "I") &&
434                   !contains(value, index - 1, 3, "RGY", "OGY")) {
435            //-- -ger-, -gy- --//
436            result.append('K', 'J');
437            index += 2;
438        } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
439                   contains(value, index - 1, 4, "AGGI", "OGGI")) {
440            //-- Italian "biaggi" --//
441            if (contains(value, 0 ,4, "VAN ", "VON ") ||
442                contains(value, 0, 3, "SCH") ||
443                contains(value, index + 1, 2, "ET")) {
444                //-- obvious germanic --//
445                result.append('K');
446            } else if (contains(value, index + 1, 3, "IER")) {
447                result.append('J');
448            } else {
449                result.append('J', 'K');
450            }
451            index += 2;
452        } else if (charAt(value, index + 1) == 'G') {
453            index += 2;
454            result.append('K');
455        } else {
456            index++;
457            result.append('K');
458        }
459        return index;
460    }
461
462    /**
463     * Handles 'GH' cases.
464     */
465    private int handleGH(final String value, final DoubleMetaphoneResult result, int index) {
466        if (index > 0 && !isVowel(charAt(value, index - 1))) {
467            result.append('K');
468            index += 2;
469        } else if (index == 0) {
470            if (charAt(value, index + 2) == 'I') {
471                result.append('J');
472            } else {
473                result.append('K');
474            }
475            index += 2;
476        } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
477                   (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
478                   (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
479            //-- Parker's rule (with some further refinements) - "hugh"
480            index += 2;
481        } else {
482            if (index > 2 && charAt(value, index - 1) == 'U' &&
483                contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
484                //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
485                result.append('F');
486            } else if (index > 0 && charAt(value, index - 1) != 'I') {
487                result.append('K');
488            }
489            index += 2;
490        }
491        return index;
492    }
493
494    /**
495     * Handles 'H' cases.
496     */
497    private int handleH(final String value, final DoubleMetaphoneResult result, int index) {
498        //-- only keep if first & before vowel or between 2 vowels --//
499        if ((index == 0 || isVowel(charAt(value, index - 1))) &&
500            isVowel(charAt(value, index + 1))) {
501            result.append('H');
502            index += 2;
503            //-- also takes car of "HH" --//
504        } else {
505            index++;
506        }
507        return index;
508    }
509
510    /**
511     * Handles 'J' cases.
512     */
513    private int handleJ(final String value, final DoubleMetaphoneResult result, int index,
514                        final boolean slavoGermanic) {
515        if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
516                //-- obvious Spanish, "Jose", "San Jacinto" --//
517                if ((index == 0 && (charAt(value, index + 4) == ' ') ||
518                     value.length() == 4) || contains(value, 0, 4, "SAN ")) {
519                    result.append('H');
520                } else {
521                    result.append('J', 'H');
522                }
523                index++;
524            } else {
525                if (index == 0 && !contains(value, index, 4, "JOSE")) {
526                    result.append('J', 'A');
527                } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
528                           (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
529                    result.append('J', 'H');
530                } else if (index == value.length() - 1) {
531                    result.append('J', ' ');
532                } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
533                           !contains(value, index - 1, 1, "S", "K", "L")) {
534                    result.append('J');
535                }
536
537                if (charAt(value, index + 1) == 'J') {
538                    index += 2;
539                } else {
540                    index++;
541                }
542            }
543        return index;
544    }
545
546    /**
547     * Handles 'L' cases.
548     */
549    private int handleL(final String value, final DoubleMetaphoneResult result, int index) {
550        if (charAt(value, index + 1) == 'L') {
551            if (conditionL0(value, index)) {
552                result.appendPrimary('L');
553            } else {
554                result.append('L');
555            }
556            index += 2;
557        } else {
558            index++;
559            result.append('L');
560        }
561        return index;
562    }
563
564    /**
565     * Handles 'P' cases.
566     */
567    private int handleP(final String value, final DoubleMetaphoneResult result, int index) {
568        if (charAt(value, index + 1) == 'H') {
569            result.append('F');
570            index += 2;
571        } else {
572            result.append('P');
573            index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
574        }
575        return index;
576    }
577
578    /**
579     * Handles 'R' cases.
580     */
581    private int handleR(final String value, final DoubleMetaphoneResult result, final int index,
582                        final boolean slavoGermanic) {
583        if (index == value.length() - 1 && !slavoGermanic &&
584            contains(value, index - 2, 2, "IE") &&
585            !contains(value, index - 4, 2, "ME", "MA")) {
586            result.appendAlternate('R');
587        } else {
588            result.append('R');
589        }
590        return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
591    }
592
593    /**
594     * Handles 'S' cases.
595     */
596    private int handleS(final String value, final DoubleMetaphoneResult result, int index,
597                        final boolean slavoGermanic) {
598        if (contains(value, index - 1, 3, "ISL", "YSL")) {
599            //-- special cases "island", "isle", "carlisle", "carlysle" --//
600            index++;
601        } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
602            //-- special case "sugar-" --//
603            result.append('X', 'S');
604            index++;
605        } else if (contains(value, index, 2, "SH")) {
606            if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
607                //-- germanic --//
608                result.append('S');
609            } else {
610                result.append('X');
611            }
612            index += 2;
613        } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
614            //-- Italian and Armenian --//
615            if (slavoGermanic) {
616                result.append('S');
617            } else {
618                result.append('S', 'X');
619            }
620            index += 3;
621        } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) ||
622                   contains(value, index + 1, 1, "Z")) {
623            //-- german & anglicisations, e.g. "smith" match "schmidt" //
624            // "snider" match "schneider" --//
625            //-- also, -sz- in slavic language although in hungarian it //
626            //   is pronounced "s" --//
627            result.append('S', 'X');
628            index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
629        } else if (contains(value, index, 2, "SC")) {
630            index = handleSC(value, result, index);
631        } else {
632            if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) {
633                //-- french e.g. "resnais", "artois" --//
634                result.appendAlternate('S');
635            } else {
636                result.append('S');
637            }
638            index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
639        }
640        return index;
641    }
642
643    /**
644     * Handles 'SC' cases.
645     */
646    private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) {
647        if (charAt(value, index + 2) == 'H') {
648            //-- Schlesinger's rule --//
649            if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
650                //-- Dutch origin, e.g. "school", "schooner" --//
651                if (contains(value, index + 3, 2, "ER", "EN")) {
652                    //-- "schermerhorn", "schenker" --//
653                    result.append("X", "SK");
654                } else {
655                    result.append("SK");
656                }
657            } else {
658                if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
659                    result.append('X', 'S');
660                } else {
661                    result.append('X');
662                }
663            }
664        } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
665            result.append('S');
666        } else {
667            result.append("SK");
668        }
669        return index + 3;
670    }
671
672    /**
673     * Handles 'T' cases.
674     */
675    private int handleT(final String value, final DoubleMetaphoneResult result, int index) {
676        if (contains(value, index, 4, "TION")) {
677            result.append('X');
678            index += 3;
679        } else if (contains(value, index, 3, "TIA", "TCH")) {
680            result.append('X');
681            index += 3;
682        } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) {
683            if (contains(value, index + 2, 2, "OM", "AM") ||
684                //-- special case "thomas", "thames" or germanic --//
685                contains(value, 0, 4, "VAN ", "VON ") ||
686                contains(value, 0, 3, "SCH")) {
687                result.append('T');
688            } else {
689                result.append('0', 'T');
690            }
691            index += 2;
692        } else {
693            result.append('T');
694            index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
695        }
696        return index;
697    }
698
699    /**
700     * Handles 'W' cases.
701     */
702    private int handleW(final String value, final DoubleMetaphoneResult result, int index) {
703        if (contains(value, index, 2, "WR")) {
704            //-- can also be in middle of word --//
705            result.append('R');
706            index += 2;
707        } else {
708            if (index == 0 && (isVowel(charAt(value, index + 1)) ||
709                               contains(value, index, 2, "WH"))) {
710                if (isVowel(charAt(value, index + 1))) {
711                    //-- Wasserman should match Vasserman --//
712                    result.append('A', 'F');
713                } else {
714                    //-- need Uomo to match Womo --//
715                    result.append('A');
716                }
717                index++;
718            } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
719                       contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
720                       contains(value, 0, 3, "SCH")) {
721                //-- Arnow should match Arnoff --//
722                result.appendAlternate('F');
723                index++;
724            } else if (contains(value, index, 4, "WICZ", "WITZ")) {
725                //-- Polish e.g. "filipowicz" --//
726                result.append("TS", "FX");
727                index += 4;
728            } else {
729                index++;
730            }
731        }
732        return index;
733    }
734
735    /**
736     * Handles 'X' cases.
737     */
738    private int handleX(final String value, final DoubleMetaphoneResult result, int index) {
739        if (index == 0) {
740            result.append('S');
741            index++;
742        } else {
743            if (!((index == value.length() - 1) &&
744                  (contains(value, index - 3, 3, "IAU", "EAU") ||
745                   contains(value, index - 2, 2, "AU", "OU")))) {
746                //-- French e.g. breaux --//
747                result.append("KS");
748            }
749            index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
750        }
751        return index;
752    }
753
754    /**
755     * Handles 'Z' cases.
756     */
757    private int handleZ(final String value, final DoubleMetaphoneResult result, int index,
758                        final boolean slavoGermanic) {
759        if (charAt(value, index + 1) == 'H') {
760            //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
761            result.append('J');
762            index += 2;
763        } else {
764            if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
765                (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
766                result.append("S", "TS");
767            } else {
768                result.append('S');
769            }
770            index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
771        }
772        return index;
773    }
774
775    //-- BEGIN CONDITIONS --//
776
777    /**
778     * Complex condition 0 for 'C'.
779     */
780    private boolean conditionC0(final String value, final int index) {
781        if (contains(value, index, 4, "CHIA")) {
782            return true;
783        } else if (index <= 1) {
784            return false;
785        } else if (isVowel(charAt(value, index - 2))) {
786            return false;
787        } else if (!contains(value, index - 1, 3, "ACH")) {
788            return false;
789        } else {
790            final char c = charAt(value, index + 2);
791            return (c != 'I' && c != 'E') ||
792                    contains(value, index - 2, 6, "BACHER", "MACHER");
793        }
794    }
795
796    /**
797     * Complex condition 0 for 'CH'.
798     */
799    private boolean conditionCH0(final String value, final int index) {
800        if (index != 0) {
801            return false;
802        } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
803                   !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
804            return false;
805        } else if (contains(value, 0, 5, "CHORE")) {
806            return false;
807        } else {
808            return true;
809        }
810    }
811
812    /**
813     * Complex condition 1 for 'CH'.
814     */
815    private boolean conditionCH1(final String value, final int index) {
816        return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) ||
817                contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
818                contains(value, index + 2, 1, "T", "S") ||
819                ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
820                 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
821    }
822
823    /**
824     * Complex condition 0 for 'L'.
825     */
826    private boolean conditionL0(final String value, final int index) {
827        if (index == value.length() - 3 &&
828            contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
829            return true;
830        } else if ((contains(value, value.length() - 2, 2, "AS", "OS") ||
831                    contains(value, value.length() - 1, 1, "A", "O")) &&
832                   contains(value, index - 1, 4, "ALLE")) {
833            return true;
834        } else {
835            return false;
836        }
837    }
838
839    /**
840     * Complex condition 0 for 'M'.
841     */
842    private boolean conditionM0(final String value, final int index) {
843        if (charAt(value, index + 1) == 'M') {
844            return true;
845        }
846        return contains(value, index - 1, 3, "UMB") &&
847               ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER"));
848    }
849
850    //-- BEGIN HELPER FUNCTIONS --//
851
852    /**
853     * Determines whether or not a value is of slavo-germanic origin. A value is
854     * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
855     */
856    private boolean isSlavoGermanic(final String value) {
857        return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
858            value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
859    }
860
861    /**
862     * Determines whether or not a character is a vowel or not
863     */
864    private boolean isVowel(final char ch) {
865        return VOWELS.indexOf(ch) != -1;
866    }
867
868    /**
869     * Determines whether or not the value starts with a silent letter.  It will
870     * return {@code true} if the value starts with any of 'GN', 'KN',
871     * 'PN', 'WR' or 'PS'.
872     */
873    private boolean isSilentStart(final String value) {
874        boolean result = false;
875        for (final String element : SILENT_START) {
876            if (value.startsWith(element)) {
877                result = true;
878                break;
879            }
880        }
881        return result;
882    }
883
884    /**
885     * Cleans the input.
886     */
887    private String cleanInput(String input) {
888        if (input == null) {
889            return null;
890        }
891        input = input.trim();
892        if (input.length() == 0) {
893            return null;
894        }
895        return input.toUpperCase(java.util.Locale.ENGLISH);
896    }
897
898    /*
899     * Gets the character at index {@code index} if available, otherwise
900     * it returns {@code Character.MIN_VALUE} so that there is some sort
901     * of a default.
902     */
903    protected char charAt(final String value, final int index) {
904        if (index < 0 || index >= value.length()) {
905            return Character.MIN_VALUE;
906        }
907        return value.charAt(index);
908    }
909
910    /*
911     * Determines whether {@code value} contains any of the criteria starting at index {@code start} and
912     * matching up to length {@code length}.
913     */
914    protected static boolean contains(final String value, final int start, final int length,
915                                      final String... criteria) {
916        boolean result = false;
917        if (start >= 0 && start + length <= value.length()) {
918            final String target = value.substring(start, start + length);
919
920            for (final String element : criteria) {
921                if (target.equals(element)) {
922                    result = true;
923                    break;
924                }
925            }
926        }
927        return result;
928    }
929
930    //-- BEGIN INNER CLASSES --//
931
932    /**
933     * Inner class for storing results, since there is the optional alternate encoding.
934     */
935    public class DoubleMetaphoneResult {
936
937        private final StringBuilder primary = new StringBuilder(getMaxCodeLen());
938        private final StringBuilder alternate = new StringBuilder(getMaxCodeLen());
939        private final int maxLength;
940
941        public DoubleMetaphoneResult(final int maxLength) {
942            this.maxLength = maxLength;
943        }
944
945        public void append(final char value) {
946            appendPrimary(value);
947            appendAlternate(value);
948        }
949
950        public void append(final char primary, final char alternate) {
951            appendPrimary(primary);
952            appendAlternate(alternate);
953        }
954
955        public void appendPrimary(final char value) {
956            if (this.primary.length() < this.maxLength) {
957                this.primary.append(value);
958            }
959        }
960
961        public void appendAlternate(final char value) {
962            if (this.alternate.length() < this.maxLength) {
963                this.alternate.append(value);
964            }
965        }
966
967        public void append(final String value) {
968            appendPrimary(value);
969            appendAlternate(value);
970        }
971
972        public void append(final String primary, final String alternate) {
973            appendPrimary(primary);
974            appendAlternate(alternate);
975        }
976
977        public void appendPrimary(final String value) {
978            final int addChars = this.maxLength - this.primary.length();
979            if (value.length() <= addChars) {
980                this.primary.append(value);
981            } else {
982                this.primary.append(value.substring(0, addChars));
983            }
984        }
985
986        public void appendAlternate(final String value) {
987            final int addChars = this.maxLength - this.alternate.length();
988            if (value.length() <= addChars) {
989                this.alternate.append(value);
990            } else {
991                this.alternate.append(value.substring(0, addChars));
992            }
993        }
994
995        public String getPrimary() {
996            return this.primary.toString();
997        }
998
999        public String getAlternate() {
1000            return this.alternate.toString();
1001        }
1002
1003        public boolean isComplete() {
1004            return this.primary.length() >= this.maxLength &&
1005                   this.alternate.length() >= this.maxLength;
1006        }
1007    }
1008}