001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * </p>
029 * <p>
030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
031 * p 39.</CITE>
032 * </p>
033 * <p>
034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
035 * </p>
036 * <ul>
037 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
038 *  (broken link 4/30/2013) </li>
039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
040 *  (link checked 4/30/2013) </li>
041 * </ul>
042 * <p>
043 * They have had undocumented changes from the originally published algorithm.
044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
045 * </p>
046 * <p>
047 * This class is conditionally thread-safe.
048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
049 * but is not volatile, and accesses are not synchronized.
050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
052 * after initial setup.
053 * </p>
054 */
055public class Metaphone implements StringEncoder {
056
057    /**
058     * Five values in the English language
059     */
060    private static final String VOWELS = "AEIOU";
061
062    /**
063     * Variable used in Metaphone algorithm
064     */
065    private static final String FRONTV = "EIY";
066
067    /**
068     * Variable used in Metaphone algorithm
069     */
070    private static final String VARSON = "CSPTG";
071
072    /**
073     * The max code length for metaphone is 4
074     */
075    private int maxCodeLen = 4;
076
077    /**
078     * Encodes an Object using the metaphone algorithm.  This method
079     * is provided in order to satisfy the requirements of the
080     * Encoder interface, and will throw an EncoderException if the
081     * supplied object is not of type {@link String}.
082     *
083     * @param obj Object to encode
084     * @return An object (or type {@link String}) containing the
085     *         metaphone code which corresponds to the String supplied.
086     * @throws EncoderException if the parameter supplied is not
087     *                          of type {@link String}
088     */
089    @Override
090    public Object encode(final Object obj) throws EncoderException {
091        if (!(obj instanceof String)) {
092            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
093        }
094        return metaphone((String) obj);
095    }
096
097    /**
098     * Encodes a String using the Metaphone algorithm.
099     *
100     * @param str String object to encode
101     * @return The metaphone code corresponding to the String supplied
102     */
103    @Override
104    public String encode(final String str) {
105        return metaphone(str);
106    }
107
108    /**
109     * Returns the maxCodeLen.
110     * @return int
111     */
112    public int getMaxCodeLen() { return this.maxCodeLen; }
113
114    private boolean isLastChar(final int wdsz, final int n) {
115        return n + 1 == wdsz;
116    }
117
118    /**
119     * Tests is the metaphones of two strings are identical.
120     *
121     * @param str1 First of two strings to compare
122     * @param str2 Second of two strings to compare
123     * @return {@code true} if the metaphones of these strings are identical,
124     *        {@code false} otherwise.
125     */
126    public boolean isMetaphoneEqual(final String str1, final String str2) {
127        return metaphone(str1).equals(metaphone(str2));
128    }
129
130    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
131        boolean matches = false;
132        if (index >= 0 && index < string.length() - 1) {
133            matches = string.charAt(index + 1) == c;
134        }
135        return matches;
136    }
137
138    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
139        boolean matches = false;
140        if (index > 0 && index < string.length()) {
141            matches = string.charAt(index - 1) == c;
142        }
143        return matches;
144    }
145
146    private boolean isVowel(final StringBuilder string, final int index) {
147        return VOWELS.indexOf(string.charAt(index)) >= 0;
148    }
149
150    /**
151     * Find the metaphone value of a String. This is similar to the
152     * soundex algorithm, but better at finding similar sounding words.
153     * All input is converted to upper case.
154     * Limitations: Input format is expected to be a single ASCII word
155     * with only characters in the A - Z range, no punctuation or numbers.
156     *
157     * @param txt String to find the metaphone code for
158     * @return A metaphone code corresponding to the String supplied
159     */
160    public String metaphone(final String txt) {
161        boolean hard = false;
162        final int txtLength;
163        if (txt == null || (txtLength = txt.length()) == 0) {
164            return "";
165        }
166        // single character is itself
167        if (txtLength == 1) {
168            return txt.toUpperCase(java.util.Locale.ENGLISH);
169        }
170
171        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
172
173        final StringBuilder local = new StringBuilder(40); // manipulate
174        final StringBuilder code = new StringBuilder(10); // output
175        // handle initial 2 characters exceptions
176        switch (inwd[0]) {
177        case 'K':
178        case 'G':
179        case 'P': /* looking for KN, etc */
180            if (inwd[1] == 'N') {
181                local.append(inwd, 1, inwd.length - 1);
182            } else {
183                local.append(inwd);
184            }
185            break;
186        case 'A': /* looking for AE */
187            if (inwd[1] == 'E') {
188                local.append(inwd, 1, inwd.length - 1);
189            } else {
190                local.append(inwd);
191            }
192            break;
193        case 'W': /* looking for WR or WH */
194            if (inwd[1] == 'R') { // WR -> R
195                local.append(inwd, 1, inwd.length - 1);
196                break;
197            }
198            if (inwd[1] == 'H') {
199                local.append(inwd, 1, inwd.length - 1);
200                local.setCharAt(0, 'W'); // WH -> W
201            } else {
202                local.append(inwd);
203            }
204            break;
205        case 'X': /* initial X becomes S */
206            inwd[0] = 'S';
207            local.append(inwd);
208            break;
209        default:
210            local.append(inwd);
211        } // now local has working string with initials fixed
212
213        final int wdsz = local.length();
214        int n = 0;
215
216        while (code.length() < this.getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
217            final char symb = local.charAt(n);
218            // remove duplicate letters except C
219            if (symb != 'C' && isPreviousChar(local, n, symb)) {
220                n++;
221            } else { // not dup
222                switch (symb) {
223                case 'A':
224                case 'E':
225                case 'I':
226                case 'O':
227                case 'U':
228                    if (n == 0) {
229                        code.append(symb);
230                    }
231                    break; // only use vowel if leading char
232                case 'B':
233                    if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
234                        break;
235                    }
236                    code.append(symb);
237                    break;
238                case 'C': // lots of C special cases
239                    /* discard if SCI, SCE or SCY */
240                    if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
241                        break;
242                    }
243                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
244                        code.append('X');
245                        break;
246                    }
247                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
248                        code.append('S');
249                        break; // CI,CE,CY -> S
250                    }
251                    if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
252                        code.append('K');
253                        break;
254                    }
255                    if (isNextChar(local, n, 'H')) { // detect CH
256                        if (n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
257                            code.append('K');
258                        } else {
259                            code.append('X'); // CHvowel -> X
260                        }
261                    } else {
262                        code.append('K');
263                    }
264                    break;
265                case 'D':
266                    if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
267                        code.append('J');
268                        n += 2;
269                    } else {
270                        code.append('T');
271                    }
272                    break;
273                case 'G': // GH silent at end or before consonant
274                    if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
275                        break;
276                    }
277                    if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
278                        break;
279                    }
280                    if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
281                        break; // silent G
282                    }
283                    // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
284                    hard = isPreviousChar(local, n, 'G');
285                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
286                        code.append('J');
287                    } else {
288                        code.append('K');
289                    }
290                    break;
291                case 'H':
292                    if (isLastChar(wdsz, n)) {
293                        break; // terminal H
294                    }
295                    if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
296                        break;
297                    }
298                    if (isVowel(local, n + 1)) {
299                        code.append('H'); // Hvowel
300                    }
301                    break;
302                case 'F':
303                case 'J':
304                case 'L':
305                case 'M':
306                case 'N':
307                case 'R':
308                    code.append(symb);
309                    break;
310                case 'K':
311                    if (n > 0) { // not initial
312                        if (!isPreviousChar(local, n, 'C')) {
313                            code.append(symb);
314                        }
315                    } else {
316                        code.append(symb); // initial K
317                    }
318                    break;
319                case 'P':
320                    if (isNextChar(local, n, 'H')) {
321                        // PH -> F
322                        code.append('F');
323                    } else {
324                        code.append(symb);
325                    }
326                    break;
327                case 'Q':
328                    code.append('K');
329                    break;
330                case 'S':
331                    if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
332                        code.append('X');
333                    } else {
334                        code.append('S');
335                    }
336                    break;
337                case 'T':
338                    if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
339                        code.append('X');
340                        break;
341                    }
342                    if (regionMatch(local, n, "TCH")) {
343                        // Silent if in "TCH"
344                        break;
345                    }
346                    // substitute numeral 0 for TH (resembles theta after all)
347                    if (regionMatch(local, n, "TH")) {
348                        code.append('0');
349                    } else {
350                        code.append('T');
351                    }
352                    break;
353                case 'V':
354                    code.append('F');
355                    break;
356                case 'W':
357                case 'Y': // silent if not followed by vowel
358                    if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
359                        code.append(symb);
360                    }
361                    break;
362                case 'X':
363                    code.append('K');
364                    code.append('S');
365                    break;
366                case 'Z':
367                    code.append('S');
368                    break;
369                default:
370                    // do nothing
371                    break;
372                } // end switch
373                n++;
374            } // end else from symb != 'C'
375            if (code.length() > this.getMaxCodeLen()) {
376                code.setLength(this.getMaxCodeLen());
377            }
378        }
379        return code.toString();
380    }
381
382    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
383        boolean matches = false;
384        if (index >= 0 && index + test.length() - 1 < string.length()) {
385            final String substring = string.substring(index, index + test.length());
386            matches = substring.equals(test);
387        }
388        return matches;
389    }
390
391    /**
392     * Sets the maxCodeLen.
393     * @param maxCodeLen The maxCodeLen to set
394     */
395    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
396
397}