001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * </p>
029 * <p>
030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
031 * p 39.</CITE>
032 * </p>
033 * <p>
034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
035 * </p>
036 * <ul>
037 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
038 *  (broken link 4/30/2013) </li>
039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
040 *  (link checked 4/30/2013) </li>
041 * </ul>
042 * <p>
043 * They have had undocumented changes from the originally published algorithm.
044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
045 * </p>
046 * <p>
047 * This class is conditionally thread-safe.
048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
049 * but is not volatile, and accesses are not synchronized.
050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
052 * after initial setup.
053 * </p>
054 */
055public class Metaphone implements StringEncoder {
056
057    /**
058     * Five values in the English language
059     */
060    private static final String VOWELS = "AEIOU";
061
062    /**
063     * Variable used in Metaphone algorithm
064     */
065    private static final String FRONTV = "EIY";
066
067    /**
068     * Variable used in Metaphone algorithm
069     */
070    private static final String VARSON = "CSPTG";
071
072    /**
073     * The max code length for metaphone is 4
074     */
075    private int maxCodeLen = 4;
076
077    /**
078     * Encodes an Object using the metaphone algorithm.  This method
079     * is provided in order to satisfy the requirements of the
080     * Encoder interface, and will throw an EncoderException if the
081     * supplied object is not of type {@link String}.
082     *
083     * @param obj Object to encode
084     * @return An object (or type {@link String}) containing the
085     *         metaphone code which corresponds to the String supplied.
086     * @throws EncoderException if the parameter supplied is not
087     *                          of type {@link String}
088     */
089    @Override
090    public Object encode(final Object obj) throws EncoderException {
091        if (!(obj instanceof String)) {
092            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
093        }
094        return metaphone((String) obj);
095    }
096
097    /**
098     * Encodes a String using the Metaphone algorithm.
099     *
100     * @param str String object to encode
101     * @return The metaphone code corresponding to the String supplied
102     */
103    @Override
104    public String encode(final String str) {
105        return metaphone(str);
106    }
107
108    /**
109     * Returns the maxCodeLen.
110     * @return int
111     */
112    public int getMaxCodeLen() { return this.maxCodeLen; }
113
114    private boolean isLastChar(final int wdsz, final int n) {
115        return n + 1 == wdsz;
116    }
117
118    /**
119     * Tests is the metaphones of two strings are identical.
120     *
121     * @param str1 First of two strings to compare
122     * @param str2 Second of two strings to compare
123     * @return {@code true} if the metaphones of these strings are identical,
124     *        {@code false} otherwise.
125     */
126    public boolean isMetaphoneEqual(final String str1, final String str2) {
127        return metaphone(str1).equals(metaphone(str2));
128    }
129
130    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
131        boolean matches = false;
132        if (index >= 0 && index < string.length() - 1) {
133            matches = string.charAt(index + 1) == c;
134        }
135        return matches;
136    }
137
138    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
139        boolean matches = false;
140        if (index > 0 && index < string.length()) {
141            matches = string.charAt(index - 1) == c;
142        }
143        return matches;
144    }
145
146    private boolean isVowel(final StringBuilder string, final int index) {
147        return VOWELS.indexOf(string.charAt(index)) >= 0;
148    }
149
150    /**
151     * Find the metaphone value of a String. This is similar to the
152     * soundex algorithm, but better at finding similar sounding words.
153     * All input is converted to upper case.
154     * Limitations: Input format is expected to be a single ASCII word
155     * with only characters in the A - Z range, no punctuation or numbers.
156     *
157     * @param txt String to find the metaphone code for
158     * @return A metaphone code corresponding to the String supplied
159     */
160    public String metaphone(final String txt) {
161        boolean hard = false;
162        final int txtLength;
163        if (txt == null || (txtLength = txt.length()) == 0) {
164            return "";
165        }
166        // single character is itself
167        if (txtLength == 1) {
168            return txt.toUpperCase(java.util.Locale.ENGLISH);
169        }
170
171        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
172
173        final StringBuilder local = new StringBuilder(40); // manipulate
174        final StringBuilder code = new StringBuilder(10); // output
175        // handle initial 2 characters exceptions
176        switch (inwd[0]) {
177        case 'K':
178        case 'G':
179        case 'P': /* looking for KN, etc */
180            if (inwd[1] == 'N') {
181                local.append(inwd, 1, inwd.length - 1);
182            } else {
183                local.append(inwd);
184            }
185            break;
186        case 'A': /* looking for AE */
187            if (inwd[1] == 'E') {
188                local.append(inwd, 1, inwd.length - 1);
189            } else {
190                local.append(inwd);
191            }
192            break;
193        case 'W': /* looking for WR or WH */
194            if (inwd[1] == 'R') { // WR -> R
195                local.append(inwd, 1, inwd.length - 1);
196                break;
197            }
198            if (inwd[1] == 'H') {
199                local.append(inwd, 1, inwd.length - 1);
200                local.setCharAt(0, 'W'); // WH -> W
201            } else {
202                local.append(inwd);
203            }
204            break;
205        case 'X': /* initial X becomes S */
206            inwd[0] = 'S';
207            local.append(inwd);
208            break;
209        default:
210            local.append(inwd);
211        } // now local has working string with initials fixed
212
213        final int wdsz = local.length();
214        int n = 0;
215
216        while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
217            final char symb = local.charAt(n);
218            // remove duplicate letters except C
219            if (symb != 'C' && isPreviousChar(local, n, symb)) {
220            } else { // not dup
221                switch (symb) {
222                case 'A':
223                case 'E':
224                case 'I':
225                case 'O':
226                case 'U':
227                    if (n == 0) {
228                        code.append(symb);
229                    }
230                    break; // only use vowel if leading char
231                case 'B':
232                    if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
233                        break;
234                    }
235                    code.append(symb);
236                    break;
237                case 'C': // lots of C special cases
238                    /* discard if SCI, SCE or SCY */
239                    if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
240                        break;
241                    }
242                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
243                        code.append('X');
244                        break;
245                    }
246                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247                        code.append('S');
248                        break; // CI,CE,CY -> S
249                    }
250                    if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
251                        code.append('K');
252                        break;
253                    }
254                    if (!isNextChar(local, n, 'H') || (n == 0 && wdsz >= 3 && isVowel(local, 2))) { // CH consonant -> K consonant
255                        code.append('K');
256                    } else {
257                        code.append('X'); // CHvowel -> X
258                    }
259                    break;
260                case 'D':
261                    if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
262                        code.append('J');
263                        n += 2;
264                    } else {
265                        code.append('T');
266                    }
267                    break;
268                case 'G': // GH silent at end or before consonant
269                    if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
270                        break;
271                    }
272                    if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
273                        break;
274                    }
275                    if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
276                        break; // silent G
277                    }
278                    // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
279                    hard = isPreviousChar(local, n, 'G');
280                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
281                        code.append('J');
282                    } else {
283                        code.append('K');
284                    }
285                    break;
286                case 'H':
287                    if (isLastChar(wdsz, n)) {
288                        break; // terminal H
289                    }
290                    if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
291                        break;
292                    }
293                    if (isVowel(local, n + 1)) {
294                        code.append('H'); // Hvowel
295                    }
296                    break;
297                case 'F':
298                case 'J':
299                case 'L':
300                case 'M':
301                case 'N':
302                case 'R':
303                    code.append(symb);
304                    break;
305                case 'K':
306                    if (n > 0) { // not initial
307                        if (!isPreviousChar(local, n, 'C')) {
308                            code.append(symb);
309                        }
310                    } else {
311                        code.append(symb); // initial K
312                    }
313                    break;
314                case 'P':
315                    if (isNextChar(local, n, 'H')) {
316                        // PH -> F
317                        code.append('F');
318                    } else {
319                        code.append(symb);
320                    }
321                    break;
322                case 'Q':
323                    code.append('K');
324                    break;
325                case 'S':
326                    if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
327                        code.append('X');
328                    } else {
329                        code.append('S');
330                    }
331                    break;
332                case 'T':
333                    if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
334                        code.append('X');
335                        break;
336                    }
337                    if (regionMatch(local, n, "TCH")) {
338                        // Silent if in "TCH"
339                        break;
340                    }
341                    // substitute numeral 0 for TH (resembles theta after all)
342                    if (regionMatch(local, n, "TH")) {
343                        code.append('0');
344                    } else {
345                        code.append('T');
346                    }
347                    break;
348                case 'V':
349                    code.append('F');
350                    break;
351                case 'W':
352                case 'Y': // silent if not followed by vowel
353                    if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
354                        code.append(symb);
355                    }
356                    break;
357                case 'X':
358                    code.append('K');
359                    code.append('S');
360                    break;
361                case 'Z':
362                    code.append('S');
363                    break;
364                default:
365                    // do nothing
366                    break;
367                } // end switch
368            } // end else from symb != 'C'
369            n++;
370            if (code.length() > getMaxCodeLen()) {
371                code.setLength(getMaxCodeLen());
372            }
373        }
374        return code.toString();
375    }
376
377    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
378        boolean matches = false;
379        if (index >= 0 && index + test.length() - 1 < string.length()) {
380            final String substring = string.substring(index, index + test.length());
381            matches = substring.equals(test);
382        }
383        return matches;
384    }
385
386    /**
387     * Sets the maxCodeLen.
388     * @param maxCodeLen The maxCodeLen to set
389     */
390    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
391
392}