001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * <p>
029 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
030 * p 39.</CITE>
031 * <p>
032 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
033 * </p>
034 * <ul>
035 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
036 *  (broken link 4/30/2013) </li>
037 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
038 *  (link checked 4/30/2013) </li>
039 * </ul>
040 * <p>
041 * They have had undocumented changes from the originally published algorithm.
042 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
043 * <p>
044 * This class is conditionally thread-safe.
045 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
046 * but is not volatile, and accesses are not synchronized.
047 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
048 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
049 * after initial setup.
050 *
051 */
052public class Metaphone implements StringEncoder {
053
054    /**
055     * Five values in the English language
056     */
057    private static final String VOWELS = "AEIOU";
058
059    /**
060     * Variable used in Metaphone algorithm
061     */
062    private static final String FRONTV = "EIY";
063
064    /**
065     * Variable used in Metaphone algorithm
066     */
067    private static final String VARSON = "CSPTG";
068
069    /**
070     * The max code length for metaphone is 4
071     */
072    private int maxCodeLen = 4;
073
074    /**
075     * Creates an instance of the Metaphone encoder
076     */
077    public Metaphone() {
078        super();
079    }
080
081    /**
082     * Find the metaphone value of a String. This is similar to the
083     * soundex algorithm, but better at finding similar sounding words.
084     * All input is converted to upper case.
085     * Limitations: Input format is expected to be a single ASCII word
086     * with only characters in the A - Z range, no punctuation or numbers.
087     *
088     * @param txt String to find the metaphone code for
089     * @return A metaphone code corresponding to the String supplied
090     */
091    public String metaphone(final String txt) {
092        boolean hard = false;
093        int txtLength;
094        if (txt == null || (txtLength = txt.length()) == 0) {
095            return "";
096        }
097        // single character is itself
098        if (txtLength == 1) {
099            return txt.toUpperCase(java.util.Locale.ENGLISH);
100        }
101
102        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
103
104        final StringBuilder local = new StringBuilder(40); // manipulate
105        final StringBuilder code = new StringBuilder(10); //   output
106        // handle initial 2 characters exceptions
107        switch(inwd[0]) {
108        case 'K':
109        case 'G':
110        case 'P': /* looking for KN, etc*/
111            if (inwd[1] == 'N') {
112                local.append(inwd, 1, inwd.length - 1);
113            } else {
114                local.append(inwd);
115            }
116            break;
117        case 'A': /* looking for AE */
118            if (inwd[1] == 'E') {
119                local.append(inwd, 1, inwd.length - 1);
120            } else {
121                local.append(inwd);
122            }
123            break;
124        case 'W': /* looking for WR or WH */
125            if (inwd[1] == 'R') {   // WR -> R
126                local.append(inwd, 1, inwd.length - 1);
127                break;
128            }
129            if (inwd[1] == 'H') {
130                local.append(inwd, 1, inwd.length - 1);
131                local.setCharAt(0, 'W'); // WH -> W
132            } else {
133                local.append(inwd);
134            }
135            break;
136        case 'X': /* initial X becomes S */
137            inwd[0] = 'S';
138            local.append(inwd);
139            break;
140        default:
141            local.append(inwd);
142        } // now local has working string with initials fixed
143
144        final int wdsz = local.length();
145        int n = 0;
146
147        while (code.length() < this.getMaxCodeLen() &&
148               n < wdsz ) { // max code size of 4 works well
149            final char symb = local.charAt(n);
150            // remove duplicate letters except C
151            if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
152                n++;
153            } else { // not dup
154                switch(symb) {
155                case 'A':
156                case 'E':
157                case 'I':
158                case 'O':
159                case 'U':
160                    if (n == 0) {
161                        code.append(symb);
162                    }
163                    break; // only use vowel if leading char
164                case 'B':
165                    if ( isPreviousChar(local, n, 'M') &&
166                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
167                        break;
168                    }
169                    code.append(symb);
170                    break;
171                case 'C': // lots of C special cases
172                    /* discard if SCI, SCE or SCY */
173                    if ( isPreviousChar(local, n, 'S') &&
174                         !isLastChar(wdsz, n) &&
175                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
176                        break;
177                    }
178                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
179                        code.append('X');
180                        break;
181                    }
182                    if (!isLastChar(wdsz, n) &&
183                        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
184                        code.append('S');
185                        break; // CI,CE,CY -> S
186                    }
187                    if (isPreviousChar(local, n, 'S') &&
188                        isNextChar(local, n, 'H') ) { // SCH->sk
189                        code.append('K');
190                        break;
191                    }
192                    if (isNextChar(local, n, 'H')) { // detect CH
193                        if (n == 0 &&
194                            wdsz >= 3 &&
195                            isVowel(local,2) ) { // CH consonant -> K consonant
196                            code.append('K');
197                        } else {
198                            code.append('X'); // CHvowel -> X
199                        }
200                    } else {
201                        code.append('K');
202                    }
203                    break;
204                case 'D':
205                    if (!isLastChar(wdsz, n + 1) &&
206                        isNextChar(local, n, 'G') &&
207                        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
208                        code.append('J'); n += 2;
209                    } else {
210                        code.append('T');
211                    }
212                    break;
213                case 'G': // GH silent at end or before consonant
214                    if (isLastChar(wdsz, n + 1) &&
215                        isNextChar(local, n, 'H')) {
216                        break;
217                    }
218                    if (!isLastChar(wdsz, n + 1) &&
219                        isNextChar(local,n,'H') &&
220                        !isVowel(local,n+2)) {
221                        break;
222                    }
223                    if (n > 0 &&
224                        ( regionMatch(local, n, "GN") ||
225                          regionMatch(local, n, "GNED") ) ) {
226                        break; // silent G
227                    }
228                    if (isPreviousChar(local, n, 'G')) {
229                        // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
230                        hard = true;
231                    } else {
232                        hard = false;
233                    }
234                    if (!isLastChar(wdsz, n) &&
235                        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
236                        !hard) {
237                        code.append('J');
238                    } else {
239                        code.append('K');
240                    }
241                    break;
242                case 'H':
243                    if (isLastChar(wdsz, n)) {
244                        break; // terminal H
245                    }
246                    if (n > 0 &&
247                        VARSON.indexOf(local.charAt(n - 1)) >= 0) {
248                        break;
249                    }
250                    if (isVowel(local,n+1)) {
251                        code.append('H'); // Hvowel
252                    }
253                    break;
254                case 'F':
255                case 'J':
256                case 'L':
257                case 'M':
258                case 'N':
259                case 'R':
260                    code.append(symb);
261                    break;
262                case 'K':
263                    if (n > 0) { // not initial
264                        if (!isPreviousChar(local, n, 'C')) {
265                            code.append(symb);
266                        }
267                    } else {
268                        code.append(symb); // initial K
269                    }
270                    break;
271                case 'P':
272                    if (isNextChar(local,n,'H')) {
273                        // PH -> F
274                        code.append('F');
275                    } else {
276                        code.append(symb);
277                    }
278                    break;
279                case 'Q':
280                    code.append('K');
281                    break;
282                case 'S':
283                    if (regionMatch(local,n,"SH") ||
284                        regionMatch(local,n,"SIO") ||
285                        regionMatch(local,n,"SIA")) {
286                        code.append('X');
287                    } else {
288                        code.append('S');
289                    }
290                    break;
291                case 'T':
292                    if (regionMatch(local,n,"TIA") ||
293                        regionMatch(local,n,"TIO")) {
294                        code.append('X');
295                        break;
296                    }
297                    if (regionMatch(local,n,"TCH")) {
298                        // Silent if in "TCH"
299                        break;
300                    }
301                    // substitute numeral 0 for TH (resembles theta after all)
302                    if (regionMatch(local,n,"TH")) {
303                        code.append('0');
304                    } else {
305                        code.append('T');
306                    }
307                    break;
308                case 'V':
309                    code.append('F'); break;
310                case 'W':
311                case 'Y': // silent if not followed by vowel
312                    if (!isLastChar(wdsz,n) &&
313                        isVowel(local,n+1)) {
314                        code.append(symb);
315                    }
316                    break;
317                case 'X':
318                    code.append('K');
319                    code.append('S');
320                    break;
321                case 'Z':
322                    code.append('S');
323                    break;
324                default:
325                    // do nothing
326                    break;
327                } // end switch
328                n++;
329            } // end else from symb != 'C'
330            if (code.length() > this.getMaxCodeLen()) {
331                code.setLength(this.getMaxCodeLen());
332            }
333        }
334        return code.toString();
335    }
336
337    private boolean isVowel(final StringBuilder string, final int index) {
338        return VOWELS.indexOf(string.charAt(index)) >= 0;
339    }
340
341    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
342        boolean matches = false;
343        if( index > 0 &&
344            index < string.length() ) {
345            matches = string.charAt(index - 1) == c;
346        }
347        return matches;
348    }
349
350    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
351        boolean matches = false;
352        if( index >= 0 &&
353            index < string.length() - 1 ) {
354            matches = string.charAt(index + 1) == c;
355        }
356        return matches;
357    }
358
359    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
360        boolean matches = false;
361        if( index >= 0 &&
362            index + test.length() - 1 < string.length() ) {
363            final String substring = string.substring( index, index + test.length());
364            matches = substring.equals( test );
365        }
366        return matches;
367    }
368
369    private boolean isLastChar(final int wdsz, final int n) {
370        return n + 1 == wdsz;
371    }
372
373
374    /**
375     * Encodes an Object using the metaphone algorithm.  This method
376     * is provided in order to satisfy the requirements of the
377     * Encoder interface, and will throw an EncoderException if the
378     * supplied object is not of type java.lang.String.
379     *
380     * @param obj Object to encode
381     * @return An object (or type java.lang.String) containing the
382     *         metaphone code which corresponds to the String supplied.
383     * @throws EncoderException if the parameter supplied is not
384     *                          of type java.lang.String
385     */
386    @Override
387    public Object encode(final Object obj) throws EncoderException {
388        if (!(obj instanceof String)) {
389            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
390        }
391        return metaphone((String) obj);
392    }
393
394    /**
395     * Encodes a String using the Metaphone algorithm.
396     *
397     * @param str String object to encode
398     * @return The metaphone code corresponding to the String supplied
399     */
400    @Override
401    public String encode(final String str) {
402        return metaphone(str);
403    }
404
405    /**
406     * Tests is the metaphones of two strings are identical.
407     *
408     * @param str1 First of two strings to compare
409     * @param str2 Second of two strings to compare
410     * @return <code>true</code> if the metaphones of these strings are identical,
411     *        <code>false</code> otherwise.
412     */
413    public boolean isMetaphoneEqual(final String str1, final String str2) {
414        return metaphone(str1).equals(metaphone(str2));
415    }
416
417    /**
418     * Returns the maxCodeLen.
419     * @return int
420     */
421    public int getMaxCodeLen() { return this.maxCodeLen; }
422
423    /**
424     * Sets the maxCodeLen.
425     * @param maxCodeLen The maxCodeLen to set
426     */
427    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
428
429}