001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * <p>
029 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
030 * p 39.</CITE>
031 * <p>
032 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
033 * </p>
034 * <ul>
035 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
036 *  (broken link 4/30/2013) </li>
037 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
038 *  (link checked 4/30/2013) </li>
039 * </ul>
040 * <p>
041 * They have had undocumented changes from the originally published algorithm.
042 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
043 * <p>
044 * This class is conditionally thread-safe.
045 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
046 * but is not volatile, and accesses are not synchronized.
047 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
048 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
049 * after initial setup.
050 *
051 * @version $Id: Metaphone.html 928559 2014-11-10 02:53:54Z ggregory $
052 */
053public class Metaphone implements StringEncoder {
054
055    /**
056     * Five values in the English language
057     */
058    private static final String VOWELS = "AEIOU";
059
060    /**
061     * Variable used in Metaphone algorithm
062     */
063    private static final String FRONTV = "EIY";
064
065    /**
066     * Variable used in Metaphone algorithm
067     */
068    private static final String VARSON = "CSPTG";
069
070    /**
071     * The max code length for metaphone is 4
072     */
073    private int maxCodeLen = 4;
074
075    /**
076     * Creates an instance of the Metaphone encoder
077     */
078    public Metaphone() {
079        super();
080    }
081
082    /**
083     * Find the metaphone value of a String. This is similar to the
084     * soundex algorithm, but better at finding similar sounding words.
085     * All input is converted to upper case.
086     * Limitations: Input format is expected to be a single ASCII word
087     * with only characters in the A - Z range, no punctuation or numbers.
088     *
089     * @param txt String to find the metaphone code for
090     * @return A metaphone code corresponding to the String supplied
091     */
092    public String metaphone(final String txt) {
093        boolean hard = false;
094        int txtLength;
095        if (txt == null || (txtLength = txt.length()) == 0) {
096            return "";
097        }
098        // single character is itself
099        if (txtLength == 1) {
100            return txt.toUpperCase(java.util.Locale.ENGLISH);
101        }
102
103        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
104
105        final StringBuilder local = new StringBuilder(40); // manipulate
106        final StringBuilder code = new StringBuilder(10); //   output
107        // handle initial 2 characters exceptions
108        switch(inwd[0]) {
109        case 'K':
110        case 'G':
111        case 'P': /* looking for KN, etc*/
112            if (inwd[1] == 'N') {
113                local.append(inwd, 1, inwd.length - 1);
114            } else {
115                local.append(inwd);
116            }
117            break;
118        case 'A': /* looking for AE */
119            if (inwd[1] == 'E') {
120                local.append(inwd, 1, inwd.length - 1);
121            } else {
122                local.append(inwd);
123            }
124            break;
125        case 'W': /* looking for WR or WH */
126            if (inwd[1] == 'R') {   // WR -> R
127                local.append(inwd, 1, inwd.length - 1);
128                break;
129            }
130            if (inwd[1] == 'H') {
131                local.append(inwd, 1, inwd.length - 1);
132                local.setCharAt(0, 'W'); // WH -> W
133            } else {
134                local.append(inwd);
135            }
136            break;
137        case 'X': /* initial X becomes S */
138            inwd[0] = 'S';
139            local.append(inwd);
140            break;
141        default:
142            local.append(inwd);
143        } // now local has working string with initials fixed
144
145        final int wdsz = local.length();
146        int n = 0;
147
148        while (code.length() < this.getMaxCodeLen() &&
149               n < wdsz ) { // max code size of 4 works well
150            final char symb = local.charAt(n);
151            // remove duplicate letters except C
152            if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
153                n++;
154            } else { // not dup
155                switch(symb) {
156                case 'A':
157                case 'E':
158                case 'I':
159                case 'O':
160                case 'U':
161                    if (n == 0) {
162                        code.append(symb);
163                    }
164                    break; // only use vowel if leading char
165                case 'B':
166                    if ( isPreviousChar(local, n, 'M') &&
167                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
168                        break;
169                    }
170                    code.append(symb);
171                    break;
172                case 'C': // lots of C special cases
173                    /* discard if SCI, SCE or SCY */
174                    if ( isPreviousChar(local, n, 'S') &&
175                         !isLastChar(wdsz, n) &&
176                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
177                        break;
178                    }
179                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
180                        code.append('X');
181                        break;
182                    }
183                    if (!isLastChar(wdsz, n) &&
184                        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
185                        code.append('S');
186                        break; // CI,CE,CY -> S
187                    }
188                    if (isPreviousChar(local, n, 'S') &&
189                        isNextChar(local, n, 'H') ) { // SCH->sk
190                        code.append('K');
191                        break;
192                    }
193                    if (isNextChar(local, n, 'H')) { // detect CH
194                        if (n == 0 &&
195                            wdsz >= 3 &&
196                            isVowel(local,2) ) { // CH consonant -> K consonant
197                            code.append('K');
198                        } else {
199                            code.append('X'); // CHvowel -> X
200                        }
201                    } else {
202                        code.append('K');
203                    }
204                    break;
205                case 'D':
206                    if (!isLastChar(wdsz, n + 1) &&
207                        isNextChar(local, n, 'G') &&
208                        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
209                        code.append('J'); n += 2;
210                    } else {
211                        code.append('T');
212                    }
213                    break;
214                case 'G': // GH silent at end or before consonant
215                    if (isLastChar(wdsz, n + 1) &&
216                        isNextChar(local, n, 'H')) {
217                        break;
218                    }
219                    if (!isLastChar(wdsz, n + 1) &&
220                        isNextChar(local,n,'H') &&
221                        !isVowel(local,n+2)) {
222                        break;
223                    }
224                    if (n > 0 &&
225                        ( regionMatch(local, n, "GN") ||
226                          regionMatch(local, n, "GNED") ) ) {
227                        break; // silent G
228                    }
229                    if (isPreviousChar(local, n, 'G')) {
230                        // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
231                        hard = true;
232                    } else {
233                        hard = false;
234                    }
235                    if (!isLastChar(wdsz, n) &&
236                        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
237                        !hard) {
238                        code.append('J');
239                    } else {
240                        code.append('K');
241                    }
242                    break;
243                case 'H':
244                    if (isLastChar(wdsz, n)) {
245                        break; // terminal H
246                    }
247                    if (n > 0 &&
248                        VARSON.indexOf(local.charAt(n - 1)) >= 0) {
249                        break;
250                    }
251                    if (isVowel(local,n+1)) {
252                        code.append('H'); // Hvowel
253                    }
254                    break;
255                case 'F':
256                case 'J':
257                case 'L':
258                case 'M':
259                case 'N':
260                case 'R':
261                    code.append(symb);
262                    break;
263                case 'K':
264                    if (n > 0) { // not initial
265                        if (!isPreviousChar(local, n, 'C')) {
266                            code.append(symb);
267                        }
268                    } else {
269                        code.append(symb); // initial K
270                    }
271                    break;
272                case 'P':
273                    if (isNextChar(local,n,'H')) {
274                        // PH -> F
275                        code.append('F');
276                    } else {
277                        code.append(symb);
278                    }
279                    break;
280                case 'Q':
281                    code.append('K');
282                    break;
283                case 'S':
284                    if (regionMatch(local,n,"SH") ||
285                        regionMatch(local,n,"SIO") ||
286                        regionMatch(local,n,"SIA")) {
287                        code.append('X');
288                    } else {
289                        code.append('S');
290                    }
291                    break;
292                case 'T':
293                    if (regionMatch(local,n,"TIA") ||
294                        regionMatch(local,n,"TIO")) {
295                        code.append('X');
296                        break;
297                    }
298                    if (regionMatch(local,n,"TCH")) {
299                        // Silent if in "TCH"
300                        break;
301                    }
302                    // substitute numeral 0 for TH (resembles theta after all)
303                    if (regionMatch(local,n,"TH")) {
304                        code.append('0');
305                    } else {
306                        code.append('T');
307                    }
308                    break;
309                case 'V':
310                    code.append('F'); break;
311                case 'W':
312                case 'Y': // silent if not followed by vowel
313                    if (!isLastChar(wdsz,n) &&
314                        isVowel(local,n+1)) {
315                        code.append(symb);
316                    }
317                    break;
318                case 'X':
319                    code.append('K');
320                    code.append('S');
321                    break;
322                case 'Z':
323                    code.append('S');
324                    break;
325                default:
326                    // do nothing
327                    break;
328                } // end switch
329                n++;
330            } // end else from symb != 'C'
331            if (code.length() > this.getMaxCodeLen()) {
332                code.setLength(this.getMaxCodeLen());
333            }
334        }
335        return code.toString();
336    }
337
338    private boolean isVowel(final StringBuilder string, final int index) {
339        return VOWELS.indexOf(string.charAt(index)) >= 0;
340    }
341
342    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
343        boolean matches = false;
344        if( index > 0 &&
345            index < string.length() ) {
346            matches = string.charAt(index - 1) == c;
347        }
348        return matches;
349    }
350
351    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
352        boolean matches = false;
353        if( index >= 0 &&
354            index < string.length() - 1 ) {
355            matches = string.charAt(index + 1) == c;
356        }
357        return matches;
358    }
359
360    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
361        boolean matches = false;
362        if( index >= 0 &&
363            index + test.length() - 1 < string.length() ) {
364            final String substring = string.substring( index, index + test.length());
365            matches = substring.equals( test );
366        }
367        return matches;
368    }
369
370    private boolean isLastChar(final int wdsz, final int n) {
371        return n + 1 == wdsz;
372    }
373
374
375    /**
376     * Encodes an Object using the metaphone algorithm.  This method
377     * is provided in order to satisfy the requirements of the
378     * Encoder interface, and will throw an EncoderException if the
379     * supplied object is not of type java.lang.String.
380     *
381     * @param obj Object to encode
382     * @return An object (or type java.lang.String) containing the
383     *         metaphone code which corresponds to the String supplied.
384     * @throws EncoderException if the parameter supplied is not
385     *                          of type java.lang.String
386     */
387    @Override
388    public Object encode(final Object obj) throws EncoderException {
389        if (!(obj instanceof String)) {
390            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
391        }
392        return metaphone((String) obj);
393    }
394
395    /**
396     * Encodes a String using the Metaphone algorithm.
397     *
398     * @param str String object to encode
399     * @return The metaphone code corresponding to the String supplied
400     */
401    @Override
402    public String encode(final String str) {
403        return metaphone(str);
404    }
405
406    /**
407     * Tests is the metaphones of two strings are identical.
408     *
409     * @param str1 First of two strings to compare
410     * @param str2 Second of two strings to compare
411     * @return <code>true</code> if the metaphones of these strings are identical,
412     *        <code>false</code> otherwise.
413     */
414    public boolean isMetaphoneEqual(final String str1, final String str2) {
415        return metaphone(str1).equals(metaphone(str2));
416    }
417
418    /**
419     * Returns the maxCodeLen.
420     * @return int
421     */
422    public int getMaxCodeLen() { return this.maxCodeLen; }
423
424    /**
425     * Sets the maxCodeLen.
426     * @param maxCodeLen The maxCodeLen to set
427     */
428    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
429
430}