001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
024     * Encodes a string into a Metaphone value.
025     * <p>
026     * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027     * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028     * <p>
029     * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
030     * p 39.</CITE>
031     * <p>
032     * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl
033     * <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
034     * They have had undocumented changes from the originally published algorithm.
035     * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
036     * <p>
037     * This class is conditionally thread-safe.
038     * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
039     * but is not volatile, and accesses are not synchronised.
040     * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronisation
041     * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
042     * after initial setup.
043     *
044     * @version $Id: Metaphone.html 889935 2013-12-11 05:05:13Z ggregory $
045     */
046    public class Metaphone implements StringEncoder {
047    
048        /**
049         * Five values in the English language
050         */
051        private static final String VOWELS = "AEIOU";
052    
053        /**
054         * Variable used in Metaphone algorithm
055         */
056        private static final String FRONTV = "EIY";
057    
058        /**
059         * Variable used in Metaphone algorithm
060         */
061        private static final String VARSON = "CSPTG";
062    
063        /**
064         * The max code length for metaphone is 4
065         */
066        private int maxCodeLen = 4;
067    
068        /**
069         * Creates an instance of the Metaphone encoder
070         */
071        public Metaphone() {
072            super();
073        }
074    
075        /**
076         * Find the metaphone value of a String. This is similar to the
077         * soundex algorithm, but better at finding similar sounding words.
078         * All input is converted to upper case.
079         * Limitations: Input format is expected to be a single ASCII word
080         * with only characters in the A - Z range, no punctuation or numbers.
081         *
082         * @param txt String to find the metaphone code for
083         * @return A metaphone code corresponding to the String supplied
084         */
085        public String metaphone(String txt) {
086            boolean hard = false;
087            if (txt == null || txt.length() == 0) {
088                return "";
089            }
090            // single character is itself
091            if (txt.length() == 1) {
092                return txt.toUpperCase(java.util.Locale.ENGLISH);
093            }
094    
095            char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
096    
097            StringBuilder local = new StringBuilder(40); // manipulate
098            StringBuilder code = new StringBuilder(10); //   output
099            // handle initial 2 characters exceptions
100            switch(inwd[0]) {
101            case 'K':
102            case 'G':
103            case 'P': /* looking for KN, etc*/
104                if (inwd[1] == 'N') {
105                    local.append(inwd, 1, inwd.length - 1);
106                } else {
107                    local.append(inwd);
108                }
109                break;
110            case 'A': /* looking for AE */
111                if (inwd[1] == 'E') {
112                    local.append(inwd, 1, inwd.length - 1);
113                } else {
114                    local.append(inwd);
115                }
116                break;
117            case 'W': /* looking for WR or WH */
118                if (inwd[1] == 'R') {   // WR -> R
119                    local.append(inwd, 1, inwd.length - 1);
120                    break;
121                }
122                if (inwd[1] == 'H') {
123                    local.append(inwd, 1, inwd.length - 1);
124                    local.setCharAt(0, 'W'); // WH -> W
125                } else {
126                    local.append(inwd);
127                }
128                break;
129            case 'X': /* initial X becomes S */
130                inwd[0] = 'S';
131                local.append(inwd);
132                break;
133            default:
134                local.append(inwd);
135            } // now local has working string with initials fixed
136    
137            int wdsz = local.length();
138            int n = 0;
139    
140            while (code.length() < this.getMaxCodeLen() &&
141                   n < wdsz ) { // max code size of 4 works well
142                char symb = local.charAt(n);
143                // remove duplicate letters except C
144                if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
145                    n++;
146                } else { // not dup
147                    switch(symb) {
148                    case 'A':
149                    case 'E':
150                    case 'I':
151                    case 'O':
152                    case 'U':
153                        if (n == 0) {
154                            code.append(symb);
155                        }
156                        break; // only use vowel if leading char
157                    case 'B':
158                        if ( isPreviousChar(local, n, 'M') &&
159                             isLastChar(wdsz, n) ) { // B is silent if word ends in MB
160                            break;
161                        }
162                        code.append(symb);
163                        break;
164                    case 'C': // lots of C special cases
165                        /* discard if SCI, SCE or SCY */
166                        if ( isPreviousChar(local, n, 'S') &&
167                             !isLastChar(wdsz, n) &&
168                             FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
169                            break;
170                        }
171                        if (regionMatch(local, n, "CIA")) { // "CIA" -> X
172                            code.append('X');
173                            break;
174                        }
175                        if (!isLastChar(wdsz, n) &&
176                            FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
177                            code.append('S');
178                            break; // CI,CE,CY -> S
179                        }
180                        if (isPreviousChar(local, n, 'S') &&
181                            isNextChar(local, n, 'H') ) { // SCH->sk
182                            code.append('K');
183                            break;
184                        }
185                        if (isNextChar(local, n, 'H')) { // detect CH
186                            if (n == 0 &&
187                                wdsz >= 3 &&
188                                isVowel(local,2) ) { // CH consonant -> K consonant
189                                code.append('K');
190                            } else {
191                                code.append('X'); // CHvowel -> X
192                            }
193                        } else {
194                            code.append('K');
195                        }
196                        break;
197                    case 'D':
198                        if (!isLastChar(wdsz, n + 1) &&
199                            isNextChar(local, n, 'G') &&
200                            FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
201                            code.append('J'); n += 2;
202                        } else {
203                            code.append('T');
204                        }
205                        break;
206                    case 'G': // GH silent at end or before consonant
207                        if (isLastChar(wdsz, n + 1) &&
208                            isNextChar(local, n, 'H')) {
209                            break;
210                        }
211                        if (!isLastChar(wdsz, n + 1) &&
212                            isNextChar(local,n,'H') &&
213                            !isVowel(local,n+2)) {
214                            break;
215                        }
216                        if (n > 0 &&
217                            ( regionMatch(local, n, "GN") ||
218                              regionMatch(local, n, "GNED") ) ) {
219                            break; // silent G
220                        }
221                        if (isPreviousChar(local, n, 'G')) {
222                            // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
223                            hard = true;
224                        } else {
225                            hard = false;
226                        }
227                        if (!isLastChar(wdsz, n) &&
228                            FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
229                            !hard) {
230                            code.append('J');
231                        } else {
232                            code.append('K');
233                        }
234                        break;
235                    case 'H':
236                        if (isLastChar(wdsz, n)) {
237                            break; // terminal H
238                        }
239                        if (n > 0 &&
240                            VARSON.indexOf(local.charAt(n - 1)) >= 0) {
241                            break;
242                        }
243                        if (isVowel(local,n+1)) {
244                            code.append('H'); // Hvowel
245                        }
246                        break;
247                    case 'F':
248                    case 'J':
249                    case 'L':
250                    case 'M':
251                    case 'N':
252                    case 'R':
253                        code.append(symb);
254                        break;
255                    case 'K':
256                        if (n > 0) { // not initial
257                            if (!isPreviousChar(local, n, 'C')) {
258                                code.append(symb);
259                            }
260                        } else {
261                            code.append(symb); // initial K
262                        }
263                        break;
264                    case 'P':
265                        if (isNextChar(local,n,'H')) {
266                            // PH -> F
267                            code.append('F');
268                        } else {
269                            code.append(symb);
270                        }
271                        break;
272                    case 'Q':
273                        code.append('K');
274                        break;
275                    case 'S':
276                        if (regionMatch(local,n,"SH") ||
277                            regionMatch(local,n,"SIO") ||
278                            regionMatch(local,n,"SIA")) {
279                            code.append('X');
280                        } else {
281                            code.append('S');
282                        }
283                        break;
284                    case 'T':
285                        if (regionMatch(local,n,"TIA") ||
286                            regionMatch(local,n,"TIO")) {
287                            code.append('X');
288                            break;
289                        }
290                        if (regionMatch(local,n,"TCH")) {
291                            // Silent if in "TCH"
292                            break;
293                        }
294                        // substitute numeral 0 for TH (resembles theta after all)
295                        if (regionMatch(local,n,"TH")) {
296                            code.append('0');
297                        } else {
298                            code.append('T');
299                        }
300                        break;
301                    case 'V':
302                        code.append('F'); break;
303                    case 'W':
304                    case 'Y': // silent if not followed by vowel
305                        if (!isLastChar(wdsz,n) &&
306                            isVowel(local,n+1)) {
307                            code.append(symb);
308                        }
309                        break;
310                    case 'X':
311                        code.append('K');
312                        code.append('S');
313                        break;
314                    case 'Z':
315                        code.append('S');
316                        break;
317                    default:
318                        // do nothing
319                        break;
320                    } // end switch
321                    n++;
322                } // end else from symb != 'C'
323                if (code.length() > this.getMaxCodeLen()) {
324                    code.setLength(this.getMaxCodeLen());
325                }
326            }
327            return code.toString();
328        }
329    
330        private boolean isVowel(StringBuilder string, int index) {
331            return VOWELS.indexOf(string.charAt(index)) >= 0;
332        }
333    
334        private boolean isPreviousChar(StringBuilder string, int index, char c) {
335            boolean matches = false;
336            if( index > 0 &&
337                index < string.length() ) {
338                matches = string.charAt(index - 1) == c;
339            }
340            return matches;
341        }
342    
343        private boolean isNextChar(StringBuilder string, int index, char c) {
344            boolean matches = false;
345            if( index >= 0 &&
346                index < string.length() - 1 ) {
347                matches = string.charAt(index + 1) == c;
348            }
349            return matches;
350        }
351    
352        private boolean regionMatch(StringBuilder string, int index, String test) {
353            boolean matches = false;
354            if( index >= 0 &&
355                index + test.length() - 1 < string.length() ) {
356                String substring = string.substring( index, index + test.length());
357                matches = substring.equals( test );
358            }
359            return matches;
360        }
361    
362        private boolean isLastChar(int wdsz, int n) {
363            return n + 1 == wdsz;
364        }
365    
366    
367        /**
368         * Encodes an Object using the metaphone algorithm.  This method
369         * is provided in order to satisfy the requirements of the
370         * Encoder interface, and will throw an EncoderException if the
371         * supplied object is not of type java.lang.String.
372         *
373         * @param obj Object to encode
374         * @return An object (or type java.lang.String) containing the
375         *         metaphone code which corresponds to the String supplied.
376         * @throws EncoderException if the parameter supplied is not
377         *                          of type java.lang.String
378         */
379        @Override
380        public Object encode(Object obj) throws EncoderException {
381            if (!(obj instanceof String)) {
382                throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
383            }
384            return metaphone((String) obj);
385        }
386    
387        /**
388         * Encodes a String using the Metaphone algorithm.
389         *
390         * @param str String object to encode
391         * @return The metaphone code corresponding to the String supplied
392         */
393        @Override
394        public String encode(String str) {
395            return metaphone(str);
396        }
397    
398        /**
399         * Tests is the metaphones of two strings are identical.
400         *
401         * @param str1 First of two strings to compare
402         * @param str2 Second of two strings to compare
403         * @return {@code true} if the metaphones of these strings are identical,
404         *        {@code false} otherwise.
405         */
406        public boolean isMetaphoneEqual(String str1, String str2) {
407            return metaphone(str1).equals(metaphone(str2));
408        }
409    
410        /**
411         * Returns the maxCodeLen.
412         * @return int
413         */
414        public int getMaxCodeLen() { return this.maxCodeLen; }
415    
416        /**
417         * Sets the maxCodeLen.
418         * @param maxCodeLen The maxCodeLen to set
419         */
420        public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
421    
422    }