View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * <p>
29   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
30   * p 39.</CITE>
31   * <p>
32   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
33   * </p>
34   * <ul>
35   * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
36   *  (broken link 4/30/2013) </li>
37   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
38   *  (link checked 4/30/2013) </li>
39   * </ul>
40   * <p>
41   * They have had undocumented changes from the originally published algorithm.
42   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
43   * <p>
44   * This class is conditionally thread-safe.
45   * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
46   * but is not volatile, and accesses are not synchronized.
47   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
48   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
49   * after initial setup.
50   *
51   * @version $Id: Metaphone.java 1619948 2014-08-22 22:53:55Z ggregory $
52   */
53  public class Metaphone implements StringEncoder {
54  
55      /**
56       * Five values in the English language
57       */
58      private static final String VOWELS = "AEIOU";
59  
60      /**
61       * Variable used in Metaphone algorithm
62       */
63      private static final String FRONTV = "EIY";
64  
65      /**
66       * Variable used in Metaphone algorithm
67       */
68      private static final String VARSON = "CSPTG";
69  
70      /**
71       * The max code length for metaphone is 4
72       */
73      private int maxCodeLen = 4;
74  
75      /**
76       * Creates an instance of the Metaphone encoder
77       */
78      public Metaphone() {
79          super();
80      }
81  
82      /**
83       * Find the metaphone value of a String. This is similar to the
84       * soundex algorithm, but better at finding similar sounding words.
85       * All input is converted to upper case.
86       * Limitations: Input format is expected to be a single ASCII word
87       * with only characters in the A - Z range, no punctuation or numbers.
88       *
89       * @param txt String to find the metaphone code for
90       * @return A metaphone code corresponding to the String supplied
91       */
92      public String metaphone(final String txt) {
93          boolean hard = false;
94          int txtLength;
95          if (txt == null || (txtLength = txt.length()) == 0) {
96              return "";
97          }
98          // single character is itself
99          if (txtLength == 1) {
100             return txt.toUpperCase(java.util.Locale.ENGLISH);
101         }
102 
103         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
104 
105         final StringBuilder local = new StringBuilder(40); // manipulate
106         final StringBuilder code = new StringBuilder(10); //   output
107         // handle initial 2 characters exceptions
108         switch(inwd[0]) {
109         case 'K':
110         case 'G':
111         case 'P': /* looking for KN, etc*/
112             if (inwd[1] == 'N') {
113                 local.append(inwd, 1, inwd.length - 1);
114             } else {
115                 local.append(inwd);
116             }
117             break;
118         case 'A': /* looking for AE */
119             if (inwd[1] == 'E') {
120                 local.append(inwd, 1, inwd.length - 1);
121             } else {
122                 local.append(inwd);
123             }
124             break;
125         case 'W': /* looking for WR or WH */
126             if (inwd[1] == 'R') {   // WR -> R
127                 local.append(inwd, 1, inwd.length - 1);
128                 break;
129             }
130             if (inwd[1] == 'H') {
131                 local.append(inwd, 1, inwd.length - 1);
132                 local.setCharAt(0, 'W'); // WH -> W
133             } else {
134                 local.append(inwd);
135             }
136             break;
137         case 'X': /* initial X becomes S */
138             inwd[0] = 'S';
139             local.append(inwd);
140             break;
141         default:
142             local.append(inwd);
143         } // now local has working string with initials fixed
144 
145         final int wdsz = local.length();
146         int n = 0;
147 
148         while (code.length() < this.getMaxCodeLen() &&
149                n < wdsz ) { // max code size of 4 works well
150             final char symb = local.charAt(n);
151             // remove duplicate letters except C
152             if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
153                 n++;
154             } else { // not dup
155                 switch(symb) {
156                 case 'A':
157                 case 'E':
158                 case 'I':
159                 case 'O':
160                 case 'U':
161                     if (n == 0) {
162                         code.append(symb);
163                     }
164                     break; // only use vowel if leading char
165                 case 'B':
166                     if ( isPreviousChar(local, n, 'M') &&
167                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
168                         break;
169                     }
170                     code.append(symb);
171                     break;
172                 case 'C': // lots of C special cases
173                     /* discard if SCI, SCE or SCY */
174                     if ( isPreviousChar(local, n, 'S') &&
175                          !isLastChar(wdsz, n) &&
176                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
177                         break;
178                     }
179                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
180                         code.append('X');
181                         break;
182                     }
183                     if (!isLastChar(wdsz, n) &&
184                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
185                         code.append('S');
186                         break; // CI,CE,CY -> S
187                     }
188                     if (isPreviousChar(local, n, 'S') &&
189                         isNextChar(local, n, 'H') ) { // SCH->sk
190                         code.append('K');
191                         break;
192                     }
193                     if (isNextChar(local, n, 'H')) { // detect CH
194                         if (n == 0 &&
195                             wdsz >= 3 &&
196                             isVowel(local,2) ) { // CH consonant -> K consonant
197                             code.append('K');
198                         } else {
199                             code.append('X'); // CHvowel -> X
200                         }
201                     } else {
202                         code.append('K');
203                     }
204                     break;
205                 case 'D':
206                     if (!isLastChar(wdsz, n + 1) &&
207                         isNextChar(local, n, 'G') &&
208                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
209                         code.append('J'); n += 2;
210                     } else {
211                         code.append('T');
212                     }
213                     break;
214                 case 'G': // GH silent at end or before consonant
215                     if (isLastChar(wdsz, n + 1) &&
216                         isNextChar(local, n, 'H')) {
217                         break;
218                     }
219                     if (!isLastChar(wdsz, n + 1) &&
220                         isNextChar(local,n,'H') &&
221                         !isVowel(local,n+2)) {
222                         break;
223                     }
224                     if (n > 0 &&
225                         ( regionMatch(local, n, "GN") ||
226                           regionMatch(local, n, "GNED") ) ) {
227                         break; // silent G
228                     }
229                     if (isPreviousChar(local, n, 'G')) {
230                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
231                         hard = true;
232                     } else {
233                         hard = false;
234                     }
235                     if (!isLastChar(wdsz, n) &&
236                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
237                         !hard) {
238                         code.append('J');
239                     } else {
240                         code.append('K');
241                     }
242                     break;
243                 case 'H':
244                     if (isLastChar(wdsz, n)) {
245                         break; // terminal H
246                     }
247                     if (n > 0 &&
248                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
249                         break;
250                     }
251                     if (isVowel(local,n+1)) {
252                         code.append('H'); // Hvowel
253                     }
254                     break;
255                 case 'F':
256                 case 'J':
257                 case 'L':
258                 case 'M':
259                 case 'N':
260                 case 'R':
261                     code.append(symb);
262                     break;
263                 case 'K':
264                     if (n > 0) { // not initial
265                         if (!isPreviousChar(local, n, 'C')) {
266                             code.append(symb);
267                         }
268                     } else {
269                         code.append(symb); // initial K
270                     }
271                     break;
272                 case 'P':
273                     if (isNextChar(local,n,'H')) {
274                         // PH -> F
275                         code.append('F');
276                     } else {
277                         code.append(symb);
278                     }
279                     break;
280                 case 'Q':
281                     code.append('K');
282                     break;
283                 case 'S':
284                     if (regionMatch(local,n,"SH") ||
285                         regionMatch(local,n,"SIO") ||
286                         regionMatch(local,n,"SIA")) {
287                         code.append('X');
288                     } else {
289                         code.append('S');
290                     }
291                     break;
292                 case 'T':
293                     if (regionMatch(local,n,"TIA") ||
294                         regionMatch(local,n,"TIO")) {
295                         code.append('X');
296                         break;
297                     }
298                     if (regionMatch(local,n,"TCH")) {
299                         // Silent if in "TCH"
300                         break;
301                     }
302                     // substitute numeral 0 for TH (resembles theta after all)
303                     if (regionMatch(local,n,"TH")) {
304                         code.append('0');
305                     } else {
306                         code.append('T');
307                     }
308                     break;
309                 case 'V':
310                     code.append('F'); break;
311                 case 'W':
312                 case 'Y': // silent if not followed by vowel
313                     if (!isLastChar(wdsz,n) &&
314                         isVowel(local,n+1)) {
315                         code.append(symb);
316                     }
317                     break;
318                 case 'X':
319                     code.append('K');
320                     code.append('S');
321                     break;
322                 case 'Z':
323                     code.append('S');
324                     break;
325                 default:
326                     // do nothing
327                     break;
328                 } // end switch
329                 n++;
330             } // end else from symb != 'C'
331             if (code.length() > this.getMaxCodeLen()) {
332                 code.setLength(this.getMaxCodeLen());
333             }
334         }
335         return code.toString();
336     }
337 
338     private boolean isVowel(final StringBuilder string, final int index) {
339         return VOWELS.indexOf(string.charAt(index)) >= 0;
340     }
341 
342     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
343         boolean matches = false;
344         if( index > 0 &&
345             index < string.length() ) {
346             matches = string.charAt(index - 1) == c;
347         }
348         return matches;
349     }
350 
351     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
352         boolean matches = false;
353         if( index >= 0 &&
354             index < string.length() - 1 ) {
355             matches = string.charAt(index + 1) == c;
356         }
357         return matches;
358     }
359 
360     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
361         boolean matches = false;
362         if( index >= 0 &&
363             index + test.length() - 1 < string.length() ) {
364             final String substring = string.substring( index, index + test.length());
365             matches = substring.equals( test );
366         }
367         return matches;
368     }
369 
370     private boolean isLastChar(final int wdsz, final int n) {
371         return n + 1 == wdsz;
372     }
373 
374 
375     /**
376      * Encodes an Object using the metaphone algorithm.  This method
377      * is provided in order to satisfy the requirements of the
378      * Encoder interface, and will throw an EncoderException if the
379      * supplied object is not of type java.lang.String.
380      *
381      * @param obj Object to encode
382      * @return An object (or type java.lang.String) containing the
383      *         metaphone code which corresponds to the String supplied.
384      * @throws EncoderException if the parameter supplied is not
385      *                          of type java.lang.String
386      */
387     @Override
388     public Object encode(final Object obj) throws EncoderException {
389         if (!(obj instanceof String)) {
390             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
391         }
392         return metaphone((String) obj);
393     }
394 
395     /**
396      * Encodes a String using the Metaphone algorithm.
397      *
398      * @param str String object to encode
399      * @return The metaphone code corresponding to the String supplied
400      */
401     @Override
402     public String encode(final String str) {
403         return metaphone(str);
404     }
405 
406     /**
407      * Tests is the metaphones of two strings are identical.
408      *
409      * @param str1 First of two strings to compare
410      * @param str2 Second of two strings to compare
411      * @return <code>true</code> if the metaphones of these strings are identical,
412      *        <code>false</code> otherwise.
413      */
414     public boolean isMetaphoneEqual(final String str1, final String str2) {
415         return metaphone(str1).equals(metaphone(str2));
416     }
417 
418     /**
419      * Returns the maxCodeLen.
420      * @return int
421      */
422     public int getMaxCodeLen() { return this.maxCodeLen; }
423 
424     /**
425      * Sets the maxCodeLen.
426      * @param maxCodeLen The maxCodeLen to set
427      */
428     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
429 
430 }