View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * </p>
29   * <p>
30   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
31   * p 39.</CITE>
32   * </p>
33   * <p>
34   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
35   * </p>
36   * <ul>
37   * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
38   *  (broken link 4/30/2013) </li>
39   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
40   *  (link checked 4/30/2013) </li>
41   * </ul>
42   * <p>
43   * They have had undocumented changes from the originally published algorithm.
44   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
45   * </p>
46   * <p>
47   * This class is conditionally thread-safe.
48   * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
49   * but is not volatile, and accesses are not synchronized.
50   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
51   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
52   * after initial setup.
53   * </p>
54   */
55  public class Metaphone implements StringEncoder {
56  
57      /**
58       * Five values in the English language
59       */
60      private static final String VOWELS = "AEIOU";
61  
62      /**
63       * Variable used in Metaphone algorithm
64       */
65      private static final String FRONTV = "EIY";
66  
67      /**
68       * Variable used in Metaphone algorithm
69       */
70      private static final String VARSON = "CSPTG";
71  
72      /**
73       * The max code length for metaphone is 4
74       */
75      private int maxCodeLen = 4;
76  
77      /**
78       * Encodes an Object using the metaphone algorithm.  This method
79       * is provided in order to satisfy the requirements of the
80       * Encoder interface, and will throw an EncoderException if the
81       * supplied object is not of type {@link String}.
82       *
83       * @param obj Object to encode
84       * @return An object (or type {@link String}) containing the
85       *         metaphone code which corresponds to the String supplied.
86       * @throws EncoderException if the parameter supplied is not
87       *                          of type {@link String}
88       */
89      @Override
90      public Object encode(final Object obj) throws EncoderException {
91          if (!(obj instanceof String)) {
92              throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
93          }
94          return metaphone((String) obj);
95      }
96  
97      /**
98       * Encodes a String using the Metaphone algorithm.
99       *
100      * @param str String object to encode
101      * @return The metaphone code corresponding to the String supplied
102      */
103     @Override
104     public String encode(final String str) {
105         return metaphone(str);
106     }
107 
108     /**
109      * Returns the maxCodeLen.
110      * @return int
111      */
112     public int getMaxCodeLen() { return this.maxCodeLen; }
113 
114     private boolean isLastChar(final int wdsz, final int n) {
115         return n + 1 == wdsz;
116     }
117 
118     /**
119      * Tests is the metaphones of two strings are identical.
120      *
121      * @param str1 First of two strings to compare
122      * @param str2 Second of two strings to compare
123      * @return {@code true} if the metaphones of these strings are identical,
124      *        {@code false} otherwise.
125      */
126     public boolean isMetaphoneEqual(final String str1, final String str2) {
127         return metaphone(str1).equals(metaphone(str2));
128     }
129 
130     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
131         boolean matches = false;
132         if (index >= 0 && index < string.length() - 1) {
133             matches = string.charAt(index + 1) == c;
134         }
135         return matches;
136     }
137 
138     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
139         boolean matches = false;
140         if (index > 0 && index < string.length()) {
141             matches = string.charAt(index - 1) == c;
142         }
143         return matches;
144     }
145 
146     private boolean isVowel(final StringBuilder string, final int index) {
147         return VOWELS.indexOf(string.charAt(index)) >= 0;
148     }
149 
150     /**
151      * Find the metaphone value of a String. This is similar to the
152      * soundex algorithm, but better at finding similar sounding words.
153      * All input is converted to upper case.
154      * Limitations: Input format is expected to be a single ASCII word
155      * with only characters in the A - Z range, no punctuation or numbers.
156      *
157      * @param txt String to find the metaphone code for
158      * @return A metaphone code corresponding to the String supplied
159      */
160     public String metaphone(final String txt) {
161         boolean hard = false;
162         final int txtLength;
163         if (txt == null || (txtLength = txt.length()) == 0) {
164             return "";
165         }
166         // single character is itself
167         if (txtLength == 1) {
168             return txt.toUpperCase(java.util.Locale.ENGLISH);
169         }
170 
171         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
172 
173         final StringBuilder local = new StringBuilder(40); // manipulate
174         final StringBuilder code = new StringBuilder(10); // output
175         // handle initial 2 characters exceptions
176         switch (inwd[0]) {
177         case 'K':
178         case 'G':
179         case 'P': /* looking for KN, etc */
180             if (inwd[1] == 'N') {
181                 local.append(inwd, 1, inwd.length - 1);
182             } else {
183                 local.append(inwd);
184             }
185             break;
186         case 'A': /* looking for AE */
187             if (inwd[1] == 'E') {
188                 local.append(inwd, 1, inwd.length - 1);
189             } else {
190                 local.append(inwd);
191             }
192             break;
193         case 'W': /* looking for WR or WH */
194             if (inwd[1] == 'R') { // WR -> R
195                 local.append(inwd, 1, inwd.length - 1);
196                 break;
197             }
198             if (inwd[1] == 'H') {
199                 local.append(inwd, 1, inwd.length - 1);
200                 local.setCharAt(0, 'W'); // WH -> W
201             } else {
202                 local.append(inwd);
203             }
204             break;
205         case 'X': /* initial X becomes S */
206             inwd[0] = 'S';
207             local.append(inwd);
208             break;
209         default:
210             local.append(inwd);
211         } // now local has working string with initials fixed
212 
213         final int wdsz = local.length();
214         int n = 0;
215 
216         while (code.length() < this.getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
217             final char symb = local.charAt(n);
218             // remove duplicate letters except C
219             if (symb != 'C' && isPreviousChar(local, n, symb)) {
220                 n++;
221             } else { // not dup
222                 switch (symb) {
223                 case 'A':
224                 case 'E':
225                 case 'I':
226                 case 'O':
227                 case 'U':
228                     if (n == 0) {
229                         code.append(symb);
230                     }
231                     break; // only use vowel if leading char
232                 case 'B':
233                     if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
234                         break;
235                     }
236                     code.append(symb);
237                     break;
238                 case 'C': // lots of C special cases
239                     /* discard if SCI, SCE or SCY */
240                     if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
241                         break;
242                     }
243                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
244                         code.append('X');
245                         break;
246                     }
247                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
248                         code.append('S');
249                         break; // CI,CE,CY -> S
250                     }
251                     if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
252                         code.append('K');
253                         break;
254                     }
255                     if (isNextChar(local, n, 'H')) { // detect CH
256                         if (n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
257                             code.append('K');
258                         } else {
259                             code.append('X'); // CHvowel -> X
260                         }
261                     } else {
262                         code.append('K');
263                     }
264                     break;
265                 case 'D':
266                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
267                         code.append('J');
268                         n += 2;
269                     } else {
270                         code.append('T');
271                     }
272                     break;
273                 case 'G': // GH silent at end or before consonant
274                     if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
275                         break;
276                     }
277                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
278                         break;
279                     }
280                     if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
281                         break; // silent G
282                     }
283                     // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
284                     hard = isPreviousChar(local, n, 'G');
285                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
286                         code.append('J');
287                     } else {
288                         code.append('K');
289                     }
290                     break;
291                 case 'H':
292                     if (isLastChar(wdsz, n)) {
293                         break; // terminal H
294                     }
295                     if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
296                         break;
297                     }
298                     if (isVowel(local, n + 1)) {
299                         code.append('H'); // Hvowel
300                     }
301                     break;
302                 case 'F':
303                 case 'J':
304                 case 'L':
305                 case 'M':
306                 case 'N':
307                 case 'R':
308                     code.append(symb);
309                     break;
310                 case 'K':
311                     if (n > 0) { // not initial
312                         if (!isPreviousChar(local, n, 'C')) {
313                             code.append(symb);
314                         }
315                     } else {
316                         code.append(symb); // initial K
317                     }
318                     break;
319                 case 'P':
320                     if (isNextChar(local, n, 'H')) {
321                         // PH -> F
322                         code.append('F');
323                     } else {
324                         code.append(symb);
325                     }
326                     break;
327                 case 'Q':
328                     code.append('K');
329                     break;
330                 case 'S':
331                     if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
332                         code.append('X');
333                     } else {
334                         code.append('S');
335                     }
336                     break;
337                 case 'T':
338                     if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
339                         code.append('X');
340                         break;
341                     }
342                     if (regionMatch(local, n, "TCH")) {
343                         // Silent if in "TCH"
344                         break;
345                     }
346                     // substitute numeral 0 for TH (resembles theta after all)
347                     if (regionMatch(local, n, "TH")) {
348                         code.append('0');
349                     } else {
350                         code.append('T');
351                     }
352                     break;
353                 case 'V':
354                     code.append('F');
355                     break;
356                 case 'W':
357                 case 'Y': // silent if not followed by vowel
358                     if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
359                         code.append(symb);
360                     }
361                     break;
362                 case 'X':
363                     code.append('K');
364                     code.append('S');
365                     break;
366                 case 'Z':
367                     code.append('S');
368                     break;
369                 default:
370                     // do nothing
371                     break;
372                 } // end switch
373                 n++;
374             } // end else from symb != 'C'
375             if (code.length() > this.getMaxCodeLen()) {
376                 code.setLength(this.getMaxCodeLen());
377             }
378         }
379         return code.toString();
380     }
381 
382     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
383         boolean matches = false;
384         if (index >= 0 && index + test.length() - 1 < string.length()) {
385             final String substring = string.substring(index, index + test.length());
386             matches = substring.equals(test);
387         }
388         return matches;
389     }
390 
391     /**
392      * Sets the maxCodeLen.
393      * @param maxCodeLen The maxCodeLen to set
394      */
395     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
396 
397 }