View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import java.util.Locale;
21  
22  import org.apache.commons.codec.EncoderException;
23  import org.apache.commons.codec.StringEncoder;
24  
25  /**
26   * Encodes a string into a Metaphone value.
27   * <p>
28   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
29   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
30   * </p>
31   * <p>
32   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
33   * p 39.</CITE>
34   * </p>
35   * <p>
36   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
37   * </p>
38   * <ul>
39   * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
40   *  (broken link 4/30/2013) </li>
41   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
42   *  (link checked 4/30/2013) </li>
43   * </ul>
44   * <p>
45   * They have had undocumented changes from the originally published algorithm.
46   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
47   * </p>
48   * <p>
49   * This class is conditionally thread-safe.
50   * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
51   * but is not volatile, and accesses are not synchronized.
52   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
53   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
54   * after initial setup.
55   * </p>
56   */
57  public class Metaphone implements StringEncoder {
58  
59      /**
60       * Five values in the English language.
61       */
62      private static final String VOWELS = "AEIOU";
63  
64      /**
65       * Variable used in Metaphone algorithm.
66       */
67      private static final String FRONTV = "EIY";
68  
69      /**
70       * Variable used in Metaphone algorithm.
71       */
72      private static final String VARSON = "CSPTG";
73  
74      /**
75       * The max code length for Metaphone is 4.
76       */
77      private int maxCodeLen = 4;
78  
79      /**
80       * Constructs a new instance.
81       */
82      public Metaphone() {
83          // empty
84      }
85  
86      /**
87       * Encodes an Object using the Metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
88       * EncoderException if the supplied object is not of type {@link String}.
89       *
90       * @param obj Object to encode.
91       * @return An object (or type {@link String}) containing the Metaphone code which corresponds to the String supplied.
92       * @throws EncoderException if the parameter supplied is not of type {@link String}.
93       */
94      @Override
95      public Object encode(final Object obj) throws EncoderException {
96          if (!(obj instanceof String)) {
97              throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
98          }
99          return metaphone((String) obj);
100     }
101 
102     /**
103      * Encodes a String using the Metaphone algorithm.
104      *
105      * @param str String object to encode
106      * @return The Metaphone code corresponding to the String supplied
107      */
108     @Override
109     public String encode(final String str) {
110         return metaphone(str);
111     }
112 
113     /**
114      * Gets the maxCodeLen.
115      *
116      * @return the maxCodeLen.
117      */
118     public int getMaxCodeLen() {
119         return this.maxCodeLen;
120     }
121 
122     private boolean isLastChar(final int wdsz, final int n) {
123         return n + 1 == wdsz;
124     }
125 
126     /**
127      * Tests is the Metaphones of two strings are identical.
128      *
129      * @param str1 First of two strings to compare.
130      * @param str2 Second of two strings to compare.
131      * @return {@code true} if the Metaphones of these strings are identical, {@code false} otherwise.
132      */
133     public boolean isMetaphoneEqual(final String str1, final String str2) {
134         return metaphone(str1).equals(metaphone(str2));
135     }
136 
137     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
138         boolean matches = false;
139         if (index >= 0 && index < string.length() - 1) {
140             matches = string.charAt(index + 1) == c;
141         }
142         return matches;
143     }
144 
145     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
146         boolean matches = false;
147         if (index > 0 && index < string.length()) {
148             matches = string.charAt(index - 1) == c;
149         }
150         return matches;
151     }
152 
153     private boolean isVowel(final StringBuilder string, final int index) {
154         return VOWELS.indexOf(string.charAt(index)) >= 0;
155     }
156 
157     /**
158      * Find the Metaphone value of a String. This is similar to the
159      * Soundex algorithm, but better at finding similar sounding words.
160      * All input is converted to upper case.
161      * Limitations: Input format is expected to be a single ASCII word
162      * with only characters in the A - Z range, no punctuation or numbers.
163      *
164      * @param txt String to find the Metaphone code for.
165      * @return A Metaphone code corresponding to the String supplied.
166      */
167     public String metaphone(final String txt) {
168         boolean hard = false;
169         final int txtLength;
170         if (txt == null || (txtLength = txt.length()) == 0) {
171             return "";
172         }
173         // single character is itself
174         if (txtLength == 1) {
175             return txt.toUpperCase(Locale.ENGLISH);
176         }
177 
178         final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray();
179 
180         final StringBuilder local = new StringBuilder(40); // manipulate
181         final StringBuilder code = new StringBuilder(10); // output
182         // handle initial 2 characters exceptions
183         switch (inwd[0]) {
184         case 'K':
185         case 'G':
186         case 'P': /* looking for KN, etc */
187             if (inwd[1] == 'N') {
188                 local.append(inwd, 1, inwd.length - 1);
189             } else {
190                 local.append(inwd);
191             }
192             break;
193         case 'A': /* looking for AE */
194             if (inwd[1] == 'E') {
195                 local.append(inwd, 1, inwd.length - 1);
196             } else {
197                 local.append(inwd);
198             }
199             break;
200         case 'W': /* looking for WR or WH */
201             if (inwd[1] == 'R') { // WR -> R
202                 local.append(inwd, 1, inwd.length - 1);
203                 break;
204             }
205             if (inwd[1] == 'H') {
206                 local.append(inwd, 1, inwd.length - 1);
207                 local.setCharAt(0, 'W'); // WH -> W
208             } else {
209                 local.append(inwd);
210             }
211             break;
212         case 'X': /* initial X becomes S */
213             inwd[0] = 'S';
214             local.append(inwd);
215             break;
216         default:
217             local.append(inwd);
218         } // now local has working string with initials fixed
219 
220         final int wdsz = local.length();
221         int n = 0;
222 
223         while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
224             final char symb = local.charAt(n);
225             // remove duplicate letters except C
226             if (symb == 'C' || !isPreviousChar(local, n, symb)) {
227                 // not dup
228                 switch (symb) {
229                 case 'A':
230                 case 'E':
231                 case 'I':
232                 case 'O':
233                 case 'U':
234                     if (n == 0) {
235                         code.append(symb);
236                     }
237                     break; // only use vowel if leading char
238                 case 'B':
239                     if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
240                         break;
241                     }
242                     code.append(symb);
243                     break;
244                 case 'C': // lots of C special cases
245                     /* discard if SCI, SCE or SCY */
246                     if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247                         break;
248                     }
249                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
250                         code.append('X');
251                         break;
252                     }
253                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
254                         code.append('S');
255                         break; // CI,CE,CY -> S
256                     }
257                     if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
258                         code.append('K');
259                         break;
260                     }
261                     if (!isNextChar(local, n, 'H') || n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
262                         code.append('K');
263                     } else {
264                         code.append('X'); // CHvowel -> X
265                     }
266                     break;
267                 case 'D':
268                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
269                         code.append('J');
270                         n += 2;
271                     } else {
272                         code.append('T');
273                     }
274                     break;
275                 case 'G': // GH silent at end or before consonant
276                     if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
277                         break;
278                     }
279                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
280                         break;
281                     }
282                     if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
283                         break; // silent G
284                     }
285                     // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
286                     hard = isPreviousChar(local, n, 'G');
287                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
288                         code.append('J');
289                     } else {
290                         code.append('K');
291                     }
292                     break;
293                 case 'H':
294                     if (isLastChar(wdsz, n)) {
295                         break; // terminal H
296                     }
297                     if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
298                         break;
299                     }
300                     if (isVowel(local, n + 1)) {
301                         code.append('H'); // Hvowel
302                     }
303                     break;
304                 case 'F':
305                 case 'J':
306                 case 'L':
307                 case 'M':
308                 case 'N':
309                 case 'R':
310                     code.append(symb);
311                     break;
312                 case 'K':
313                     if (n > 0) { // not initial
314                         if (!isPreviousChar(local, n, 'C')) {
315                             code.append(symb);
316                         }
317                     } else {
318                         code.append(symb); // initial K
319                     }
320                     break;
321                 case 'P':
322                     if (isNextChar(local, n, 'H')) {
323                         // PH -> F
324                         code.append('F');
325                     } else {
326                         code.append(symb);
327                     }
328                     break;
329                 case 'Q':
330                     code.append('K');
331                     break;
332                 case 'S':
333                     if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
334                         code.append('X');
335                     } else {
336                         code.append('S');
337                     }
338                     break;
339                 case 'T':
340                     if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
341                         code.append('X');
342                         break;
343                     }
344                     if (regionMatch(local, n, "TCH")) {
345                         // Silent if in "TCH"
346                         break;
347                     }
348                     // substitute numeral 0 for TH (resembles theta after all)
349                     if (regionMatch(local, n, "TH")) {
350                         code.append('0');
351                     } else {
352                         code.append('T');
353                     }
354                     break;
355                 case 'V':
356                     code.append('F');
357                     break;
358                 case 'W':
359                 case 'Y': // silent if not followed by vowel
360                     if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
361                         code.append(symb);
362                     }
363                     break;
364                 case 'X':
365                     code.append('K');
366                     code.append('S');
367                     break;
368                 case 'Z':
369                     code.append('S');
370                     break;
371                 default:
372                     // do nothing
373                     break;
374                 } // end switch
375             } // end else from symb != 'C'
376             n++;
377             if (code.length() > getMaxCodeLen()) {
378                 code.setLength(getMaxCodeLen());
379             }
380         }
381         return code.toString();
382     }
383 
384     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
385         boolean matches = false;
386         if (index >= 0 && index + test.length() - 1 < string.length()) {
387             final String substring = string.substring(index, index + test.length());
388             matches = substring.equals(test);
389         }
390         return matches;
391     }
392 
393     /**
394      * Sets the maxCodeLen.
395      *
396      * @param maxCodeLen The maxCodeLen to set.
397      */
398     public void setMaxCodeLen(final int maxCodeLen) {
399         this.maxCodeLen = maxCodeLen;
400     }
401 
402 }