View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import java.util.Locale;
21  
22  import org.apache.commons.codec.EncoderException;
23  import org.apache.commons.codec.StringEncoder;
24  
25  /**
26   * Encodes a string into a Metaphone value.
27   * <p>
28   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
29   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
30   * </p>
31   * <p>
32   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
33   * p 39.</CITE>
34   * </p>
35   * <p>
36   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
37   * </p>
38   * <ul>
39   * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
40   *  (broken link 4/30/2013) </li>
41   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
42   *  (link checked 4/30/2013) </li>
43   * </ul>
44   * <p>
45   * They have had undocumented changes from the originally published algorithm.
46   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
47   * </p>
48   * <p>
49   * This class is conditionally thread-safe.
50   * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
51   * but is not volatile, and accesses are not synchronized.
52   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
53   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
54   * after initial setup.
55   * </p>
56   */
57  public class Metaphone implements StringEncoder {
58  
59      /**
60       * Five values in the English language.
61       */
62      private static final String VOWELS = "AEIOU";
63  
64      /**
65       * Variable used in Metaphone algorithm.
66       */
67      private static final String FRONTV = "EIY";
68  
69      /**
70       * Variable used in Metaphone algorithm.
71       */
72      private static final String VARSON = "CSPTG";
73  
74      /**
75       * The max code length for Metaphone is 4.
76       */
77      private int maxCodeLen = 4;
78  
79      /**
80       * Constructs a new instance.
81       */
82      public Metaphone() {
83          // empty
84      }
85  
86      /**
87       * Encodes an Object using the Metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
88       * EncoderException if the supplied object is not of type {@link String}.
89       *
90       * @param obj Object to encode.
91       * @return An object (or type {@link String}) containing the Metaphone code which corresponds to the String supplied.
92       * @throws EncoderException if the parameter supplied is not of type {@link String}.
93       */
94      @Override
95      public Object encode(final Object obj) throws EncoderException {
96          if (!(obj instanceof String)) {
97              throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
98          }
99          return metaphone((String) obj);
100     }
101 
102     /**
103      * Encodes a String using the Metaphone algorithm.
104      *
105      * @param str String object to encode.
106      * @return The Metaphone code corresponding to the String supplied.
107      */
108     @Override
109     public String encode(final String str) {
110         return metaphone(str);
111     }
112 
113     /**
114      * Gets the maxCodeLen.
115      *
116      * @return the maxCodeLen.
117      */
118     public int getMaxCodeLen() {
119         return this.maxCodeLen;
120     }
121 
122     private boolean isLastChar(final int wdsz, final int n) {
123         return n + 1 == wdsz;
124     }
125 
126     /**
127      * Tests is the Metaphones of two strings are identical.
128      *
129      * @param str1 First of two strings to compare.
130      * @param str2 Second of two strings to compare.
131      * @return {@code true} if the Metaphones of these strings are identical, {@code false} otherwise.
132      */
133     public boolean isMetaphoneEqual(final String str1, final String str2) {
134         return metaphone(str1).equals(metaphone(str2));
135     }
136 
137     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
138         boolean matches = false;
139         if (index >= 0 && index < string.length() - 1) {
140             matches = string.charAt(index + 1) == c;
141         }
142         return matches;
143     }
144 
145     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
146         boolean matches = false;
147         if (index > 0 && index < string.length()) {
148             matches = string.charAt(index - 1) == c;
149         }
150         return matches;
151     }
152 
153     private boolean isVowel(final StringBuilder string, final int index) {
154         return VOWELS.indexOf(string.charAt(index)) >= 0;
155     }
156 
157     /**
158      * Find the Metaphone value of a String. This is similar to the
159      * Soundex algorithm, but better at finding similar sounding words.
160      * All input is converted to upper case.
161      * Limitations: Input format is expected to be a single ASCII word
162      * with only characters in the A - Z range, no punctuation or numbers.
163      *
164      * @param txt String to find the Metaphone code for.
165      * @return A Metaphone code corresponding to the String supplied.
166      */
167     public String metaphone(final String txt) {
168         boolean hard = false;
169         final int txtLength;
170         if (txt == null || (txtLength = txt.length()) == 0) {
171             return "";
172         }
173         // single character is itself
174         if (txtLength == 1) {
175             return txt.toUpperCase(Locale.ENGLISH);
176         }
177 
178         final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray();
179 
180         final StringBuilder local = new StringBuilder(40); // manipulate
181         final StringBuilder code = new StringBuilder(10); // output
182         // handle initial 2 characters exceptions
183         switch (inwd[0]) {
184         case 'K':
185         case 'G':
186         case 'P': /* looking for KN, etc */
187             if (inwd[1] == 'N') {
188                 local.append(inwd, 1, inwd.length - 1);
189             } else {
190                 local.append(inwd);
191             }
192             break;
193         case 'A': /* looking for AE */
194             if (inwd[1] == 'E') {
195                 local.append(inwd, 1, inwd.length - 1);
196             } else {
197                 local.append(inwd);
198             }
199             break;
200         case 'W': /* looking for WR or WH */
201             if (inwd[1] == 'R') { // WR -> R
202                 local.append(inwd, 1, inwd.length - 1);
203                 break;
204             }
205             if (inwd[1] == 'H') {
206                 local.append(inwd, 1, inwd.length - 1);
207                 local.setCharAt(0, 'W'); // WH -> W
208             } else {
209                 local.append(inwd);
210             }
211             break;
212         case 'X': /* initial X becomes S */
213             inwd[0] = 'S';
214             local.append(inwd);
215             break;
216         default:
217             local.append(inwd);
218         } // now local has working string with initials fixed
219 
220         final int wdsz = local.length();
221         int n = 0;
222 
223         while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
224             final char symb = local.charAt(n);
225             // remove duplicate letters except C
226             if (symb == 'C' || !isPreviousChar(local, n, symb)) {
227                 // not dup
228                 switch (symb) {
229                 case 'A':
230                 case 'E':
231                 case 'I':
232                 case 'O':
233                 case 'U':
234                     if (n == 0) {
235                         code.append(symb);
236                     }
237                     break; // only use vowel if leading char
238                 case 'B':
239                     if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
240                         break;
241                     }
242                     code.append(symb);
243                     break;
244                 case 'C': // lots of C special cases
245                     /* discard if SCI, SCE or SCY */
246                     if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247                         break;
248                     }
249                     if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
250                         code.append('K');
251                         break;
252                     }
253                     if (regionMatch(local, n, "CIA") || isNextChar(local, n, 'H')) { // "CIA" -> X or CH -> X
254                         code.append('X');
255                         break;
256                     }
257                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
258                         code.append('S');
259                         break; // CI,CE,CY -> S
260                     }
261                     code.append('K'); // default C -> K
262                     break;
263                 case 'D':
264                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
265                         code.append('J');
266                         n += 2;
267                     } else {
268                         code.append('T');
269                     }
270                     break;
271                 case 'G': // GH silent at end or before consonant
272                     if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
273                         break;
274                     }
275                     if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
276                         break;
277                     }
278                     if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
279                         break; // silent G
280                     }
281                     // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
282                     hard = isPreviousChar(local, n, 'G');
283                     if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
284                         code.append('J');
285                     } else {
286                         code.append('K');
287                     }
288                     break;
289                 case 'H':
290                     if (isLastChar(wdsz, n)) {
291                         break; // terminal H
292                     }
293                     if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
294                         break;
295                     }
296                     if (isVowel(local, n + 1)) {
297                         code.append('H'); // Hvowel
298                     }
299                     break;
300                 case 'F':
301                 case 'J':
302                 case 'L':
303                 case 'M':
304                 case 'N':
305                 case 'R':
306                     code.append(symb);
307                     break;
308                 case 'K':
309                     if (n > 0) { // not initial
310                         if (!isPreviousChar(local, n, 'C')) {
311                             code.append(symb);
312                         }
313                     } else {
314                         code.append(symb); // initial K
315                     }
316                     break;
317                 case 'P':
318                     if (isNextChar(local, n, 'H')) {
319                         // PH -> F
320                         code.append('F');
321                     } else {
322                         code.append(symb);
323                     }
324                     break;
325                 case 'Q':
326                     code.append('K');
327                     break;
328                 case 'S':
329                     if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
330                         code.append('X');
331                     } else {
332                         code.append('S');
333                     }
334                     break;
335                 case 'T':
336                     if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
337                         code.append('X');
338                         break;
339                     }
340                     if (regionMatch(local, n, "TCH")) {
341                         // Silent if in "TCH"
342                         break;
343                     }
344                     // substitute numeral 0 for TH (resembles theta after all)
345                     if (regionMatch(local, n, "TH")) {
346                         code.append('0');
347                     } else {
348                         code.append('T');
349                     }
350                     break;
351                 case 'V':
352                     code.append('F');
353                     break;
354                 case 'W':
355                 case 'Y': // silent if not followed by vowel
356                     if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
357                         code.append(symb);
358                     }
359                     break;
360                 case 'X':
361                     code.append('K');
362                     code.append('S');
363                     break;
364                 case 'Z':
365                     code.append('S');
366                     break;
367                 default:
368                     // do nothing
369                     break;
370                 } // end switch
371             } // end else from symb != 'C'
372             n++;
373             if (code.length() > getMaxCodeLen()) {
374                 code.setLength(getMaxCodeLen());
375             }
376         }
377         return code.toString();
378     }
379 
380     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
381         boolean matches = false;
382         if (index >= 0 && index + test.length() - 1 < string.length()) {
383             final String substring = string.substring(index, index + test.length());
384             matches = substring.equals(test);
385         }
386         return matches;
387     }
388 
389     /**
390      * Sets the maxCodeLen.
391      *
392      * @param maxCodeLen The maxCodeLen to set.
393      */
394     public void setMaxCodeLen(final int maxCodeLen) {
395         this.maxCodeLen = maxCodeLen;
396     }
397 
398 }