View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * <p>
29   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
30   * p 39.</CITE>
31   * <p>
32   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl
33   * <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
34   * They have had undocumented changes from the originally published algorithm.
35   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
36   * <p>
37   * This class is conditionally thread-safe.
38   * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
39   * but is not volatile, and accesses are not synchronised.
40   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronisation
41   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
42   * after initial setup.
43   *
44   * @version $Id: Metaphone.html 889935 2013-12-11 05:05:13Z ggregory $
45   */
46  public class Metaphone implements StringEncoder {
47  
48      /**
49       * Five values in the English language
50       */
51      private static final String VOWELS = "AEIOU";
52  
53      /**
54       * Variable used in Metaphone algorithm
55       */
56      private static final String FRONTV = "EIY";
57  
58      /**
59       * Variable used in Metaphone algorithm
60       */
61      private static final String VARSON = "CSPTG";
62  
63      /**
64       * The max code length for metaphone is 4
65       */
66      private int maxCodeLen = 4;
67  
68      /**
69       * Creates an instance of the Metaphone encoder
70       */
71      public Metaphone() {
72          super();
73      }
74  
75      /**
76       * Find the metaphone value of a String. This is similar to the
77       * soundex algorithm, but better at finding similar sounding words.
78       * All input is converted to upper case.
79       * Limitations: Input format is expected to be a single ASCII word
80       * with only characters in the A - Z range, no punctuation or numbers.
81       *
82       * @param txt String to find the metaphone code for
83       * @return A metaphone code corresponding to the String supplied
84       */
85      public String metaphone(String txt) {
86          boolean hard = false;
87          if (txt == null || txt.length() == 0) {
88              return "";
89          }
90          // single character is itself
91          if (txt.length() == 1) {
92              return txt.toUpperCase(java.util.Locale.ENGLISH);
93          }
94  
95          char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
96  
97          StringBuilder local = new StringBuilder(40); // manipulate
98          StringBuilder code = new StringBuilder(10); //   output
99          // handle initial 2 characters exceptions
100         switch(inwd[0]) {
101         case 'K':
102         case 'G':
103         case 'P': /* looking for KN, etc*/
104             if (inwd[1] == 'N') {
105                 local.append(inwd, 1, inwd.length - 1);
106             } else {
107                 local.append(inwd);
108             }
109             break;
110         case 'A': /* looking for AE */
111             if (inwd[1] == 'E') {
112                 local.append(inwd, 1, inwd.length - 1);
113             } else {
114                 local.append(inwd);
115             }
116             break;
117         case 'W': /* looking for WR or WH */
118             if (inwd[1] == 'R') {   // WR -> R
119                 local.append(inwd, 1, inwd.length - 1);
120                 break;
121             }
122             if (inwd[1] == 'H') {
123                 local.append(inwd, 1, inwd.length - 1);
124                 local.setCharAt(0, 'W'); // WH -> W
125             } else {
126                 local.append(inwd);
127             }
128             break;
129         case 'X': /* initial X becomes S */
130             inwd[0] = 'S';
131             local.append(inwd);
132             break;
133         default:
134             local.append(inwd);
135         } // now local has working string with initials fixed
136 
137         int wdsz = local.length();
138         int n = 0;
139 
140         while (code.length() < this.getMaxCodeLen() &&
141                n < wdsz ) { // max code size of 4 works well
142             char symb = local.charAt(n);
143             // remove duplicate letters except C
144             if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
145                 n++;
146             } else { // not dup
147                 switch(symb) {
148                 case 'A':
149                 case 'E':
150                 case 'I':
151                 case 'O':
152                 case 'U':
153                     if (n == 0) {
154                         code.append(symb);
155                     }
156                     break; // only use vowel if leading char
157                 case 'B':
158                     if ( isPreviousChar(local, n, 'M') &&
159                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
160                         break;
161                     }
162                     code.append(symb);
163                     break;
164                 case 'C': // lots of C special cases
165                     /* discard if SCI, SCE or SCY */
166                     if ( isPreviousChar(local, n, 'S') &&
167                          !isLastChar(wdsz, n) &&
168                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
169                         break;
170                     }
171                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
172                         code.append('X');
173                         break;
174                     }
175                     if (!isLastChar(wdsz, n) &&
176                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
177                         code.append('S');
178                         break; // CI,CE,CY -> S
179                     }
180                     if (isPreviousChar(local, n, 'S') &&
181                         isNextChar(local, n, 'H') ) { // SCH->sk
182                         code.append('K');
183                         break;
184                     }
185                     if (isNextChar(local, n, 'H')) { // detect CH
186                         if (n == 0 &&
187                             wdsz >= 3 &&
188                             isVowel(local,2) ) { // CH consonant -> K consonant
189                             code.append('K');
190                         } else {
191                             code.append('X'); // CHvowel -> X
192                         }
193                     } else {
194                         code.append('K');
195                     }
196                     break;
197                 case 'D':
198                     if (!isLastChar(wdsz, n + 1) &&
199                         isNextChar(local, n, 'G') &&
200                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
201                         code.append('J'); n += 2;
202                     } else {
203                         code.append('T');
204                     }
205                     break;
206                 case 'G': // GH silent at end or before consonant
207                     if (isLastChar(wdsz, n + 1) &&
208                         isNextChar(local, n, 'H')) {
209                         break;
210                     }
211                     if (!isLastChar(wdsz, n + 1) &&
212                         isNextChar(local,n,'H') &&
213                         !isVowel(local,n+2)) {
214                         break;
215                     }
216                     if (n > 0 &&
217                         ( regionMatch(local, n, "GN") ||
218                           regionMatch(local, n, "GNED") ) ) {
219                         break; // silent G
220                     }
221                     if (isPreviousChar(local, n, 'G')) {
222                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
223                         hard = true;
224                     } else {
225                         hard = false;
226                     }
227                     if (!isLastChar(wdsz, n) &&
228                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
229                         !hard) {
230                         code.append('J');
231                     } else {
232                         code.append('K');
233                     }
234                     break;
235                 case 'H':
236                     if (isLastChar(wdsz, n)) {
237                         break; // terminal H
238                     }
239                     if (n > 0 &&
240                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
241                         break;
242                     }
243                     if (isVowel(local,n+1)) {
244                         code.append('H'); // Hvowel
245                     }
246                     break;
247                 case 'F':
248                 case 'J':
249                 case 'L':
250                 case 'M':
251                 case 'N':
252                 case 'R':
253                     code.append(symb);
254                     break;
255                 case 'K':
256                     if (n > 0) { // not initial
257                         if (!isPreviousChar(local, n, 'C')) {
258                             code.append(symb);
259                         }
260                     } else {
261                         code.append(symb); // initial K
262                     }
263                     break;
264                 case 'P':
265                     if (isNextChar(local,n,'H')) {
266                         // PH -> F
267                         code.append('F');
268                     } else {
269                         code.append(symb);
270                     }
271                     break;
272                 case 'Q':
273                     code.append('K');
274                     break;
275                 case 'S':
276                     if (regionMatch(local,n,"SH") ||
277                         regionMatch(local,n,"SIO") ||
278                         regionMatch(local,n,"SIA")) {
279                         code.append('X');
280                     } else {
281                         code.append('S');
282                     }
283                     break;
284                 case 'T':
285                     if (regionMatch(local,n,"TIA") ||
286                         regionMatch(local,n,"TIO")) {
287                         code.append('X');
288                         break;
289                     }
290                     if (regionMatch(local,n,"TCH")) {
291                         // Silent if in "TCH"
292                         break;
293                     }
294                     // substitute numeral 0 for TH (resembles theta after all)
295                     if (regionMatch(local,n,"TH")) {
296                         code.append('0');
297                     } else {
298                         code.append('T');
299                     }
300                     break;
301                 case 'V':
302                     code.append('F'); break;
303                 case 'W':
304                 case 'Y': // silent if not followed by vowel
305                     if (!isLastChar(wdsz,n) &&
306                         isVowel(local,n+1)) {
307                         code.append(symb);
308                     }
309                     break;
310                 case 'X':
311                     code.append('K');
312                     code.append('S');
313                     break;
314                 case 'Z':
315                     code.append('S');
316                     break;
317                 default:
318                     // do nothing
319                     break;
320                 } // end switch
321                 n++;
322             } // end else from symb != 'C'
323             if (code.length() > this.getMaxCodeLen()) {
324                 code.setLength(this.getMaxCodeLen());
325             }
326         }
327         return code.toString();
328     }
329 
330     private boolean isVowel(StringBuilder string, int index) {
331         return VOWELS.indexOf(string.charAt(index)) >= 0;
332     }
333 
334     private boolean isPreviousChar(StringBuilder string, int index, char c) {
335         boolean matches = false;
336         if( index > 0 &&
337             index < string.length() ) {
338             matches = string.charAt(index - 1) == c;
339         }
340         return matches;
341     }
342 
343     private boolean isNextChar(StringBuilder string, int index, char c) {
344         boolean matches = false;
345         if( index >= 0 &&
346             index < string.length() - 1 ) {
347             matches = string.charAt(index + 1) == c;
348         }
349         return matches;
350     }
351 
352     private boolean regionMatch(StringBuilder string, int index, String test) {
353         boolean matches = false;
354         if( index >= 0 &&
355             index + test.length() - 1 < string.length() ) {
356             String substring = string.substring( index, index + test.length());
357             matches = substring.equals( test );
358         }
359         return matches;
360     }
361 
362     private boolean isLastChar(int wdsz, int n) {
363         return n + 1 == wdsz;
364     }
365 
366 
367     /**
368      * Encodes an Object using the metaphone algorithm.  This method
369      * is provided in order to satisfy the requirements of the
370      * Encoder interface, and will throw an EncoderException if the
371      * supplied object is not of type java.lang.String.
372      *
373      * @param obj Object to encode
374      * @return An object (or type java.lang.String) containing the
375      *         metaphone code which corresponds to the String supplied.
376      * @throws EncoderException if the parameter supplied is not
377      *                          of type java.lang.String
378      */
379     @Override
380     public Object encode(Object obj) throws EncoderException {
381         if (!(obj instanceof String)) {
382             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
383         }
384         return metaphone((String) obj);
385     }
386 
387     /**
388      * Encodes a String using the Metaphone algorithm.
389      *
390      * @param str String object to encode
391      * @return The metaphone code corresponding to the String supplied
392      */
393     @Override
394     public String encode(String str) {
395         return metaphone(str);
396     }
397 
398     /**
399      * Tests is the metaphones of two strings are identical.
400      *
401      * @param str1 First of two strings to compare
402      * @param str2 Second of two strings to compare
403      * @return {@code true} if the metaphones of these strings are identical,
404      *        {@code false} otherwise.
405      */
406     public boolean isMetaphoneEqual(String str1, String str2) {
407         return metaphone(str1).equals(metaphone(str2));
408     }
409 
410     /**
411      * Returns the maxCodeLen.
412      * @return int
413      */
414     public int getMaxCodeLen() { return this.maxCodeLen; }
415 
416     /**
417      * Sets the maxCodeLen.
418      * @param maxCodeLen The maxCodeLen to set
419      */
420     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
421 
422 }