View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * <p>
29   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
30   * p 39.</CITE>
31   * <p>
32   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
33   * </p>
34   * <ul>
35   * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </li>
36   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </li>
37   * </ul>
38   * <p>
39   * They have had undocumented changes from the originally published algorithm.
40   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
41   * <p>
42   * This class is conditionally thread-safe.
43   * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
44   * but is not volatile, and accesses are not synchronized.
45   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
46   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
47   * after initial setup.
48   *
49   * @version $Id: Metaphone.java 1477596 2013-04-30 12:42:29Z ggregory $
50   */
51  public class Metaphone implements StringEncoder {
52  
53      /**
54       * Five values in the English language
55       */
56      private static final String VOWELS = "AEIOU";
57  
58      /**
59       * Variable used in Metaphone algorithm
60       */
61      private static final String FRONTV = "EIY";
62  
63      /**
64       * Variable used in Metaphone algorithm
65       */
66      private static final String VARSON = "CSPTG";
67  
68      /**
69       * The max code length for metaphone is 4
70       */
71      private int maxCodeLen = 4;
72  
73      /**
74       * Creates an instance of the Metaphone encoder
75       */
76      public Metaphone() {
77          super();
78      }
79  
80      /**
81       * Find the metaphone value of a String. This is similar to the
82       * soundex algorithm, but better at finding similar sounding words.
83       * All input is converted to upper case.
84       * Limitations: Input format is expected to be a single ASCII word
85       * with only characters in the A - Z range, no punctuation or numbers.
86       *
87       * @param txt String to find the metaphone code for
88       * @return A metaphone code corresponding to the String supplied
89       */
90      public String metaphone(final String txt) {
91          boolean hard = false;
92          if (txt == null || txt.length() == 0) {
93              return "";
94          }
95          // single character is itself
96          if (txt.length() == 1) {
97              return txt.toUpperCase(java.util.Locale.ENGLISH);
98          }
99  
100         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
101 
102         final StringBuilder local = new StringBuilder(40); // manipulate
103         final StringBuilder code = new StringBuilder(10); //   output
104         // handle initial 2 characters exceptions
105         switch(inwd[0]) {
106         case 'K':
107         case 'G':
108         case 'P': /* looking for KN, etc*/
109             if (inwd[1] == 'N') {
110                 local.append(inwd, 1, inwd.length - 1);
111             } else {
112                 local.append(inwd);
113             }
114             break;
115         case 'A': /* looking for AE */
116             if (inwd[1] == 'E') {
117                 local.append(inwd, 1, inwd.length - 1);
118             } else {
119                 local.append(inwd);
120             }
121             break;
122         case 'W': /* looking for WR or WH */
123             if (inwd[1] == 'R') {   // WR -> R
124                 local.append(inwd, 1, inwd.length - 1);
125                 break;
126             }
127             if (inwd[1] == 'H') {
128                 local.append(inwd, 1, inwd.length - 1);
129                 local.setCharAt(0, 'W'); // WH -> W
130             } else {
131                 local.append(inwd);
132             }
133             break;
134         case 'X': /* initial X becomes S */
135             inwd[0] = 'S';
136             local.append(inwd);
137             break;
138         default:
139             local.append(inwd);
140         } // now local has working string with initials fixed
141 
142         final int wdsz = local.length();
143         int n = 0;
144 
145         while (code.length() < this.getMaxCodeLen() &&
146                n < wdsz ) { // max code size of 4 works well
147             final char symb = local.charAt(n);
148             // remove duplicate letters except C
149             if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
150                 n++;
151             } else { // not dup
152                 switch(symb) {
153                 case 'A':
154                 case 'E':
155                 case 'I':
156                 case 'O':
157                 case 'U':
158                     if (n == 0) {
159                         code.append(symb);
160                     }
161                     break; // only use vowel if leading char
162                 case 'B':
163                     if ( isPreviousChar(local, n, 'M') &&
164                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
165                         break;
166                     }
167                     code.append(symb);
168                     break;
169                 case 'C': // lots of C special cases
170                     /* discard if SCI, SCE or SCY */
171                     if ( isPreviousChar(local, n, 'S') &&
172                          !isLastChar(wdsz, n) &&
173                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
174                         break;
175                     }
176                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
177                         code.append('X');
178                         break;
179                     }
180                     if (!isLastChar(wdsz, n) &&
181                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
182                         code.append('S');
183                         break; // CI,CE,CY -> S
184                     }
185                     if (isPreviousChar(local, n, 'S') &&
186                         isNextChar(local, n, 'H') ) { // SCH->sk
187                         code.append('K');
188                         break;
189                     }
190                     if (isNextChar(local, n, 'H')) { // detect CH
191                         if (n == 0 &&
192                             wdsz >= 3 &&
193                             isVowel(local,2) ) { // CH consonant -> K consonant
194                             code.append('K');
195                         } else {
196                             code.append('X'); // CHvowel -> X
197                         }
198                     } else {
199                         code.append('K');
200                     }
201                     break;
202                 case 'D':
203                     if (!isLastChar(wdsz, n + 1) &&
204                         isNextChar(local, n, 'G') &&
205                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
206                         code.append('J'); n += 2;
207                     } else {
208                         code.append('T');
209                     }
210                     break;
211                 case 'G': // GH silent at end or before consonant
212                     if (isLastChar(wdsz, n + 1) &&
213                         isNextChar(local, n, 'H')) {
214                         break;
215                     }
216                     if (!isLastChar(wdsz, n + 1) &&
217                         isNextChar(local,n,'H') &&
218                         !isVowel(local,n+2)) {
219                         break;
220                     }
221                     if (n > 0 &&
222                         ( regionMatch(local, n, "GN") ||
223                           regionMatch(local, n, "GNED") ) ) {
224                         break; // silent G
225                     }
226                     if (isPreviousChar(local, n, 'G')) {
227                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
228                         hard = true;
229                     } else {
230                         hard = false;
231                     }
232                     if (!isLastChar(wdsz, n) &&
233                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
234                         !hard) {
235                         code.append('J');
236                     } else {
237                         code.append('K');
238                     }
239                     break;
240                 case 'H':
241                     if (isLastChar(wdsz, n)) {
242                         break; // terminal H
243                     }
244                     if (n > 0 &&
245                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
246                         break;
247                     }
248                     if (isVowel(local,n+1)) {
249                         code.append('H'); // Hvowel
250                     }
251                     break;
252                 case 'F':
253                 case 'J':
254                 case 'L':
255                 case 'M':
256                 case 'N':
257                 case 'R':
258                     code.append(symb);
259                     break;
260                 case 'K':
261                     if (n > 0) { // not initial
262                         if (!isPreviousChar(local, n, 'C')) {
263                             code.append(symb);
264                         }
265                     } else {
266                         code.append(symb); // initial K
267                     }
268                     break;
269                 case 'P':
270                     if (isNextChar(local,n,'H')) {
271                         // PH -> F
272                         code.append('F');
273                     } else {
274                         code.append(symb);
275                     }
276                     break;
277                 case 'Q':
278                     code.append('K');
279                     break;
280                 case 'S':
281                     if (regionMatch(local,n,"SH") ||
282                         regionMatch(local,n,"SIO") ||
283                         regionMatch(local,n,"SIA")) {
284                         code.append('X');
285                     } else {
286                         code.append('S');
287                     }
288                     break;
289                 case 'T':
290                     if (regionMatch(local,n,"TIA") ||
291                         regionMatch(local,n,"TIO")) {
292                         code.append('X');
293                         break;
294                     }
295                     if (regionMatch(local,n,"TCH")) {
296                         // Silent if in "TCH"
297                         break;
298                     }
299                     // substitute numeral 0 for TH (resembles theta after all)
300                     if (regionMatch(local,n,"TH")) {
301                         code.append('0');
302                     } else {
303                         code.append('T');
304                     }
305                     break;
306                 case 'V':
307                     code.append('F'); break;
308                 case 'W':
309                 case 'Y': // silent if not followed by vowel
310                     if (!isLastChar(wdsz,n) &&
311                         isVowel(local,n+1)) {
312                         code.append(symb);
313                     }
314                     break;
315                 case 'X':
316                     code.append('K');
317                     code.append('S');
318                     break;
319                 case 'Z':
320                     code.append('S');
321                     break;
322                 default:
323                     // do nothing
324                     break;
325                 } // end switch
326                 n++;
327             } // end else from symb != 'C'
328             if (code.length() > this.getMaxCodeLen()) {
329                 code.setLength(this.getMaxCodeLen());
330             }
331         }
332         return code.toString();
333     }
334 
335     private boolean isVowel(final StringBuilder string, final int index) {
336         return VOWELS.indexOf(string.charAt(index)) >= 0;
337     }
338 
339     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
340         boolean matches = false;
341         if( index > 0 &&
342             index < string.length() ) {
343             matches = string.charAt(index - 1) == c;
344         }
345         return matches;
346     }
347 
348     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
349         boolean matches = false;
350         if( index >= 0 &&
351             index < string.length() - 1 ) {
352             matches = string.charAt(index + 1) == c;
353         }
354         return matches;
355     }
356 
357     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
358         boolean matches = false;
359         if( index >= 0 &&
360             index + test.length() - 1 < string.length() ) {
361             final String substring = string.substring( index, index + test.length());
362             matches = substring.equals( test );
363         }
364         return matches;
365     }
366 
367     private boolean isLastChar(final int wdsz, final int n) {
368         return n + 1 == wdsz;
369     }
370 
371 
372     /**
373      * Encodes an Object using the metaphone algorithm.  This method
374      * is provided in order to satisfy the requirements of the
375      * Encoder interface, and will throw an EncoderException if the
376      * supplied object is not of type java.lang.String.
377      *
378      * @param obj Object to encode
379      * @return An object (or type java.lang.String) containing the
380      *         metaphone code which corresponds to the String supplied.
381      * @throws EncoderException if the parameter supplied is not
382      *                          of type java.lang.String
383      */
384     @Override
385     public Object encode(final Object obj) throws EncoderException {
386         if (!(obj instanceof String)) {
387             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
388         }
389         return metaphone((String) obj);
390     }
391 
392     /**
393      * Encodes a String using the Metaphone algorithm.
394      *
395      * @param str String object to encode
396      * @return The metaphone code corresponding to the String supplied
397      */
398     @Override
399     public String encode(final String str) {
400         return metaphone(str);
401     }
402 
403     /**
404      * Tests is the metaphones of two strings are identical.
405      *
406      * @param str1 First of two strings to compare
407      * @param str2 Second of two strings to compare
408      * @return {@code true} if the metaphones of these strings are identical,
409      *        {@code false} otherwise.
410      */
411     public boolean isMetaphoneEqual(final String str1, final String str2) {
412         return metaphone(str1).equals(metaphone(str2));
413     }
414 
415     /**
416      * Returns the maxCodeLen.
417      * @return int
418      */
419     public int getMaxCodeLen() { return this.maxCodeLen; }
420 
421     /**
422      * Sets the maxCodeLen.
423      * @param maxCodeLen The maxCodeLen to set
424      */
425     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
426 
427 }