View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value.
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * <p>
29   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
30   * p 39.</CITE>
31   * <p>
32   * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
33   * </p>
34   * <ul>
35   * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
36   *  (broken link 4/30/2013) </li>
37   * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
38   *  (link checked 4/30/2013) </li>
39   * </ul>
40   * <p>
41   * They have had undocumented changes from the originally published algorithm.
42   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
43   * <p>
44   * This class is conditionally thread-safe.
45   * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
46   * but is not volatile, and accesses are not synchronized.
47   * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
48   * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
49   * after initial setup.
50   *
51   * @version $Id: Metaphone.html 891688 2013-12-24 20:49:46Z ggregory $
52   */
53  public class Metaphone implements StringEncoder {
54  
55      /**
56       * Five values in the English language
57       */
58      private static final String VOWELS = "AEIOU";
59  
60      /**
61       * Variable used in Metaphone algorithm
62       */
63      private static final String FRONTV = "EIY";
64  
65      /**
66       * Variable used in Metaphone algorithm
67       */
68      private static final String VARSON = "CSPTG";
69  
70      /**
71       * The max code length for metaphone is 4
72       */
73      private int maxCodeLen = 4;
74  
75      /**
76       * Creates an instance of the Metaphone encoder
77       */
78      public Metaphone() {
79          super();
80      }
81  
82      /**
83       * Find the metaphone value of a String. This is similar to the
84       * soundex algorithm, but better at finding similar sounding words.
85       * All input is converted to upper case.
86       * Limitations: Input format is expected to be a single ASCII word
87       * with only characters in the A - Z range, no punctuation or numbers.
88       *
89       * @param txt String to find the metaphone code for
90       * @return A metaphone code corresponding to the String supplied
91       */
92      public String metaphone(final String txt) {
93          boolean hard = false;
94          if (txt == null || txt.length() == 0) {
95              return "";
96          }
97          // single character is itself
98          if (txt.length() == 1) {
99              return txt.toUpperCase(java.util.Locale.ENGLISH);
100         }
101 
102         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
103 
104         final StringBuilder local = new StringBuilder(40); // manipulate
105         final StringBuilder code = new StringBuilder(10); //   output
106         // handle initial 2 characters exceptions
107         switch(inwd[0]) {
108         case 'K':
109         case 'G':
110         case 'P': /* looking for KN, etc*/
111             if (inwd[1] == 'N') {
112                 local.append(inwd, 1, inwd.length - 1);
113             } else {
114                 local.append(inwd);
115             }
116             break;
117         case 'A': /* looking for AE */
118             if (inwd[1] == 'E') {
119                 local.append(inwd, 1, inwd.length - 1);
120             } else {
121                 local.append(inwd);
122             }
123             break;
124         case 'W': /* looking for WR or WH */
125             if (inwd[1] == 'R') {   // WR -> R
126                 local.append(inwd, 1, inwd.length - 1);
127                 break;
128             }
129             if (inwd[1] == 'H') {
130                 local.append(inwd, 1, inwd.length - 1);
131                 local.setCharAt(0, 'W'); // WH -> W
132             } else {
133                 local.append(inwd);
134             }
135             break;
136         case 'X': /* initial X becomes S */
137             inwd[0] = 'S';
138             local.append(inwd);
139             break;
140         default:
141             local.append(inwd);
142         } // now local has working string with initials fixed
143 
144         final int wdsz = local.length();
145         int n = 0;
146 
147         while (code.length() < this.getMaxCodeLen() &&
148                n < wdsz ) { // max code size of 4 works well
149             final char symb = local.charAt(n);
150             // remove duplicate letters except C
151             if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
152                 n++;
153             } else { // not dup
154                 switch(symb) {
155                 case 'A':
156                 case 'E':
157                 case 'I':
158                 case 'O':
159                 case 'U':
160                     if (n == 0) {
161                         code.append(symb);
162                     }
163                     break; // only use vowel if leading char
164                 case 'B':
165                     if ( isPreviousChar(local, n, 'M') &&
166                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
167                         break;
168                     }
169                     code.append(symb);
170                     break;
171                 case 'C': // lots of C special cases
172                     /* discard if SCI, SCE or SCY */
173                     if ( isPreviousChar(local, n, 'S') &&
174                          !isLastChar(wdsz, n) &&
175                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
176                         break;
177                     }
178                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
179                         code.append('X');
180                         break;
181                     }
182                     if (!isLastChar(wdsz, n) &&
183                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
184                         code.append('S');
185                         break; // CI,CE,CY -> S
186                     }
187                     if (isPreviousChar(local, n, 'S') &&
188                         isNextChar(local, n, 'H') ) { // SCH->sk
189                         code.append('K');
190                         break;
191                     }
192                     if (isNextChar(local, n, 'H')) { // detect CH
193                         if (n == 0 &&
194                             wdsz >= 3 &&
195                             isVowel(local,2) ) { // CH consonant -> K consonant
196                             code.append('K');
197                         } else {
198                             code.append('X'); // CHvowel -> X
199                         }
200                     } else {
201                         code.append('K');
202                     }
203                     break;
204                 case 'D':
205                     if (!isLastChar(wdsz, n + 1) &&
206                         isNextChar(local, n, 'G') &&
207                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
208                         code.append('J'); n += 2;
209                     } else {
210                         code.append('T');
211                     }
212                     break;
213                 case 'G': // GH silent at end or before consonant
214                     if (isLastChar(wdsz, n + 1) &&
215                         isNextChar(local, n, 'H')) {
216                         break;
217                     }
218                     if (!isLastChar(wdsz, n + 1) &&
219                         isNextChar(local,n,'H') &&
220                         !isVowel(local,n+2)) {
221                         break;
222                     }
223                     if (n > 0 &&
224                         ( regionMatch(local, n, "GN") ||
225                           regionMatch(local, n, "GNED") ) ) {
226                         break; // silent G
227                     }
228                     if (isPreviousChar(local, n, 'G')) {
229                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
230                         hard = true;
231                     } else {
232                         hard = false;
233                     }
234                     if (!isLastChar(wdsz, n) &&
235                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
236                         !hard) {
237                         code.append('J');
238                     } else {
239                         code.append('K');
240                     }
241                     break;
242                 case 'H':
243                     if (isLastChar(wdsz, n)) {
244                         break; // terminal H
245                     }
246                     if (n > 0 &&
247                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
248                         break;
249                     }
250                     if (isVowel(local,n+1)) {
251                         code.append('H'); // Hvowel
252                     }
253                     break;
254                 case 'F':
255                 case 'J':
256                 case 'L':
257                 case 'M':
258                 case 'N':
259                 case 'R':
260                     code.append(symb);
261                     break;
262                 case 'K':
263                     if (n > 0) { // not initial
264                         if (!isPreviousChar(local, n, 'C')) {
265                             code.append(symb);
266                         }
267                     } else {
268                         code.append(symb); // initial K
269                     }
270                     break;
271                 case 'P':
272                     if (isNextChar(local,n,'H')) {
273                         // PH -> F
274                         code.append('F');
275                     } else {
276                         code.append(symb);
277                     }
278                     break;
279                 case 'Q':
280                     code.append('K');
281                     break;
282                 case 'S':
283                     if (regionMatch(local,n,"SH") ||
284                         regionMatch(local,n,"SIO") ||
285                         regionMatch(local,n,"SIA")) {
286                         code.append('X');
287                     } else {
288                         code.append('S');
289                     }
290                     break;
291                 case 'T':
292                     if (regionMatch(local,n,"TIA") ||
293                         regionMatch(local,n,"TIO")) {
294                         code.append('X');
295                         break;
296                     }
297                     if (regionMatch(local,n,"TCH")) {
298                         // Silent if in "TCH"
299                         break;
300                     }
301                     // substitute numeral 0 for TH (resembles theta after all)
302                     if (regionMatch(local,n,"TH")) {
303                         code.append('0');
304                     } else {
305                         code.append('T');
306                     }
307                     break;
308                 case 'V':
309                     code.append('F'); break;
310                 case 'W':
311                 case 'Y': // silent if not followed by vowel
312                     if (!isLastChar(wdsz,n) &&
313                         isVowel(local,n+1)) {
314                         code.append(symb);
315                     }
316                     break;
317                 case 'X':
318                     code.append('K');
319                     code.append('S');
320                     break;
321                 case 'Z':
322                     code.append('S');
323                     break;
324                 default:
325                     // do nothing
326                     break;
327                 } // end switch
328                 n++;
329             } // end else from symb != 'C'
330             if (code.length() > this.getMaxCodeLen()) {
331                 code.setLength(this.getMaxCodeLen());
332             }
333         }
334         return code.toString();
335     }
336 
337     private boolean isVowel(final StringBuilder string, final int index) {
338         return VOWELS.indexOf(string.charAt(index)) >= 0;
339     }
340 
341     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
342         boolean matches = false;
343         if( index > 0 &&
344             index < string.length() ) {
345             matches = string.charAt(index - 1) == c;
346         }
347         return matches;
348     }
349 
350     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
351         boolean matches = false;
352         if( index >= 0 &&
353             index < string.length() - 1 ) {
354             matches = string.charAt(index + 1) == c;
355         }
356         return matches;
357     }
358 
359     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
360         boolean matches = false;
361         if( index >= 0 &&
362             index + test.length() - 1 < string.length() ) {
363             final String substring = string.substring( index, index + test.length());
364             matches = substring.equals( test );
365         }
366         return matches;
367     }
368 
369     private boolean isLastChar(final int wdsz, final int n) {
370         return n + 1 == wdsz;
371     }
372 
373 
374     /**
375      * Encodes an Object using the metaphone algorithm.  This method
376      * is provided in order to satisfy the requirements of the
377      * Encoder interface, and will throw an EncoderException if the
378      * supplied object is not of type java.lang.String.
379      *
380      * @param obj Object to encode
381      * @return An object (or type java.lang.String) containing the
382      *         metaphone code which corresponds to the String supplied.
383      * @throws EncoderException if the parameter supplied is not
384      *                          of type java.lang.String
385      */
386     @Override
387     public Object encode(final Object obj) throws EncoderException {
388         if (!(obj instanceof String)) {
389             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
390         }
391         return metaphone((String) obj);
392     }
393 
394     /**
395      * Encodes a String using the Metaphone algorithm.
396      *
397      * @param str String object to encode
398      * @return The metaphone code corresponding to the String supplied
399      */
400     @Override
401     public String encode(final String str) {
402         return metaphone(str);
403     }
404 
405     /**
406      * Tests is the metaphones of two strings are identical.
407      *
408      * @param str1 First of two strings to compare
409      * @param str2 Second of two strings to compare
410      * @return {@code true} if the metaphones of these strings are identical,
411      *        {@code false} otherwise.
412      */
413     public boolean isMetaphoneEqual(final String str1, final String str2) {
414         return metaphone(str1).equals(metaphone(str2));
415     }
416 
417     /**
418      * Returns the maxCodeLen.
419      * @return int
420      */
421     public int getMaxCodeLen() { return this.maxCodeLen; }
422 
423     /**
424      * Sets the maxCodeLen.
425      * @param maxCodeLen The maxCodeLen to set
426      */
427     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
428 
429 }