View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Metaphone value. 
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * </p>
29   * <p>
30   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
31   * 39.</CITE>
32   * </p>
33   * <p>
34   * Note, that this does not match the algorithm that ships with PHP, or the algorithm 
35   * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
36   * They have had undocumented changes from the originally published algorithm. 
37   * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
38   * </p>
39   * 
40   * @author Apache Software Foundation
41   * @version $Id: Metaphone.java 1157192 2011-08-12 17:27:38Z ggregory $
42   */
43  public class Metaphone implements StringEncoder {
44  
45      /**
46       * Five values in the English language 
47       */
48      private static final String VOWELS = "AEIOU" ;
49  
50      /**
51       * Variable used in Metaphone algorithm
52       */
53      private static final String FRONTV = "EIY"   ;
54  
55      /**
56       * Variable used in Metaphone algorithm
57       */
58      private static final String VARSON = "CSPTG" ;
59  
60      /**
61       * The max code length for metaphone is 4
62       */
63      private int maxCodeLen = 4 ;
64  
65      /**
66       * Creates an instance of the Metaphone encoder
67       */
68      public Metaphone() {
69          super();
70      }
71  
72      /**
73       * Find the metaphone value of a String. This is similar to the
74       * soundex algorithm, but better at finding similar sounding words.
75       * All input is converted to upper case.
76       * Limitations: Input format is expected to be a single ASCII word
77       * with only characters in the A - Z range, no punctuation or numbers.
78       *
79       * @param txt String to find the metaphone code for
80       * @return A metaphone code corresponding to the String supplied
81       */
82      public String metaphone(String txt) {
83          boolean hard = false ;
84          if ((txt == null) || (txt.length() == 0)) {
85              return "" ;
86          }
87          // single character is itself
88          if (txt.length() == 1) {
89              return txt.toUpperCase(java.util.Locale.ENGLISH) ;
90          }
91        
92          char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ;
93        
94          StringBuffer local = new StringBuffer(40); // manipulate
95          StringBuffer code = new StringBuffer(10) ; //   output
96          // handle initial 2 characters exceptions
97          switch(inwd[0]) {
98          case 'K' : 
99          case 'G' : 
100         case 'P' : /* looking for KN, etc*/
101             if (inwd[1] == 'N') {
102                 local.append(inwd, 1, inwd.length - 1);
103             } else {
104                 local.append(inwd);
105             }
106             break;
107         case 'A': /* looking for AE */
108             if (inwd[1] == 'E') {
109                 local.append(inwd, 1, inwd.length - 1);
110             } else {
111                 local.append(inwd);
112             }
113             break;
114         case 'W' : /* looking for WR or WH */
115             if (inwd[1] == 'R') {   // WR -> R
116                 local.append(inwd, 1, inwd.length - 1); 
117                 break ;
118             }
119             if (inwd[1] == 'H') {
120                 local.append(inwd, 1, inwd.length - 1);
121                 local.setCharAt(0, 'W'); // WH -> W
122             } else {
123                 local.append(inwd);
124             }
125             break;
126         case 'X' : /* initial X becomes S */
127             inwd[0] = 'S';
128             local.append(inwd);
129             break ;
130         default :
131             local.append(inwd);
132         } // now local has working string with initials fixed
133 
134         int wdsz = local.length();
135         int n = 0 ;
136 
137         while ((code.length() < this.getMaxCodeLen()) && 
138                (n < wdsz) ) { // max code size of 4 works well
139             char symb = local.charAt(n) ;
140             // remove duplicate letters except C
141             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
142                 n++ ;
143             } else { // not dup
144                 switch(symb) {
145                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
146                     if (n == 0) { 
147                         code.append(symb);
148                     }
149                     break ; // only use vowel if leading char
150                 case 'B' :
151                     if ( isPreviousChar(local, n, 'M') && 
152                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
153                         break;
154                     }
155                     code.append(symb);
156                     break;
157                 case 'C' : // lots of C special cases
158                     /* discard if SCI, SCE or SCY */
159                     if ( isPreviousChar(local, n, 'S') && 
160                          !isLastChar(wdsz, n) && 
161                          (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { 
162                         break;
163                     }
164                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
165                         code.append('X'); 
166                         break;
167                     }
168                     if (!isLastChar(wdsz, n) && 
169                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
170                         code.append('S');
171                         break; // CI,CE,CY -> S
172                     }
173                     if (isPreviousChar(local, n, 'S') &&
174                         isNextChar(local, n, 'H') ) { // SCH->sk
175                         code.append('K') ; 
176                         break ;
177                     }
178                     if (isNextChar(local, n, 'H')) { // detect CH
179                         if ((n == 0) && 
180                             (wdsz >= 3) && 
181                             isVowel(local,2) ) { // CH consonant -> K consonant
182                             code.append('K');
183                         } else { 
184                             code.append('X'); // CHvowel -> X
185                         }
186                     } else { 
187                         code.append('K');
188                     }
189                     break ;
190                 case 'D' :
191                     if (!isLastChar(wdsz, n + 1) && 
192                         isNextChar(local, n, 'G') && 
193                         (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
194                         code.append('J'); n += 2 ;
195                     } else { 
196                         code.append('T');
197                     }
198                     break ;
199                 case 'G' : // GH silent at end or before consonant
200                     if (isLastChar(wdsz, n + 1) && 
201                         isNextChar(local, n, 'H')) {
202                         break;
203                     }
204                     if (!isLastChar(wdsz, n + 1) &&  
205                         isNextChar(local,n,'H') && 
206                         !isVowel(local,n+2)) {
207                         break;
208                     }
209                     if ((n > 0) && 
210                         ( regionMatch(local, n, "GN") ||
211                           regionMatch(local, n, "GNED") ) ) {
212                         break; // silent G
213                     }
214                     if (isPreviousChar(local, n, 'G')) {
215                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
216                         hard = true ;
217                     } else {
218                         hard = false ;
219                     }
220                     if (!isLastChar(wdsz, n) && 
221                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && 
222                         (!hard)) {
223                         code.append('J');
224                     } else {
225                         code.append('K');
226                     }
227                     break ;
228                 case 'H':
229                     if (isLastChar(wdsz, n)) {
230                         break ; // terminal H
231                     }
232                     if ((n > 0) && 
233                         (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
234                         break;
235                     }
236                     if (isVowel(local,n+1)) {
237                         code.append('H'); // Hvowel
238                     }
239                     break;
240                 case 'F': 
241                 case 'J' : 
242                 case 'L' :
243                 case 'M': 
244                 case 'N' : 
245                 case 'R' :
246                     code.append(symb); 
247                     break;
248                 case 'K' :
249                     if (n > 0) { // not initial
250                         if (!isPreviousChar(local, n, 'C')) {
251                             code.append(symb);
252                         }
253                     } else {
254                         code.append(symb); // initial K
255                     }
256                     break ;
257                 case 'P' :
258                     if (isNextChar(local,n,'H')) {
259                         // PH -> F
260                         code.append('F');
261                     } else {
262                         code.append(symb);
263                     }
264                     break ;
265                 case 'Q' :
266                     code.append('K');
267                     break;
268                 case 'S' :
269                     if (regionMatch(local,n,"SH") || 
270                         regionMatch(local,n,"SIO") || 
271                         regionMatch(local,n,"SIA")) {
272                         code.append('X');
273                     } else {
274                         code.append('S');
275                     }
276                     break;
277                 case 'T' :
278                     if (regionMatch(local,n,"TIA") || 
279                         regionMatch(local,n,"TIO")) {
280                         code.append('X'); 
281                         break;
282                     }
283                     if (regionMatch(local,n,"TCH")) {
284                         // Silent if in "TCH"
285                         break;
286                     }
287                     // substitute numeral 0 for TH (resembles theta after all)
288                     if (regionMatch(local,n,"TH")) {
289                         code.append('0');
290                     } else {
291                         code.append('T');
292                     }
293                     break ;
294                 case 'V' :
295                     code.append('F'); break ;
296                 case 'W' : case 'Y' : // silent if not followed by vowel
297                     if (!isLastChar(wdsz,n) && 
298                         isVowel(local,n+1)) {
299                         code.append(symb);
300                     }
301                     break ;
302                 case 'X' :
303                     code.append('K'); code.append('S');
304                     break ;
305                 case 'Z' :
306                     code.append('S'); break ;
307                 } // end switch
308                 n++ ;
309             } // end else from symb != 'C'
310             if (code.length() > this.getMaxCodeLen()) { 
311                 code.setLength(this.getMaxCodeLen()); 
312             }
313         }
314         return code.toString();
315     }
316 
317     private boolean isVowel(StringBuffer string, int index) {
318         return VOWELS.indexOf(string.charAt(index)) >= 0;
319     }
320 
321     private boolean isPreviousChar(StringBuffer string, int index, char c) {
322         boolean matches = false;
323         if( index > 0 &&
324             index < string.length() ) {
325             matches = string.charAt(index - 1) == c;
326         }
327         return matches;
328     }
329 
330     private boolean isNextChar(StringBuffer string, int index, char c) {
331         boolean matches = false;
332         if( index >= 0 &&
333             index < string.length() - 1 ) {
334             matches = string.charAt(index + 1) == c;
335         }
336         return matches;
337     }
338 
339     private boolean regionMatch(StringBuffer string, int index, String test) {
340         boolean matches = false;
341         if( index >= 0 &&
342             (index + test.length() - 1) < string.length() ) {
343             String substring = string.substring( index, index + test.length());
344             matches = substring.equals( test );
345         }
346         return matches;
347     }
348 
349     private boolean isLastChar(int wdsz, int n) {
350         return n + 1 == wdsz;
351     } 
352     
353     
354     /**
355      * Encodes an Object using the metaphone algorithm.  This method
356      * is provided in order to satisfy the requirements of the
357      * Encoder interface, and will throw an EncoderException if the
358      * supplied object is not of type java.lang.String.
359      *
360      * @param pObject Object to encode
361      * @return An object (or type java.lang.String) containing the 
362      *         metaphone code which corresponds to the String supplied.
363      * @throws EncoderException if the parameter supplied is not
364      *                          of type java.lang.String
365      */
366     public Object encode(Object pObject) throws EncoderException {
367         if (!(pObject instanceof String)) {
368             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 
369         }
370         return metaphone((String) pObject);
371     }
372 
373     /**
374      * Encodes a String using the Metaphone algorithm. 
375      *
376      * @param pString String object to encode
377      * @return The metaphone code corresponding to the String supplied
378      */
379     public String encode(String pString) {
380         return metaphone(pString);   
381     }
382 
383     /**
384      * Tests is the metaphones of two strings are identical.
385      *
386      * @param str1 First of two strings to compare
387      * @param str2 Second of two strings to compare
388      * @return <code>true</code> if the metaphones of these strings are identical, 
389      *        <code>false</code> otherwise.
390      */
391     public boolean isMetaphoneEqual(String str1, String str2) {
392         return metaphone(str1).equals(metaphone(str2));
393     }
394 
395     /**
396      * Returns the maxCodeLen.
397      * @return int
398      */
399     public int getMaxCodeLen() { return this.maxCodeLen; }
400 
401     /**
402      * Sets the maxCodeLen.
403      * @param maxCodeLen The maxCodeLen to set
404      */
405     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
406 
407 }