View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */ 
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a metaphone value. 
25   * <p>
26   * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 
27   * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28   * </p>
29   * <p>
30   * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
31   * 39.</CITE>
32   * </p>
33   * 
34   * @author Apache Software Foundation
35   * @version $Id: Metaphone.java 582447 2007-10-06 04:14:43Z bayard $
36   */
37  public class Metaphone implements StringEncoder {
38  
39      /**
40       * Five values in the English language 
41       */
42      private static final String VOWELS = "AEIOU" ;
43  
44      /**
45       * Variable used in Metaphone algorithm
46       */
47      private static final String FRONTV = "EIY"   ;
48  
49      /**
50       * Variable used in Metaphone algorithm
51       */
52      private static final String VARSON = "CSPTG" ;
53  
54      /**
55       * The max code length for metaphone is 4
56       */
57      private int maxCodeLen = 4 ;
58  
59      /**
60       * Creates an instance of the Metaphone encoder
61       */
62      public Metaphone() {
63          super();
64      }
65  
66      /**
67       * Find the metaphone value of a String. This is similar to the
68       * soundex algorithm, but better at finding similar sounding words.
69       * All input is converted to upper case.
70       * Limitations: Input format is expected to be a single ASCII word
71       * with only characters in the A - Z range, no punctuation or numbers.
72       *
73       * @param txt String to find the metaphone code for
74       * @return A metaphone code corresponding to the String supplied
75       */
76      public String metaphone(String txt) {
77          boolean hard = false ;
78          if ((txt == null) || (txt.length() == 0)) {
79              return "" ;
80          }
81          // single character is itself
82          if (txt.length() == 1) {
83              return txt.toUpperCase() ;
84          }
85        
86          char[] inwd = txt.toUpperCase().toCharArray() ;
87        
88          StringBuffer local = new StringBuffer(40); // manipulate
89          StringBuffer code = new StringBuffer(10) ; //   output
90          // handle initial 2 characters exceptions
91          switch(inwd[0]) {
92          case 'K' : 
93          case 'G' : 
94          case 'P' : /* looking for KN, etc*/
95              if (inwd[1] == 'N') {
96                  local.append(inwd, 1, inwd.length - 1);
97              } else {
98                  local.append(inwd);
99              }
100             break;
101         case 'A': /* looking for AE */
102             if (inwd[1] == 'E') {
103                 local.append(inwd, 1, inwd.length - 1);
104             } else {
105                 local.append(inwd);
106             }
107             break;
108         case 'W' : /* looking for WR or WH */
109             if (inwd[1] == 'R') {   // WR -> R
110                 local.append(inwd, 1, inwd.length - 1); 
111                 break ;
112             }
113             if (inwd[1] == 'H') {
114                 local.append(inwd, 1, inwd.length - 1);
115                 local.setCharAt(0, 'W'); // WH -> W
116             } else {
117                 local.append(inwd);
118             }
119             break;
120         case 'X' : /* initial X becomes S */
121             inwd[0] = 'S';
122             local.append(inwd);
123             break ;
124         default :
125             local.append(inwd);
126         } // now local has working string with initials fixed
127 
128         int wdsz = local.length();
129         int n = 0 ;
130 
131         while ((code.length() < this.getMaxCodeLen()) && 
132         	   (n < wdsz) ) { // max code size of 4 works well
133             char symb = local.charAt(n) ;
134             // remove duplicate letters except C
135             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
136                 n++ ;
137             } else { // not dup
138                 switch(symb) {
139                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
140                     if (n == 0) { 
141                         code.append(symb);
142                     }
143                     break ; // only use vowel if leading char
144                 case 'B' :
145                     if ( isPreviousChar(local, n, 'M') && 
146                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
147 						break;
148                     }
149                     code.append(symb);
150                     break;
151                 case 'C' : // lots of C special cases
152                     /* discard if SCI, SCE or SCY */
153                     if ( isPreviousChar(local, n, 'S') && 
154                          !isLastChar(wdsz, n) && 
155                          (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { 
156                         break;
157                     }
158                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
159                         code.append('X'); 
160                         break;
161                     }
162                     if (!isLastChar(wdsz, n) && 
163                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
164                         code.append('S');
165                         break; // CI,CE,CY -> S
166                     }
167                     if (isPreviousChar(local, n, 'S') &&
168 						isNextChar(local, n, 'H') ) { // SCH->sk
169                         code.append('K') ; 
170                         break ;
171                     }
172                     if (isNextChar(local, n, 'H')) { // detect CH
173                         if ((n == 0) && 
174                         	(wdsz >= 3) && 
175                             isVowel(local,2) ) { // CH consonant -> K consonant
176                             code.append('K');
177                         } else { 
178                             code.append('X'); // CHvowel -> X
179                         }
180                     } else { 
181                         code.append('K');
182                     }
183                     break ;
184                 case 'D' :
185                     if (!isLastChar(wdsz, n + 1) && 
186                         isNextChar(local, n, 'G') && 
187                         (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
188                         code.append('J'); n += 2 ;
189                     } else { 
190                         code.append('T');
191                     }
192                     break ;
193                 case 'G' : // GH silent at end or before consonant
194                     if (isLastChar(wdsz, n + 1) && 
195                         isNextChar(local, n, 'H')) {
196                         break;
197                     }
198                     if (!isLastChar(wdsz, n + 1) &&  
199                         isNextChar(local,n,'H') && 
200                         !isVowel(local,n+2)) {
201                         break;
202                     }
203                     if ((n > 0) && 
204                     	( regionMatch(local, n, "GN") ||
205 					      regionMatch(local, n, "GNED") ) ) {
206                         break; // silent G
207                     }
208                     if (isPreviousChar(local, n, 'G')) {
209                         hard = true ;
210                     } else {
211                         hard = false ;
212                     }
213                     if (!isLastChar(wdsz, n) && 
214                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && 
215                         (!hard)) {
216                         code.append('J');
217                     } else {
218                         code.append('K');
219                     }
220                     break ;
221                 case 'H':
222                     if (isLastChar(wdsz, n)) {
223                         break ; // terminal H
224                     }
225                     if ((n > 0) && 
226                         (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
227                         break;
228                     }
229                     if (isVowel(local,n+1)) {
230                         code.append('H'); // Hvowel
231                     }
232                     break;
233                 case 'F': 
234                 case 'J' : 
235                 case 'L' :
236                 case 'M': 
237                 case 'N' : 
238                 case 'R' :
239                     code.append(symb); 
240                     break;
241                 case 'K' :
242                     if (n > 0) { // not initial
243                         if (!isPreviousChar(local, n, 'C')) {
244                             code.append(symb);
245                         }
246                     } else {
247                         code.append(symb); // initial K
248                     }
249                     break ;
250                 case 'P' :
251                     if (isNextChar(local,n,'H')) {
252                         // PH -> F
253                         code.append('F');
254                     } else {
255                         code.append(symb);
256                     }
257                     break ;
258                 case 'Q' :
259                     code.append('K');
260                     break;
261                 case 'S' :
262                     if (regionMatch(local,n,"SH") || 
263 					    regionMatch(local,n,"SIO") || 
264 					    regionMatch(local,n,"SIA")) {
265                         code.append('X');
266                     } else {
267                         code.append('S');
268                     }
269                     break;
270                 case 'T' :
271                     if (regionMatch(local,n,"TIA") || 
272 						regionMatch(local,n,"TIO")) {
273                         code.append('X'); 
274                         break;
275                     }
276                     if (regionMatch(local,n,"TCH")) {
277 						// Silent if in "TCH"
278                         break;
279                     }
280                     // substitute numeral 0 for TH (resembles theta after all)
281                     if (regionMatch(local,n,"TH")) {
282                         code.append('0');
283                     } else {
284                         code.append('T');
285                     }
286                     break ;
287                 case 'V' :
288                     code.append('F'); break ;
289                 case 'W' : case 'Y' : // silent if not followed by vowel
290                     if (!isLastChar(wdsz,n) && 
291                     	isVowel(local,n+1)) {
292                         code.append(symb);
293                     }
294                     break ;
295                 case 'X' :
296                     code.append('K'); code.append('S');
297                     break ;
298                 case 'Z' :
299                     code.append('S'); break ;
300                 } // end switch
301                 n++ ;
302             } // end else from symb != 'C'
303             if (code.length() > this.getMaxCodeLen()) { 
304             	code.setLength(this.getMaxCodeLen()); 
305             }
306         }
307         return code.toString();
308     }
309 
310 	private boolean isVowel(StringBuffer string, int index) {
311 		return VOWELS.indexOf(string.charAt(index)) >= 0;
312 	}
313 
314 	private boolean isPreviousChar(StringBuffer string, int index, char c) {
315 		boolean matches = false;
316 		if( index > 0 &&
317 		    index < string.length() ) {
318 			matches = string.charAt(index - 1) == c;
319 		}
320 		return matches;
321 	}
322 
323 	private boolean isNextChar(StringBuffer string, int index, char c) {
324 		boolean matches = false;
325 		if( index >= 0 &&
326 		    index < string.length() - 1 ) {
327 			matches = string.charAt(index + 1) == c;
328 		}
329 		return matches;
330 	}
331 
332 	private boolean regionMatch(StringBuffer string, int index, String test) {
333 		boolean matches = false;
334 		if( index >= 0 &&
335 		    (index + test.length() - 1) < string.length() ) {
336 			String substring = string.substring( index, index + test.length());
337 			matches = substring.equals( test );
338 		}
339 		return matches;
340 	}
341 
342 	private boolean isLastChar(int wdsz, int n) {
343 		return n + 1 == wdsz;
344 	} 
345     
346     
347     /**
348      * Encodes an Object using the metaphone algorithm.  This method
349      * is provided in order to satisfy the requirements of the
350      * Encoder interface, and will throw an EncoderException if the
351      * supplied object is not of type java.lang.String.
352      *
353      * @param pObject Object to encode
354      * @return An object (or type java.lang.String) containing the 
355      *         metaphone code which corresponds to the String supplied.
356      * @throws EncoderException if the parameter supplied is not
357      *                          of type java.lang.String
358      */
359     public Object encode(Object pObject) throws EncoderException {
360         if (!(pObject instanceof java.lang.String)) {
361             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 
362         }
363         return metaphone((String) pObject);
364     }
365 
366     /**
367      * Encodes a String using the Metaphone algorithm. 
368      *
369      * @param pString String object to encode
370      * @return The metaphone code corresponding to the String supplied
371      */
372     public String encode(String pString) {
373         return metaphone(pString);   
374     }
375 
376     /**
377      * Tests is the metaphones of two strings are identical.
378      *
379      * @param str1 First of two strings to compare
380      * @param str2 Second of two strings to compare
381      * @return <code>true</code> if the metaphones of these strings are identical, 
382      *        <code>false</code> otherwise.
383      */
384     public boolean isMetaphoneEqual(String str1, String str2) {
385         return metaphone(str1).equals(metaphone(str2));
386     }
387 
388     /**
389      * Returns the maxCodeLen.
390      * @return int
391      */
392     public int getMaxCodeLen() { return this.maxCodeLen; }
393 
394     /**
395      * Sets the maxCodeLen.
396      * @param maxCodeLen The maxCodeLen to set
397      */
398     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
399 
400 }