001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * <p> 029 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 030 * p 39.</CITE> 031 * <p> 032 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 033 * </p> 034 * <ul> 035 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 036 * (broken link 4/30/2013) </li> 037 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 038 * (link checked 4/30/2013) </li> 039 * </ul> 040 * <p> 041 * They have had undocumented changes from the originally published algorithm. 042 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 043 * <p> 044 * This class is conditionally thread-safe. 045 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} 046 * but is not volatile, and accesses are not synchronized. 047 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 048 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 049 * after initial setup. 050 * 051 */ 052public class Metaphone implements StringEncoder { 053 054 /** 055 * Five values in the English language 056 */ 057 private static final String VOWELS = "AEIOU"; 058 059 /** 060 * Variable used in Metaphone algorithm 061 */ 062 private static final String FRONTV = "EIY"; 063 064 /** 065 * Variable used in Metaphone algorithm 066 */ 067 private static final String VARSON = "CSPTG"; 068 069 /** 070 * The max code length for metaphone is 4 071 */ 072 private int maxCodeLen = 4; 073 074 /** 075 * Creates an instance of the Metaphone encoder 076 */ 077 public Metaphone() { 078 super(); 079 } 080 081 /** 082 * Find the metaphone value of a String. This is similar to the 083 * soundex algorithm, but better at finding similar sounding words. 084 * All input is converted to upper case. 085 * Limitations: Input format is expected to be a single ASCII word 086 * with only characters in the A - Z range, no punctuation or numbers. 087 * 088 * @param txt String to find the metaphone code for 089 * @return A metaphone code corresponding to the String supplied 090 */ 091 public String metaphone(final String txt) { 092 boolean hard = false; 093 int txtLength; 094 if (txt == null || (txtLength = txt.length()) == 0) { 095 return ""; 096 } 097 // single character is itself 098 if (txtLength == 1) { 099 return txt.toUpperCase(java.util.Locale.ENGLISH); 100 } 101 102 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 103 104 final StringBuilder local = new StringBuilder(40); // manipulate 105 final StringBuilder code = new StringBuilder(10); // output 106 // handle initial 2 characters exceptions 107 switch(inwd[0]) { 108 case 'K': 109 case 'G': 110 case 'P': /* looking for KN, etc*/ 111 if (inwd[1] == 'N') { 112 local.append(inwd, 1, inwd.length - 1); 113 } else { 114 local.append(inwd); 115 } 116 break; 117 case 'A': /* looking for AE */ 118 if (inwd[1] == 'E') { 119 local.append(inwd, 1, inwd.length - 1); 120 } else { 121 local.append(inwd); 122 } 123 break; 124 case 'W': /* looking for WR or WH */ 125 if (inwd[1] == 'R') { // WR -> R 126 local.append(inwd, 1, inwd.length - 1); 127 break; 128 } 129 if (inwd[1] == 'H') { 130 local.append(inwd, 1, inwd.length - 1); 131 local.setCharAt(0, 'W'); // WH -> W 132 } else { 133 local.append(inwd); 134 } 135 break; 136 case 'X': /* initial X becomes S */ 137 inwd[0] = 'S'; 138 local.append(inwd); 139 break; 140 default: 141 local.append(inwd); 142 } // now local has working string with initials fixed 143 144 final int wdsz = local.length(); 145 int n = 0; 146 147 while (code.length() < this.getMaxCodeLen() && 148 n < wdsz ) { // max code size of 4 works well 149 final char symb = local.charAt(n); 150 // remove duplicate letters except C 151 if (symb != 'C' && isPreviousChar( local, n, symb ) ) { 152 n++; 153 } else { // not dup 154 switch(symb) { 155 case 'A': 156 case 'E': 157 case 'I': 158 case 'O': 159 case 'U': 160 if (n == 0) { 161 code.append(symb); 162 } 163 break; // only use vowel if leading char 164 case 'B': 165 if ( isPreviousChar(local, n, 'M') && 166 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 167 break; 168 } 169 code.append(symb); 170 break; 171 case 'C': // lots of C special cases 172 /* discard if SCI, SCE or SCY */ 173 if ( isPreviousChar(local, n, 'S') && 174 !isLastChar(wdsz, n) && 175 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { 176 break; 177 } 178 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 179 code.append('X'); 180 break; 181 } 182 if (!isLastChar(wdsz, n) && 183 FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 184 code.append('S'); 185 break; // CI,CE,CY -> S 186 } 187 if (isPreviousChar(local, n, 'S') && 188 isNextChar(local, n, 'H') ) { // SCH->sk 189 code.append('K'); 190 break; 191 } 192 if (isNextChar(local, n, 'H')) { // detect CH 193 if (n == 0 && 194 wdsz >= 3 && 195 isVowel(local,2) ) { // CH consonant -> K consonant 196 code.append('K'); 197 } else { 198 code.append('X'); // CHvowel -> X 199 } 200 } else { 201 code.append('K'); 202 } 203 break; 204 case 'D': 205 if (!isLastChar(wdsz, n + 1) && 206 isNextChar(local, n, 'G') && 207 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 208 code.append('J'); n += 2; 209 } else { 210 code.append('T'); 211 } 212 break; 213 case 'G': // GH silent at end or before consonant 214 if (isLastChar(wdsz, n + 1) && 215 isNextChar(local, n, 'H')) { 216 break; 217 } 218 if (!isLastChar(wdsz, n + 1) && 219 isNextChar(local,n,'H') && 220 !isVowel(local,n+2)) { 221 break; 222 } 223 if (n > 0 && 224 ( regionMatch(local, n, "GN") || 225 regionMatch(local, n, "GNED") ) ) { 226 break; // silent G 227 } 228 if (isPreviousChar(local, n, 'G')) { 229 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 230 hard = true; 231 } else { 232 hard = false; 233 } 234 if (!isLastChar(wdsz, n) && 235 FRONTV.indexOf(local.charAt(n + 1)) >= 0 && 236 !hard) { 237 code.append('J'); 238 } else { 239 code.append('K'); 240 } 241 break; 242 case 'H': 243 if (isLastChar(wdsz, n)) { 244 break; // terminal H 245 } 246 if (n > 0 && 247 VARSON.indexOf(local.charAt(n - 1)) >= 0) { 248 break; 249 } 250 if (isVowel(local,n+1)) { 251 code.append('H'); // Hvowel 252 } 253 break; 254 case 'F': 255 case 'J': 256 case 'L': 257 case 'M': 258 case 'N': 259 case 'R': 260 code.append(symb); 261 break; 262 case 'K': 263 if (n > 0) { // not initial 264 if (!isPreviousChar(local, n, 'C')) { 265 code.append(symb); 266 } 267 } else { 268 code.append(symb); // initial K 269 } 270 break; 271 case 'P': 272 if (isNextChar(local,n,'H')) { 273 // PH -> F 274 code.append('F'); 275 } else { 276 code.append(symb); 277 } 278 break; 279 case 'Q': 280 code.append('K'); 281 break; 282 case 'S': 283 if (regionMatch(local,n,"SH") || 284 regionMatch(local,n,"SIO") || 285 regionMatch(local,n,"SIA")) { 286 code.append('X'); 287 } else { 288 code.append('S'); 289 } 290 break; 291 case 'T': 292 if (regionMatch(local,n,"TIA") || 293 regionMatch(local,n,"TIO")) { 294 code.append('X'); 295 break; 296 } 297 if (regionMatch(local,n,"TCH")) { 298 // Silent if in "TCH" 299 break; 300 } 301 // substitute numeral 0 for TH (resembles theta after all) 302 if (regionMatch(local,n,"TH")) { 303 code.append('0'); 304 } else { 305 code.append('T'); 306 } 307 break; 308 case 'V': 309 code.append('F'); break; 310 case 'W': 311 case 'Y': // silent if not followed by vowel 312 if (!isLastChar(wdsz,n) && 313 isVowel(local,n+1)) { 314 code.append(symb); 315 } 316 break; 317 case 'X': 318 code.append('K'); 319 code.append('S'); 320 break; 321 case 'Z': 322 code.append('S'); 323 break; 324 default: 325 // do nothing 326 break; 327 } // end switch 328 n++; 329 } // end else from symb != 'C' 330 if (code.length() > this.getMaxCodeLen()) { 331 code.setLength(this.getMaxCodeLen()); 332 } 333 } 334 return code.toString(); 335 } 336 337 private boolean isVowel(final StringBuilder string, final int index) { 338 return VOWELS.indexOf(string.charAt(index)) >= 0; 339 } 340 341 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 342 boolean matches = false; 343 if( index > 0 && 344 index < string.length() ) { 345 matches = string.charAt(index - 1) == c; 346 } 347 return matches; 348 } 349 350 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 351 boolean matches = false; 352 if( index >= 0 && 353 index < string.length() - 1 ) { 354 matches = string.charAt(index + 1) == c; 355 } 356 return matches; 357 } 358 359 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 360 boolean matches = false; 361 if( index >= 0 && 362 index + test.length() - 1 < string.length() ) { 363 final String substring = string.substring( index, index + test.length()); 364 matches = substring.equals( test ); 365 } 366 return matches; 367 } 368 369 private boolean isLastChar(final int wdsz, final int n) { 370 return n + 1 == wdsz; 371 } 372 373 374 /** 375 * Encodes an Object using the metaphone algorithm. This method 376 * is provided in order to satisfy the requirements of the 377 * Encoder interface, and will throw an EncoderException if the 378 * supplied object is not of type java.lang.String. 379 * 380 * @param obj Object to encode 381 * @return An object (or type java.lang.String) containing the 382 * metaphone code which corresponds to the String supplied. 383 * @throws EncoderException if the parameter supplied is not 384 * of type java.lang.String 385 */ 386 @Override 387 public Object encode(final Object obj) throws EncoderException { 388 if (!(obj instanceof String)) { 389 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 390 } 391 return metaphone((String) obj); 392 } 393 394 /** 395 * Encodes a String using the Metaphone algorithm. 396 * 397 * @param str String object to encode 398 * @return The metaphone code corresponding to the String supplied 399 */ 400 @Override 401 public String encode(final String str) { 402 return metaphone(str); 403 } 404 405 /** 406 * Tests is the metaphones of two strings are identical. 407 * 408 * @param str1 First of two strings to compare 409 * @param str2 Second of two strings to compare 410 * @return <code>true</code> if the metaphones of these strings are identical, 411 * <code>false</code> otherwise. 412 */ 413 public boolean isMetaphoneEqual(final String str1, final String str2) { 414 return metaphone(str1).equals(metaphone(str2)); 415 } 416 417 /** 418 * Returns the maxCodeLen. 419 * @return int 420 */ 421 public int getMaxCodeLen() { return this.maxCodeLen; } 422 423 /** 424 * Sets the maxCodeLen. 425 * @param maxCodeLen The maxCodeLen to set 426 */ 427 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 428 429}