001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * <p> 029 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 030 * p 39.</CITE> 031 * <p> 032 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 033 * </p> 034 * <ul> 035 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </li> 036 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </li> 037 * </ul> 038 * <p> 039 * They have had undocumented changes from the originally published algorithm. 040 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 041 * <p> 042 * This class is conditionally thread-safe. 043 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} 044 * but is not volatile, and accesses are not synchronized. 045 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 046 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 047 * after initial setup. 048 * 049 * @version $Id: Metaphone.java 1477596 2013-04-30 12:42:29Z ggregory $ 050 */ 051public class Metaphone implements StringEncoder { 052 053 /** 054 * Five values in the English language 055 */ 056 private static final String VOWELS = "AEIOU"; 057 058 /** 059 * Variable used in Metaphone algorithm 060 */ 061 private static final String FRONTV = "EIY"; 062 063 /** 064 * Variable used in Metaphone algorithm 065 */ 066 private static final String VARSON = "CSPTG"; 067 068 /** 069 * The max code length for metaphone is 4 070 */ 071 private int maxCodeLen = 4; 072 073 /** 074 * Creates an instance of the Metaphone encoder 075 */ 076 public Metaphone() { 077 super(); 078 } 079 080 /** 081 * Find the metaphone value of a String. This is similar to the 082 * soundex algorithm, but better at finding similar sounding words. 083 * All input is converted to upper case. 084 * Limitations: Input format is expected to be a single ASCII word 085 * with only characters in the A - Z range, no punctuation or numbers. 086 * 087 * @param txt String to find the metaphone code for 088 * @return A metaphone code corresponding to the String supplied 089 */ 090 public String metaphone(final String txt) { 091 boolean hard = false; 092 if (txt == null || txt.length() == 0) { 093 return ""; 094 } 095 // single character is itself 096 if (txt.length() == 1) { 097 return txt.toUpperCase(java.util.Locale.ENGLISH); 098 } 099 100 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 101 102 final StringBuilder local = new StringBuilder(40); // manipulate 103 final StringBuilder code = new StringBuilder(10); // output 104 // handle initial 2 characters exceptions 105 switch(inwd[0]) { 106 case 'K': 107 case 'G': 108 case 'P': /* looking for KN, etc*/ 109 if (inwd[1] == 'N') { 110 local.append(inwd, 1, inwd.length - 1); 111 } else { 112 local.append(inwd); 113 } 114 break; 115 case 'A': /* looking for AE */ 116 if (inwd[1] == 'E') { 117 local.append(inwd, 1, inwd.length - 1); 118 } else { 119 local.append(inwd); 120 } 121 break; 122 case 'W': /* looking for WR or WH */ 123 if (inwd[1] == 'R') { // WR -> R 124 local.append(inwd, 1, inwd.length - 1); 125 break; 126 } 127 if (inwd[1] == 'H') { 128 local.append(inwd, 1, inwd.length - 1); 129 local.setCharAt(0, 'W'); // WH -> W 130 } else { 131 local.append(inwd); 132 } 133 break; 134 case 'X': /* initial X becomes S */ 135 inwd[0] = 'S'; 136 local.append(inwd); 137 break; 138 default: 139 local.append(inwd); 140 } // now local has working string with initials fixed 141 142 final int wdsz = local.length(); 143 int n = 0; 144 145 while (code.length() < this.getMaxCodeLen() && 146 n < wdsz ) { // max code size of 4 works well 147 final char symb = local.charAt(n); 148 // remove duplicate letters except C 149 if (symb != 'C' && isPreviousChar( local, n, symb ) ) { 150 n++; 151 } else { // not dup 152 switch(symb) { 153 case 'A': 154 case 'E': 155 case 'I': 156 case 'O': 157 case 'U': 158 if (n == 0) { 159 code.append(symb); 160 } 161 break; // only use vowel if leading char 162 case 'B': 163 if ( isPreviousChar(local, n, 'M') && 164 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 165 break; 166 } 167 code.append(symb); 168 break; 169 case 'C': // lots of C special cases 170 /* discard if SCI, SCE or SCY */ 171 if ( isPreviousChar(local, n, 'S') && 172 !isLastChar(wdsz, n) && 173 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { 174 break; 175 } 176 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 177 code.append('X'); 178 break; 179 } 180 if (!isLastChar(wdsz, n) && 181 FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 182 code.append('S'); 183 break; // CI,CE,CY -> S 184 } 185 if (isPreviousChar(local, n, 'S') && 186 isNextChar(local, n, 'H') ) { // SCH->sk 187 code.append('K'); 188 break; 189 } 190 if (isNextChar(local, n, 'H')) { // detect CH 191 if (n == 0 && 192 wdsz >= 3 && 193 isVowel(local,2) ) { // CH consonant -> K consonant 194 code.append('K'); 195 } else { 196 code.append('X'); // CHvowel -> X 197 } 198 } else { 199 code.append('K'); 200 } 201 break; 202 case 'D': 203 if (!isLastChar(wdsz, n + 1) && 204 isNextChar(local, n, 'G') && 205 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 206 code.append('J'); n += 2; 207 } else { 208 code.append('T'); 209 } 210 break; 211 case 'G': // GH silent at end or before consonant 212 if (isLastChar(wdsz, n + 1) && 213 isNextChar(local, n, 'H')) { 214 break; 215 } 216 if (!isLastChar(wdsz, n + 1) && 217 isNextChar(local,n,'H') && 218 !isVowel(local,n+2)) { 219 break; 220 } 221 if (n > 0 && 222 ( regionMatch(local, n, "GN") || 223 regionMatch(local, n, "GNED") ) ) { 224 break; // silent G 225 } 226 if (isPreviousChar(local, n, 'G')) { 227 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 228 hard = true; 229 } else { 230 hard = false; 231 } 232 if (!isLastChar(wdsz, n) && 233 FRONTV.indexOf(local.charAt(n + 1)) >= 0 && 234 !hard) { 235 code.append('J'); 236 } else { 237 code.append('K'); 238 } 239 break; 240 case 'H': 241 if (isLastChar(wdsz, n)) { 242 break; // terminal H 243 } 244 if (n > 0 && 245 VARSON.indexOf(local.charAt(n - 1)) >= 0) { 246 break; 247 } 248 if (isVowel(local,n+1)) { 249 code.append('H'); // Hvowel 250 } 251 break; 252 case 'F': 253 case 'J': 254 case 'L': 255 case 'M': 256 case 'N': 257 case 'R': 258 code.append(symb); 259 break; 260 case 'K': 261 if (n > 0) { // not initial 262 if (!isPreviousChar(local, n, 'C')) { 263 code.append(symb); 264 } 265 } else { 266 code.append(symb); // initial K 267 } 268 break; 269 case 'P': 270 if (isNextChar(local,n,'H')) { 271 // PH -> F 272 code.append('F'); 273 } else { 274 code.append(symb); 275 } 276 break; 277 case 'Q': 278 code.append('K'); 279 break; 280 case 'S': 281 if (regionMatch(local,n,"SH") || 282 regionMatch(local,n,"SIO") || 283 regionMatch(local,n,"SIA")) { 284 code.append('X'); 285 } else { 286 code.append('S'); 287 } 288 break; 289 case 'T': 290 if (regionMatch(local,n,"TIA") || 291 regionMatch(local,n,"TIO")) { 292 code.append('X'); 293 break; 294 } 295 if (regionMatch(local,n,"TCH")) { 296 // Silent if in "TCH" 297 break; 298 } 299 // substitute numeral 0 for TH (resembles theta after all) 300 if (regionMatch(local,n,"TH")) { 301 code.append('0'); 302 } else { 303 code.append('T'); 304 } 305 break; 306 case 'V': 307 code.append('F'); break; 308 case 'W': 309 case 'Y': // silent if not followed by vowel 310 if (!isLastChar(wdsz,n) && 311 isVowel(local,n+1)) { 312 code.append(symb); 313 } 314 break; 315 case 'X': 316 code.append('K'); 317 code.append('S'); 318 break; 319 case 'Z': 320 code.append('S'); 321 break; 322 default: 323 // do nothing 324 break; 325 } // end switch 326 n++; 327 } // end else from symb != 'C' 328 if (code.length() > this.getMaxCodeLen()) { 329 code.setLength(this.getMaxCodeLen()); 330 } 331 } 332 return code.toString(); 333 } 334 335 private boolean isVowel(final StringBuilder string, final int index) { 336 return VOWELS.indexOf(string.charAt(index)) >= 0; 337 } 338 339 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 340 boolean matches = false; 341 if( index > 0 && 342 index < string.length() ) { 343 matches = string.charAt(index - 1) == c; 344 } 345 return matches; 346 } 347 348 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 349 boolean matches = false; 350 if( index >= 0 && 351 index < string.length() - 1 ) { 352 matches = string.charAt(index + 1) == c; 353 } 354 return matches; 355 } 356 357 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 358 boolean matches = false; 359 if( index >= 0 && 360 index + test.length() - 1 < string.length() ) { 361 final String substring = string.substring( index, index + test.length()); 362 matches = substring.equals( test ); 363 } 364 return matches; 365 } 366 367 private boolean isLastChar(final int wdsz, final int n) { 368 return n + 1 == wdsz; 369 } 370 371 372 /** 373 * Encodes an Object using the metaphone algorithm. This method 374 * is provided in order to satisfy the requirements of the 375 * Encoder interface, and will throw an EncoderException if the 376 * supplied object is not of type java.lang.String. 377 * 378 * @param obj Object to encode 379 * @return An object (or type java.lang.String) containing the 380 * metaphone code which corresponds to the String supplied. 381 * @throws EncoderException if the parameter supplied is not 382 * of type java.lang.String 383 */ 384 @Override 385 public Object encode(final Object obj) throws EncoderException { 386 if (!(obj instanceof String)) { 387 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 388 } 389 return metaphone((String) obj); 390 } 391 392 /** 393 * Encodes a String using the Metaphone algorithm. 394 * 395 * @param str String object to encode 396 * @return The metaphone code corresponding to the String supplied 397 */ 398 @Override 399 public String encode(final String str) { 400 return metaphone(str); 401 } 402 403 /** 404 * Tests is the metaphones of two strings are identical. 405 * 406 * @param str1 First of two strings to compare 407 * @param str2 Second of two strings to compare 408 * @return {@code true} if the metaphones of these strings are identical, 409 * {@code false} otherwise. 410 */ 411 public boolean isMetaphoneEqual(final String str1, final String str2) { 412 return metaphone(str1).equals(metaphone(str2)); 413 } 414 415 /** 416 * Returns the maxCodeLen. 417 * @return int 418 */ 419 public int getMaxCodeLen() { return this.maxCodeLen; } 420 421 /** 422 * Sets the maxCodeLen. 423 * @param maxCodeLen The maxCodeLen to set 424 */ 425 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 426 427}