001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * <p> 029 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 030 * p 39.</CITE> 031 * <p> 032 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 033 * </p> 034 * <ul> 035 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 036 * (broken link 4/30/2013) </li> 037 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 038 * (link checked 4/30/2013) </li> 039 * </ul> 040 * <p> 041 * They have had undocumented changes from the originally published algorithm. 042 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 043 * <p> 044 * This class is conditionally thread-safe. 045 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} 046 * but is not volatile, and accesses are not synchronized. 047 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 048 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 049 * after initial setup. 050 * 051 * @version $Id: Metaphone.html 928559 2014-11-10 02:53:54Z ggregory $ 052 */ 053public class Metaphone implements StringEncoder { 054 055 /** 056 * Five values in the English language 057 */ 058 private static final String VOWELS = "AEIOU"; 059 060 /** 061 * Variable used in Metaphone algorithm 062 */ 063 private static final String FRONTV = "EIY"; 064 065 /** 066 * Variable used in Metaphone algorithm 067 */ 068 private static final String VARSON = "CSPTG"; 069 070 /** 071 * The max code length for metaphone is 4 072 */ 073 private int maxCodeLen = 4; 074 075 /** 076 * Creates an instance of the Metaphone encoder 077 */ 078 public Metaphone() { 079 super(); 080 } 081 082 /** 083 * Find the metaphone value of a String. This is similar to the 084 * soundex algorithm, but better at finding similar sounding words. 085 * All input is converted to upper case. 086 * Limitations: Input format is expected to be a single ASCII word 087 * with only characters in the A - Z range, no punctuation or numbers. 088 * 089 * @param txt String to find the metaphone code for 090 * @return A metaphone code corresponding to the String supplied 091 */ 092 public String metaphone(final String txt) { 093 boolean hard = false; 094 int txtLength; 095 if (txt == null || (txtLength = txt.length()) == 0) { 096 return ""; 097 } 098 // single character is itself 099 if (txtLength == 1) { 100 return txt.toUpperCase(java.util.Locale.ENGLISH); 101 } 102 103 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 104 105 final StringBuilder local = new StringBuilder(40); // manipulate 106 final StringBuilder code = new StringBuilder(10); // output 107 // handle initial 2 characters exceptions 108 switch(inwd[0]) { 109 case 'K': 110 case 'G': 111 case 'P': /* looking for KN, etc*/ 112 if (inwd[1] == 'N') { 113 local.append(inwd, 1, inwd.length - 1); 114 } else { 115 local.append(inwd); 116 } 117 break; 118 case 'A': /* looking for AE */ 119 if (inwd[1] == 'E') { 120 local.append(inwd, 1, inwd.length - 1); 121 } else { 122 local.append(inwd); 123 } 124 break; 125 case 'W': /* looking for WR or WH */ 126 if (inwd[1] == 'R') { // WR -> R 127 local.append(inwd, 1, inwd.length - 1); 128 break; 129 } 130 if (inwd[1] == 'H') { 131 local.append(inwd, 1, inwd.length - 1); 132 local.setCharAt(0, 'W'); // WH -> W 133 } else { 134 local.append(inwd); 135 } 136 break; 137 case 'X': /* initial X becomes S */ 138 inwd[0] = 'S'; 139 local.append(inwd); 140 break; 141 default: 142 local.append(inwd); 143 } // now local has working string with initials fixed 144 145 final int wdsz = local.length(); 146 int n = 0; 147 148 while (code.length() < this.getMaxCodeLen() && 149 n < wdsz ) { // max code size of 4 works well 150 final char symb = local.charAt(n); 151 // remove duplicate letters except C 152 if (symb != 'C' && isPreviousChar( local, n, symb ) ) { 153 n++; 154 } else { // not dup 155 switch(symb) { 156 case 'A': 157 case 'E': 158 case 'I': 159 case 'O': 160 case 'U': 161 if (n == 0) { 162 code.append(symb); 163 } 164 break; // only use vowel if leading char 165 case 'B': 166 if ( isPreviousChar(local, n, 'M') && 167 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 168 break; 169 } 170 code.append(symb); 171 break; 172 case 'C': // lots of C special cases 173 /* discard if SCI, SCE or SCY */ 174 if ( isPreviousChar(local, n, 'S') && 175 !isLastChar(wdsz, n) && 176 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { 177 break; 178 } 179 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 180 code.append('X'); 181 break; 182 } 183 if (!isLastChar(wdsz, n) && 184 FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 185 code.append('S'); 186 break; // CI,CE,CY -> S 187 } 188 if (isPreviousChar(local, n, 'S') && 189 isNextChar(local, n, 'H') ) { // SCH->sk 190 code.append('K'); 191 break; 192 } 193 if (isNextChar(local, n, 'H')) { // detect CH 194 if (n == 0 && 195 wdsz >= 3 && 196 isVowel(local,2) ) { // CH consonant -> K consonant 197 code.append('K'); 198 } else { 199 code.append('X'); // CHvowel -> X 200 } 201 } else { 202 code.append('K'); 203 } 204 break; 205 case 'D': 206 if (!isLastChar(wdsz, n + 1) && 207 isNextChar(local, n, 'G') && 208 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 209 code.append('J'); n += 2; 210 } else { 211 code.append('T'); 212 } 213 break; 214 case 'G': // GH silent at end or before consonant 215 if (isLastChar(wdsz, n + 1) && 216 isNextChar(local, n, 'H')) { 217 break; 218 } 219 if (!isLastChar(wdsz, n + 1) && 220 isNextChar(local,n,'H') && 221 !isVowel(local,n+2)) { 222 break; 223 } 224 if (n > 0 && 225 ( regionMatch(local, n, "GN") || 226 regionMatch(local, n, "GNED") ) ) { 227 break; // silent G 228 } 229 if (isPreviousChar(local, n, 'G')) { 230 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 231 hard = true; 232 } else { 233 hard = false; 234 } 235 if (!isLastChar(wdsz, n) && 236 FRONTV.indexOf(local.charAt(n + 1)) >= 0 && 237 !hard) { 238 code.append('J'); 239 } else { 240 code.append('K'); 241 } 242 break; 243 case 'H': 244 if (isLastChar(wdsz, n)) { 245 break; // terminal H 246 } 247 if (n > 0 && 248 VARSON.indexOf(local.charAt(n - 1)) >= 0) { 249 break; 250 } 251 if (isVowel(local,n+1)) { 252 code.append('H'); // Hvowel 253 } 254 break; 255 case 'F': 256 case 'J': 257 case 'L': 258 case 'M': 259 case 'N': 260 case 'R': 261 code.append(symb); 262 break; 263 case 'K': 264 if (n > 0) { // not initial 265 if (!isPreviousChar(local, n, 'C')) { 266 code.append(symb); 267 } 268 } else { 269 code.append(symb); // initial K 270 } 271 break; 272 case 'P': 273 if (isNextChar(local,n,'H')) { 274 // PH -> F 275 code.append('F'); 276 } else { 277 code.append(symb); 278 } 279 break; 280 case 'Q': 281 code.append('K'); 282 break; 283 case 'S': 284 if (regionMatch(local,n,"SH") || 285 regionMatch(local,n,"SIO") || 286 regionMatch(local,n,"SIA")) { 287 code.append('X'); 288 } else { 289 code.append('S'); 290 } 291 break; 292 case 'T': 293 if (regionMatch(local,n,"TIA") || 294 regionMatch(local,n,"TIO")) { 295 code.append('X'); 296 break; 297 } 298 if (regionMatch(local,n,"TCH")) { 299 // Silent if in "TCH" 300 break; 301 } 302 // substitute numeral 0 for TH (resembles theta after all) 303 if (regionMatch(local,n,"TH")) { 304 code.append('0'); 305 } else { 306 code.append('T'); 307 } 308 break; 309 case 'V': 310 code.append('F'); break; 311 case 'W': 312 case 'Y': // silent if not followed by vowel 313 if (!isLastChar(wdsz,n) && 314 isVowel(local,n+1)) { 315 code.append(symb); 316 } 317 break; 318 case 'X': 319 code.append('K'); 320 code.append('S'); 321 break; 322 case 'Z': 323 code.append('S'); 324 break; 325 default: 326 // do nothing 327 break; 328 } // end switch 329 n++; 330 } // end else from symb != 'C' 331 if (code.length() > this.getMaxCodeLen()) { 332 code.setLength(this.getMaxCodeLen()); 333 } 334 } 335 return code.toString(); 336 } 337 338 private boolean isVowel(final StringBuilder string, final int index) { 339 return VOWELS.indexOf(string.charAt(index)) >= 0; 340 } 341 342 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 343 boolean matches = false; 344 if( index > 0 && 345 index < string.length() ) { 346 matches = string.charAt(index - 1) == c; 347 } 348 return matches; 349 } 350 351 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 352 boolean matches = false; 353 if( index >= 0 && 354 index < string.length() - 1 ) { 355 matches = string.charAt(index + 1) == c; 356 } 357 return matches; 358 } 359 360 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 361 boolean matches = false; 362 if( index >= 0 && 363 index + test.length() - 1 < string.length() ) { 364 final String substring = string.substring( index, index + test.length()); 365 matches = substring.equals( test ); 366 } 367 return matches; 368 } 369 370 private boolean isLastChar(final int wdsz, final int n) { 371 return n + 1 == wdsz; 372 } 373 374 375 /** 376 * Encodes an Object using the metaphone algorithm. This method 377 * is provided in order to satisfy the requirements of the 378 * Encoder interface, and will throw an EncoderException if the 379 * supplied object is not of type java.lang.String. 380 * 381 * @param obj Object to encode 382 * @return An object (or type java.lang.String) containing the 383 * metaphone code which corresponds to the String supplied. 384 * @throws EncoderException if the parameter supplied is not 385 * of type java.lang.String 386 */ 387 @Override 388 public Object encode(final Object obj) throws EncoderException { 389 if (!(obj instanceof String)) { 390 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 391 } 392 return metaphone((String) obj); 393 } 394 395 /** 396 * Encodes a String using the Metaphone algorithm. 397 * 398 * @param str String object to encode 399 * @return The metaphone code corresponding to the String supplied 400 */ 401 @Override 402 public String encode(final String str) { 403 return metaphone(str); 404 } 405 406 /** 407 * Tests is the metaphones of two strings are identical. 408 * 409 * @param str1 First of two strings to compare 410 * @param str2 Second of two strings to compare 411 * @return <code>true</code> if the metaphones of these strings are identical, 412 * <code>false</code> otherwise. 413 */ 414 public boolean isMetaphoneEqual(final String str1, final String str2) { 415 return metaphone(str1).equals(metaphone(str2)); 416 } 417 418 /** 419 * Returns the maxCodeLen. 420 * @return int 421 */ 422 public int getMaxCodeLen() { return this.maxCodeLen; } 423 424 /** 425 * Sets the maxCodeLen. 426 * @param maxCodeLen The maxCodeLen to set 427 */ 428 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 429 430}