001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language; 019 020 import org.apache.commons.codec.EncoderException; 021 import org.apache.commons.codec.StringEncoder; 022 023 /** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * <p> 029 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 030 * p 39.</CITE> 031 * <p> 032 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl 033 * <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>. 034 * They have had undocumented changes from the originally published algorithm. 035 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 036 * <p> 037 * This class is conditionally thread-safe. 038 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} 039 * but is not volatile, and accesses are not synchronised. 040 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronisation 041 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 042 * after initial setup. 043 * 044 * @version $Id: Metaphone.html 889935 2013-12-11 05:05:13Z ggregory $ 045 */ 046 public class Metaphone implements StringEncoder { 047 048 /** 049 * Five values in the English language 050 */ 051 private static final String VOWELS = "AEIOU"; 052 053 /** 054 * Variable used in Metaphone algorithm 055 */ 056 private static final String FRONTV = "EIY"; 057 058 /** 059 * Variable used in Metaphone algorithm 060 */ 061 private static final String VARSON = "CSPTG"; 062 063 /** 064 * The max code length for metaphone is 4 065 */ 066 private int maxCodeLen = 4; 067 068 /** 069 * Creates an instance of the Metaphone encoder 070 */ 071 public Metaphone() { 072 super(); 073 } 074 075 /** 076 * Find the metaphone value of a String. This is similar to the 077 * soundex algorithm, but better at finding similar sounding words. 078 * All input is converted to upper case. 079 * Limitations: Input format is expected to be a single ASCII word 080 * with only characters in the A - Z range, no punctuation or numbers. 081 * 082 * @param txt String to find the metaphone code for 083 * @return A metaphone code corresponding to the String supplied 084 */ 085 public String metaphone(final String txt) { 086 boolean hard = false; 087 if (txt == null || txt.length() == 0) { 088 return ""; 089 } 090 // single character is itself 091 if (txt.length() == 1) { 092 return txt.toUpperCase(java.util.Locale.ENGLISH); 093 } 094 095 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 096 097 final StringBuilder local = new StringBuilder(40); // manipulate 098 final StringBuilder code = new StringBuilder(10); // output 099 // handle initial 2 characters exceptions 100 switch(inwd[0]) { 101 case 'K': 102 case 'G': 103 case 'P': /* looking for KN, etc*/ 104 if (inwd[1] == 'N') { 105 local.append(inwd, 1, inwd.length - 1); 106 } else { 107 local.append(inwd); 108 } 109 break; 110 case 'A': /* looking for AE */ 111 if (inwd[1] == 'E') { 112 local.append(inwd, 1, inwd.length - 1); 113 } else { 114 local.append(inwd); 115 } 116 break; 117 case 'W': /* looking for WR or WH */ 118 if (inwd[1] == 'R') { // WR -> R 119 local.append(inwd, 1, inwd.length - 1); 120 break; 121 } 122 if (inwd[1] == 'H') { 123 local.append(inwd, 1, inwd.length - 1); 124 local.setCharAt(0, 'W'); // WH -> W 125 } else { 126 local.append(inwd); 127 } 128 break; 129 case 'X': /* initial X becomes S */ 130 inwd[0] = 'S'; 131 local.append(inwd); 132 break; 133 default: 134 local.append(inwd); 135 } // now local has working string with initials fixed 136 137 final int wdsz = local.length(); 138 int n = 0; 139 140 while (code.length() < this.getMaxCodeLen() && 141 n < wdsz ) { // max code size of 4 works well 142 final char symb = local.charAt(n); 143 // remove duplicate letters except C 144 if (symb != 'C' && isPreviousChar( local, n, symb ) ) { 145 n++; 146 } else { // not dup 147 switch(symb) { 148 case 'A': 149 case 'E': 150 case 'I': 151 case 'O': 152 case 'U': 153 if (n == 0) { 154 code.append(symb); 155 } 156 break; // only use vowel if leading char 157 case 'B': 158 if ( isPreviousChar(local, n, 'M') && 159 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 160 break; 161 } 162 code.append(symb); 163 break; 164 case 'C': // lots of C special cases 165 /* discard if SCI, SCE or SCY */ 166 if ( isPreviousChar(local, n, 'S') && 167 !isLastChar(wdsz, n) && 168 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { 169 break; 170 } 171 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 172 code.append('X'); 173 break; 174 } 175 if (!isLastChar(wdsz, n) && 176 FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 177 code.append('S'); 178 break; // CI,CE,CY -> S 179 } 180 if (isPreviousChar(local, n, 'S') && 181 isNextChar(local, n, 'H') ) { // SCH->sk 182 code.append('K'); 183 break; 184 } 185 if (isNextChar(local, n, 'H')) { // detect CH 186 if (n == 0 && 187 wdsz >= 3 && 188 isVowel(local,2) ) { // CH consonant -> K consonant 189 code.append('K'); 190 } else { 191 code.append('X'); // CHvowel -> X 192 } 193 } else { 194 code.append('K'); 195 } 196 break; 197 case 'D': 198 if (!isLastChar(wdsz, n + 1) && 199 isNextChar(local, n, 'G') && 200 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 201 code.append('J'); n += 2; 202 } else { 203 code.append('T'); 204 } 205 break; 206 case 'G': // GH silent at end or before consonant 207 if (isLastChar(wdsz, n + 1) && 208 isNextChar(local, n, 'H')) { 209 break; 210 } 211 if (!isLastChar(wdsz, n + 1) && 212 isNextChar(local,n,'H') && 213 !isVowel(local,n+2)) { 214 break; 215 } 216 if (n > 0 && 217 ( regionMatch(local, n, "GN") || 218 regionMatch(local, n, "GNED") ) ) { 219 break; // silent G 220 } 221 if (isPreviousChar(local, n, 'G')) { 222 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 223 hard = true; 224 } else { 225 hard = false; 226 } 227 if (!isLastChar(wdsz, n) && 228 FRONTV.indexOf(local.charAt(n + 1)) >= 0 && 229 !hard) { 230 code.append('J'); 231 } else { 232 code.append('K'); 233 } 234 break; 235 case 'H': 236 if (isLastChar(wdsz, n)) { 237 break; // terminal H 238 } 239 if (n > 0 && 240 VARSON.indexOf(local.charAt(n - 1)) >= 0) { 241 break; 242 } 243 if (isVowel(local,n+1)) { 244 code.append('H'); // Hvowel 245 } 246 break; 247 case 'F': 248 case 'J': 249 case 'L': 250 case 'M': 251 case 'N': 252 case 'R': 253 code.append(symb); 254 break; 255 case 'K': 256 if (n > 0) { // not initial 257 if (!isPreviousChar(local, n, 'C')) { 258 code.append(symb); 259 } 260 } else { 261 code.append(symb); // initial K 262 } 263 break; 264 case 'P': 265 if (isNextChar(local,n,'H')) { 266 // PH -> F 267 code.append('F'); 268 } else { 269 code.append(symb); 270 } 271 break; 272 case 'Q': 273 code.append('K'); 274 break; 275 case 'S': 276 if (regionMatch(local,n,"SH") || 277 regionMatch(local,n,"SIO") || 278 regionMatch(local,n,"SIA")) { 279 code.append('X'); 280 } else { 281 code.append('S'); 282 } 283 break; 284 case 'T': 285 if (regionMatch(local,n,"TIA") || 286 regionMatch(local,n,"TIO")) { 287 code.append('X'); 288 break; 289 } 290 if (regionMatch(local,n,"TCH")) { 291 // Silent if in "TCH" 292 break; 293 } 294 // substitute numeral 0 for TH (resembles theta after all) 295 if (regionMatch(local,n,"TH")) { 296 code.append('0'); 297 } else { 298 code.append('T'); 299 } 300 break; 301 case 'V': 302 code.append('F'); break; 303 case 'W': 304 case 'Y': // silent if not followed by vowel 305 if (!isLastChar(wdsz,n) && 306 isVowel(local,n+1)) { 307 code.append(symb); 308 } 309 break; 310 case 'X': 311 code.append('K'); 312 code.append('S'); 313 break; 314 case 'Z': 315 code.append('S'); 316 break; 317 default: 318 // do nothing 319 break; 320 } // end switch 321 n++; 322 } // end else from symb != 'C' 323 if (code.length() > this.getMaxCodeLen()) { 324 code.setLength(this.getMaxCodeLen()); 325 } 326 } 327 return code.toString(); 328 } 329 330 private boolean isVowel(final StringBuilder string, final int index) { 331 return VOWELS.indexOf(string.charAt(index)) >= 0; 332 } 333 334 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 335 boolean matches = false; 336 if( index > 0 && 337 index < string.length() ) { 338 matches = string.charAt(index - 1) == c; 339 } 340 return matches; 341 } 342 343 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 344 boolean matches = false; 345 if( index >= 0 && 346 index < string.length() - 1 ) { 347 matches = string.charAt(index + 1) == c; 348 } 349 return matches; 350 } 351 352 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 353 boolean matches = false; 354 if( index >= 0 && 355 index + test.length() - 1 < string.length() ) { 356 final String substring = string.substring( index, index + test.length()); 357 matches = substring.equals( test ); 358 } 359 return matches; 360 } 361 362 private boolean isLastChar(final int wdsz, final int n) { 363 return n + 1 == wdsz; 364 } 365 366 367 /** 368 * Encodes an Object using the metaphone algorithm. This method 369 * is provided in order to satisfy the requirements of the 370 * Encoder interface, and will throw an EncoderException if the 371 * supplied object is not of type java.lang.String. 372 * 373 * @param obj Object to encode 374 * @return An object (or type java.lang.String) containing the 375 * metaphone code which corresponds to the String supplied. 376 * @throws EncoderException if the parameter supplied is not 377 * of type java.lang.String 378 */ 379 @Override 380 public Object encode(final Object obj) throws EncoderException { 381 if (!(obj instanceof String)) { 382 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 383 } 384 return metaphone((String) obj); 385 } 386 387 /** 388 * Encodes a String using the Metaphone algorithm. 389 * 390 * @param str String object to encode 391 * @return The metaphone code corresponding to the String supplied 392 */ 393 @Override 394 public String encode(final String str) { 395 return metaphone(str); 396 } 397 398 /** 399 * Tests is the metaphones of two strings are identical. 400 * 401 * @param str1 First of two strings to compare 402 * @param str2 Second of two strings to compare 403 * @return {@code true} if the metaphones of these strings are identical, 404 * {@code false} otherwise. 405 */ 406 public boolean isMetaphoneEqual(final String str1, final String str2) { 407 return metaphone(str1).equals(metaphone(str2)); 408 } 409 410 /** 411 * Returns the maxCodeLen. 412 * @return int 413 */ 414 public int getMaxCodeLen() { return this.maxCodeLen; } 415 416 /** 417 * Sets the maxCodeLen. 418 * @param maxCodeLen The maxCodeLen to set 419 */ 420 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 421 422 }