001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import java.util.Locale; 021 022import org.apache.commons.codec.EncoderException; 023import org.apache.commons.codec.StringEncoder; 024 025/** 026 * Encodes a string into a Metaphone value. 027 * <p> 028 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 029 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 030 * </p> 031 * <p> 032 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 033 * p 39.</CITE> 034 * </p> 035 * <p> 036 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 037 * </p> 038 * <ul> 039 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 040 * (broken link 4/30/2013) </li> 041 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 042 * (link checked 4/30/2013) </li> 043 * </ul> 044 * <p> 045 * They have had undocumented changes from the originally published algorithm. 046 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 047 * </p> 048 * <p> 049 * This class is conditionally thread-safe. 050 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)} 051 * but is not volatile, and accesses are not synchronized. 052 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 053 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 054 * after initial setup. 055 * </p> 056 */ 057public class Metaphone implements StringEncoder { 058 059 /** 060 * Five values in the English language. 061 */ 062 private static final String VOWELS = "AEIOU"; 063 064 /** 065 * Variable used in Metaphone algorithm. 066 */ 067 private static final String FRONTV = "EIY"; 068 069 /** 070 * Variable used in Metaphone algorithm. 071 */ 072 private static final String VARSON = "CSPTG"; 073 074 /** 075 * The max code length for Metaphone is 4. 076 */ 077 private int maxCodeLen = 4; 078 079 /** 080 * Constructs a new instance. 081 */ 082 public Metaphone() { 083 // empty 084 } 085 086 /** 087 * Encodes an Object using the Metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 088 * EncoderException if the supplied object is not of type {@link String}. 089 * 090 * @param obj Object to encode. 091 * @return An object (or type {@link String}) containing the Metaphone code which corresponds to the String supplied. 092 * @throws EncoderException if the parameter supplied is not of type {@link String}. 093 */ 094 @Override 095 public Object encode(final Object obj) throws EncoderException { 096 if (!(obj instanceof String)) { 097 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 098 } 099 return metaphone((String) obj); 100 } 101 102 /** 103 * Encodes a String using the Metaphone algorithm. 104 * 105 * @param str String object to encode. 106 * @return The Metaphone code corresponding to the String supplied. 107 */ 108 @Override 109 public String encode(final String str) { 110 return metaphone(str); 111 } 112 113 /** 114 * Gets the maxCodeLen. 115 * 116 * @return the maxCodeLen. 117 */ 118 public int getMaxCodeLen() { 119 return this.maxCodeLen; 120 } 121 122 private boolean isLastChar(final int wdsz, final int n) { 123 return n + 1 == wdsz; 124 } 125 126 /** 127 * Tests is the Metaphones of two strings are identical. 128 * 129 * @param str1 First of two strings to compare. 130 * @param str2 Second of two strings to compare. 131 * @return {@code true} if the Metaphones of these strings are identical, {@code false} otherwise. 132 */ 133 public boolean isMetaphoneEqual(final String str1, final String str2) { 134 return metaphone(str1).equals(metaphone(str2)); 135 } 136 137 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 138 boolean matches = false; 139 if (index >= 0 && index < string.length() - 1) { 140 matches = string.charAt(index + 1) == c; 141 } 142 return matches; 143 } 144 145 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 146 boolean matches = false; 147 if (index > 0 && index < string.length()) { 148 matches = string.charAt(index - 1) == c; 149 } 150 return matches; 151 } 152 153 private boolean isVowel(final StringBuilder string, final int index) { 154 return VOWELS.indexOf(string.charAt(index)) >= 0; 155 } 156 157 /** 158 * Find the Metaphone value of a String. This is similar to the 159 * Soundex algorithm, but better at finding similar sounding words. 160 * All input is converted to upper case. 161 * Limitations: Input format is expected to be a single ASCII word 162 * with only characters in the A - Z range, no punctuation or numbers. 163 * 164 * @param txt String to find the Metaphone code for. 165 * @return A Metaphone code corresponding to the String supplied. 166 */ 167 public String metaphone(final String txt) { 168 boolean hard = false; 169 final int txtLength; 170 if (txt == null || (txtLength = txt.length()) == 0) { 171 return ""; 172 } 173 // single character is itself 174 if (txtLength == 1) { 175 return txt.toUpperCase(Locale.ENGLISH); 176 } 177 178 final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray(); 179 180 final StringBuilder local = new StringBuilder(40); // manipulate 181 final StringBuilder code = new StringBuilder(10); // output 182 // handle initial 2 characters exceptions 183 switch (inwd[0]) { 184 case 'K': 185 case 'G': 186 case 'P': /* looking for KN, etc */ 187 if (inwd[1] == 'N') { 188 local.append(inwd, 1, inwd.length - 1); 189 } else { 190 local.append(inwd); 191 } 192 break; 193 case 'A': /* looking for AE */ 194 if (inwd[1] == 'E') { 195 local.append(inwd, 1, inwd.length - 1); 196 } else { 197 local.append(inwd); 198 } 199 break; 200 case 'W': /* looking for WR or WH */ 201 if (inwd[1] == 'R') { // WR -> R 202 local.append(inwd, 1, inwd.length - 1); 203 break; 204 } 205 if (inwd[1] == 'H') { 206 local.append(inwd, 1, inwd.length - 1); 207 local.setCharAt(0, 'W'); // WH -> W 208 } else { 209 local.append(inwd); 210 } 211 break; 212 case 'X': /* initial X becomes S */ 213 inwd[0] = 'S'; 214 local.append(inwd); 215 break; 216 default: 217 local.append(inwd); 218 } // now local has working string with initials fixed 219 220 final int wdsz = local.length(); 221 int n = 0; 222 223 while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well 224 final char symb = local.charAt(n); 225 // remove duplicate letters except C 226 if (symb == 'C' || !isPreviousChar(local, n, symb)) { 227 // not dup 228 switch (symb) { 229 case 'A': 230 case 'E': 231 case 'I': 232 case 'O': 233 case 'U': 234 if (n == 0) { 235 code.append(symb); 236 } 237 break; // only use vowel if leading char 238 case 'B': 239 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB 240 break; 241 } 242 code.append(symb); 243 break; 244 case 'C': // lots of C special cases 245 /* discard if SCI, SCE or SCY */ 246 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 247 break; 248 } 249 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk 250 code.append('K'); 251 break; 252 } 253 if (regionMatch(local, n, "CIA") || isNextChar(local, n, 'H')) { // "CIA" -> X or CH -> X 254 code.append('X'); 255 break; 256 } 257 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 258 code.append('S'); 259 break; // CI,CE,CY -> S 260 } 261 code.append('K'); // default C -> K 262 break; 263 case 'D': 264 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 265 code.append('J'); 266 n += 2; 267 } else { 268 code.append('T'); 269 } 270 break; 271 case 'G': // GH silent at end or before consonant 272 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) { 273 break; 274 } 275 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) { 276 break; 277 } 278 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) { 279 break; // silent G 280 } 281 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 282 hard = isPreviousChar(local, n, 'G'); 283 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) { 284 code.append('J'); 285 } else { 286 code.append('K'); 287 } 288 break; 289 case 'H': 290 if (isLastChar(wdsz, n)) { 291 break; // terminal H 292 } 293 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) { 294 break; 295 } 296 if (isVowel(local, n + 1)) { 297 code.append('H'); // Hvowel 298 } 299 break; 300 case 'F': 301 case 'J': 302 case 'L': 303 case 'M': 304 case 'N': 305 case 'R': 306 code.append(symb); 307 break; 308 case 'K': 309 if (n > 0) { // not initial 310 if (!isPreviousChar(local, n, 'C')) { 311 code.append(symb); 312 } 313 } else { 314 code.append(symb); // initial K 315 } 316 break; 317 case 'P': 318 if (isNextChar(local, n, 'H')) { 319 // PH -> F 320 code.append('F'); 321 } else { 322 code.append(symb); 323 } 324 break; 325 case 'Q': 326 code.append('K'); 327 break; 328 case 'S': 329 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) { 330 code.append('X'); 331 } else { 332 code.append('S'); 333 } 334 break; 335 case 'T': 336 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) { 337 code.append('X'); 338 break; 339 } 340 if (regionMatch(local, n, "TCH")) { 341 // Silent if in "TCH" 342 break; 343 } 344 // substitute numeral 0 for TH (resembles theta after all) 345 if (regionMatch(local, n, "TH")) { 346 code.append('0'); 347 } else { 348 code.append('T'); 349 } 350 break; 351 case 'V': 352 code.append('F'); 353 break; 354 case 'W': 355 case 'Y': // silent if not followed by vowel 356 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) { 357 code.append(symb); 358 } 359 break; 360 case 'X': 361 code.append('K'); 362 code.append('S'); 363 break; 364 case 'Z': 365 code.append('S'); 366 break; 367 default: 368 // do nothing 369 break; 370 } // end switch 371 } // end else from symb != 'C' 372 n++; 373 if (code.length() > getMaxCodeLen()) { 374 code.setLength(getMaxCodeLen()); 375 } 376 } 377 return code.toString(); 378 } 379 380 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 381 boolean matches = false; 382 if (index >= 0 && index + test.length() - 1 < string.length()) { 383 final String substring = string.substring(index, index + test.length()); 384 matches = substring.equals(test); 385 } 386 return matches; 387 } 388 389 /** 390 * Sets the maxCodeLen. 391 * 392 * @param maxCodeLen The maxCodeLen to set. 393 */ 394 public void setMaxCodeLen(final int maxCodeLen) { 395 this.maxCodeLen = maxCodeLen; 396 } 397 398}