| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Metaphone |
|
| 9.333333333333334;9.333 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.commons.codec.language; | |
| 19 | ||
| 20 | import org.apache.commons.codec.EncoderException; | |
| 21 | import org.apache.commons.codec.StringEncoder; | |
| 22 | ||
| 23 | /** | |
| 24 | * Encodes a string into a Metaphone value. | |
| 25 | * <p> | |
| 26 | * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. | |
| 27 | * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. | |
| 28 | * <p> | |
| 29 | * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, | |
| 30 | * p 39.</CITE> | |
| 31 | * <p> | |
| 32 | * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: | |
| 33 | * </p> | |
| 34 | * <ul> | |
| 35 | * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </li> | |
| 36 | * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </li> | |
| 37 | * </ul> | |
| 38 | * <p> | |
| 39 | * They have had undocumented changes from the originally published algorithm. | |
| 40 | * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. | |
| 41 | * <p> | |
| 42 | * This class is conditionally thread-safe. | |
| 43 | * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} | |
| 44 | * but is not volatile, and accesses are not synchronized. | |
| 45 | * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization | |
| 46 | * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} | |
| 47 | * after initial setup. | |
| 48 | * | |
| 49 | * @version $Id: Metaphone.java 1477596 2013-04-30 12:42:29Z ggregory $ | |
| 50 | */ | |
| 51 | public class Metaphone implements StringEncoder { | |
| 52 | ||
| 53 | /** | |
| 54 | * Five values in the English language | |
| 55 | */ | |
| 56 | private static final String VOWELS = "AEIOU"; | |
| 57 | ||
| 58 | /** | |
| 59 | * Variable used in Metaphone algorithm | |
| 60 | */ | |
| 61 | private static final String FRONTV = "EIY"; | |
| 62 | ||
| 63 | /** | |
| 64 | * Variable used in Metaphone algorithm | |
| 65 | */ | |
| 66 | private static final String VARSON = "CSPTG"; | |
| 67 | ||
| 68 | /** | |
| 69 | * The max code length for metaphone is 4 | |
| 70 | */ | |
| 71 | 34 | private int maxCodeLen = 4; |
| 72 | ||
| 73 | /** | |
| 74 | * Creates an instance of the Metaphone encoder | |
| 75 | */ | |
| 76 | public Metaphone() { | |
| 77 | 34 | super(); |
| 78 | 34 | } |
| 79 | ||
| 80 | /** | |
| 81 | * Find the metaphone value of a String. This is similar to the | |
| 82 | * soundex algorithm, but better at finding similar sounding words. | |
| 83 | * All input is converted to upper case. | |
| 84 | * Limitations: Input format is expected to be a single ASCII word | |
| 85 | * with only characters in the A - Z range, no punctuation or numbers. | |
| 86 | * | |
| 87 | * @param txt String to find the metaphone code for | |
| 88 | * @return A metaphone code corresponding to the String supplied | |
| 89 | */ | |
| 90 | public String metaphone(final String txt) { | |
| 91 | 13774 | boolean hard = false; |
| 92 | 13774 | if (txt == null || txt.length() == 0) { |
| 93 | 2 | return ""; |
| 94 | } | |
| 95 | // single character is itself | |
| 96 | 13772 | if (txt.length() == 1) { |
| 97 | 8 | return txt.toUpperCase(java.util.Locale.ENGLISH); |
| 98 | } | |
| 99 | ||
| 100 | 13764 | final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); |
| 101 | ||
| 102 | 13764 | final StringBuilder local = new StringBuilder(40); // manipulate |
| 103 | 13764 | final StringBuilder code = new StringBuilder(10); // output |
| 104 | // handle initial 2 characters exceptions | |
| 105 | 13764 | switch(inwd[0]) { |
| 106 | case 'K': | |
| 107 | case 'G': | |
| 108 | case 'P': /* looking for KN, etc*/ | |
| 109 | 3901 | if (inwd[1] == 'N') { |
| 110 | 17 | local.append(inwd, 1, inwd.length - 1); |
| 111 | } else { | |
| 112 | 3884 | local.append(inwd); |
| 113 | } | |
| 114 | 3884 | break; |
| 115 | case 'A': /* looking for AE */ | |
| 116 | 64 | if (inwd[1] == 'E') { |
| 117 | 1 | local.append(inwd, 1, inwd.length - 1); |
| 118 | } else { | |
| 119 | 63 | local.append(inwd); |
| 120 | } | |
| 121 | 63 | break; |
| 122 | case 'W': /* looking for WR or WH */ | |
| 123 | 318 | if (inwd[1] == 'R') { // WR -> R |
| 124 | 3 | local.append(inwd, 1, inwd.length - 1); |
| 125 | 3 | break; |
| 126 | } | |
| 127 | 315 | if (inwd[1] == 'H') { |
| 128 | 38 | local.append(inwd, 1, inwd.length - 1); |
| 129 | 38 | local.setCharAt(0, 'W'); // WH -> W |
| 130 | } else { | |
| 131 | 277 | local.append(inwd); |
| 132 | } | |
| 133 | 277 | break; |
| 134 | case 'X': /* initial X becomes S */ | |
| 135 | 28 | inwd[0] = 'S'; |
| 136 | 28 | local.append(inwd); |
| 137 | 28 | break; |
| 138 | default: | |
| 139 | 9453 | local.append(inwd); |
| 140 | } // now local has working string with initials fixed | |
| 141 | ||
| 142 | 13764 | final int wdsz = local.length(); |
| 143 | 13764 | int n = 0; |
| 144 | ||
| 145 | 81745 | while (code.length() < this.getMaxCodeLen() && |
| 146 | n < wdsz ) { // max code size of 4 works well | |
| 147 | 67981 | final char symb = local.charAt(n); |
| 148 | // remove duplicate letters except C | |
| 149 | 67981 | if (symb != 'C' && isPreviousChar( local, n, symb ) ) { |
| 150 | 5607 | n++; |
| 151 | } else { // not dup | |
| 152 | 62374 | switch(symb) { |
| 153 | case 'A': | |
| 154 | case 'E': | |
| 155 | case 'I': | |
| 156 | case 'O': | |
| 157 | case 'U': | |
| 158 | 30521 | if (n == 0) { |
| 159 | 70 | code.append(symb); |
| 160 | } | |
| 161 | break; // only use vowel if leading char | |
| 162 | case 'B': | |
| 163 | 65 | if ( isPreviousChar(local, n, 'M') && |
| 164 | isLastChar(wdsz, n) ) { // B is silent if word ends in MB | |
| 165 | 3 | break; |
| 166 | } | |
| 167 | 62 | code.append(symb); |
| 168 | 62 | break; |
| 169 | case 'C': // lots of C special cases | |
| 170 | /* discard if SCI, SCE or SCY */ | |
| 171 | 1982 | if ( isPreviousChar(local, n, 'S') && |
| 172 | !isLastChar(wdsz, n) && | |
| 173 | FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { | |
| 174 | 3 | break; |
| 175 | } | |
| 176 | 1979 | if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
| 177 | 1 | code.append('X'); |
| 178 | 1 | break; |
| 179 | } | |
| 180 | 1978 | if (!isLastChar(wdsz, n) && |
| 181 | FRONTV.indexOf(local.charAt(n + 1)) >= 0) { | |
| 182 | 82 | code.append('S'); |
| 183 | 82 | break; // CI,CE,CY -> S |
| 184 | } | |
| 185 | 1896 | if (isPreviousChar(local, n, 'S') && |
| 186 | isNextChar(local, n, 'H') ) { // SCH->sk | |
| 187 | 2 | code.append('K'); |
| 188 | 2 | break; |
| 189 | } | |
| 190 | 1894 | if (isNextChar(local, n, 'H')) { // detect CH |
| 191 | 4 | if (n == 0 && |
| 192 | wdsz >= 3 && | |
| 193 | isVowel(local,2) ) { // CH consonant -> K consonant | |
| 194 | 1 | code.append('K'); |
| 195 | } else { | |
| 196 | 3 | code.append('X'); // CHvowel -> X |
| 197 | } | |
| 198 | } else { | |
| 199 | 1890 | code.append('K'); |
| 200 | } | |
| 201 | 1890 | break; |
| 202 | case 'D': | |
| 203 | 445 | if (!isLastChar(wdsz, n + 1) && |
| 204 | isNextChar(local, n, 'G') && | |
| 205 | FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J | |
| 206 | 3 | code.append('J'); n += 2; |
| 207 | } else { | |
| 208 | 442 | code.append('T'); |
| 209 | } | |
| 210 | 442 | break; |
| 211 | case 'G': // GH silent at end or before consonant | |
| 212 | 1705 | if (isLastChar(wdsz, n + 1) && |
| 213 | isNextChar(local, n, 'H')) { | |
| 214 | 1 | break; |
| 215 | } | |
| 216 | 1704 | if (!isLastChar(wdsz, n + 1) && |
| 217 | isNextChar(local,n,'H') && | |
| 218 | !isVowel(local,n+2)) { | |
| 219 | 19 | break; |
| 220 | } | |
| 221 | 1685 | if (n > 0 && |
| 222 | ( regionMatch(local, n, "GN") || | |
| 223 | regionMatch(local, n, "GNED") ) ) { | |
| 224 | 0 | break; // silent G |
| 225 | } | |
| 226 | 1684 | if (isPreviousChar(local, n, 'G')) { |
| 227 | // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true | |
| 228 | 0 | hard = true; |
| 229 | } else { | |
| 230 | 1684 | hard = false; |
| 231 | } | |
| 232 | 1684 | if (!isLastChar(wdsz, n) && |
| 233 | FRONTV.indexOf(local.charAt(n + 1)) >= 0 && | |
| 234 | !hard) { | |
| 235 | 1547 | code.append('J'); |
| 236 | } else { | |
| 237 | 137 | code.append('K'); |
| 238 | } | |
| 239 | 137 | break; |
| 240 | case 'H': | |
| 241 | 654 | if (isLastChar(wdsz, n)) { |
| 242 | 205 | break; // terminal H |
| 243 | } | |
| 244 | 449 | if (n > 0 && |
| 245 | VARSON.indexOf(local.charAt(n - 1)) >= 0) { | |
| 246 | 27 | break; |
| 247 | } | |
| 248 | 422 | if (isVowel(local,n+1)) { |
| 249 | 1 | code.append('H'); // Hvowel |
| 250 | } | |
| 251 | break; | |
| 252 | case 'F': | |
| 253 | case 'J': | |
| 254 | case 'L': | |
| 255 | case 'M': | |
| 256 | case 'N': | |
| 257 | case 'R': | |
| 258 | 20128 | code.append(symb); |
| 259 | 20128 | break; |
| 260 | case 'K': | |
| 261 | 1963 | if (n > 0) { // not initial |
| 262 | 5 | if (!isPreviousChar(local, n, 'C')) { |
| 263 | 2 | code.append(symb); |
| 264 | } | |
| 265 | } else { | |
| 266 | 1958 | code.append(symb); // initial K |
| 267 | } | |
| 268 | 1958 | break; |
| 269 | case 'P': | |
| 270 | 245 | if (isNextChar(local,n,'H')) { |
| 271 | // PH -> F | |
| 272 | 1 | code.append('F'); |
| 273 | } else { | |
| 274 | 244 | code.append(symb); |
| 275 | } | |
| 276 | 244 | break; |
| 277 | case 'Q': | |
| 278 | 3 | code.append('K'); |
| 279 | 3 | break; |
| 280 | case 'S': | |
| 281 | 673 | if (regionMatch(local,n,"SH") || |
| 282 | regionMatch(local,n,"SIO") || | |
| 283 | regionMatch(local,n,"SIA")) { | |
| 284 | 4 | code.append('X'); |
| 285 | } else { | |
| 286 | 669 | code.append('S'); |
| 287 | } | |
| 288 | 669 | break; |
| 289 | case 'T': | |
| 290 | 640 | if (regionMatch(local,n,"TIA") || |
| 291 | regionMatch(local,n,"TIO")) { | |
| 292 | 2 | code.append('X'); |
| 293 | 2 | break; |
| 294 | } | |
| 295 | 638 | if (regionMatch(local,n,"TCH")) { |
| 296 | // Silent if in "TCH" | |
| 297 | 2 | break; |
| 298 | } | |
| 299 | // substitute numeral 0 for TH (resembles theta after all) | |
| 300 | 636 | if (regionMatch(local,n,"TH")) { |
| 301 | 2 | code.append('0'); |
| 302 | } else { | |
| 303 | 634 | code.append('T'); |
| 304 | } | |
| 305 | 634 | break; |
| 306 | case 'V': | |
| 307 | 1 | code.append('F'); break; |
| 308 | case 'W': | |
| 309 | case 'Y': // silent if not followed by vowel | |
| 310 | 2847 | if (!isLastChar(wdsz,n) && |
| 311 | isVowel(local,n+1)) { | |
| 312 | 314 | code.append(symb); |
| 313 | } | |
| 314 | break; | |
| 315 | case 'X': | |
| 316 | 6 | code.append('K'); |
| 317 | 6 | code.append('S'); |
| 318 | 6 | break; |
| 319 | case 'Z': | |
| 320 | 139 | code.append('S'); |
| 321 | 139 | break; |
| 322 | default: | |
| 323 | // do nothing | |
| 324 | break; | |
| 325 | } // end switch | |
| 326 | 62374 | n++; |
| 327 | } // end else from symb != 'C' | |
| 328 | 67981 | if (code.length() > this.getMaxCodeLen()) { |
| 329 | 2 | code.setLength(this.getMaxCodeLen()); |
| 330 | } | |
| 331 | 67981 | } |
| 332 | 13764 | return code.toString(); |
| 333 | } | |
| 334 | ||
| 335 | private boolean isVowel(final StringBuilder string, final int index) { | |
| 336 | 1132 | return VOWELS.indexOf(string.charAt(index)) >= 0; |
| 337 | } | |
| 338 | ||
| 339 | private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { | |
| 340 | 71631 | boolean matches = false; |
| 341 | 71631 | if( index > 0 && |
| 342 | index < string.length() ) { | |
| 343 | 54295 | matches = string.charAt(index - 1) == c; |
| 344 | } | |
| 345 | 71631 | return matches; |
| 346 | } | |
| 347 | ||
| 348 | private boolean isNextChar(final StringBuilder string, final int index, final char c) { | |
| 349 | 4062 | boolean matches = false; |
| 350 | 4062 | if( index >= 0 && |
| 351 | index < string.length() - 1 ) { | |
| 352 | 4035 | matches = string.charAt(index + 1) == c; |
| 353 | } | |
| 354 | 4062 | return matches; |
| 355 | } | |
| 356 | ||
| 357 | private boolean regionMatch(final StringBuilder string, final int index, final String test) { | |
| 358 | 6549 | boolean matches = false; |
| 359 | 6549 | if( index >= 0 && |
| 360 | index + test.length() - 1 < string.length() ) { | |
| 361 | 5282 | final String substring = string.substring( index, index + test.length()); |
| 362 | 5282 | matches = substring.equals( test ); |
| 363 | } | |
| 364 | 6549 | return matches; |
| 365 | } | |
| 366 | ||
| 367 | private boolean isLastChar(final int wdsz, final int n) { | |
| 368 | 11025 | return n + 1 == wdsz; |
| 369 | } | |
| 370 | ||
| 371 | ||
| 372 | /** | |
| 373 | * Encodes an Object using the metaphone algorithm. This method | |
| 374 | * is provided in order to satisfy the requirements of the | |
| 375 | * Encoder interface, and will throw an EncoderException if the | |
| 376 | * supplied object is not of type java.lang.String. | |
| 377 | * | |
| 378 | * @param obj Object to encode | |
| 379 | * @return An object (or type java.lang.String) containing the | |
| 380 | * metaphone code which corresponds to the String supplied. | |
| 381 | * @throws EncoderException if the parameter supplied is not | |
| 382 | * of type java.lang.String | |
| 383 | */ | |
| 384 | @Override | |
| 385 | public Object encode(final Object obj) throws EncoderException { | |
| 386 | 4 | if (!(obj instanceof String)) { |
| 387 | 1 | throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); |
| 388 | } | |
| 389 | 3 | return metaphone((String) obj); |
| 390 | } | |
| 391 | ||
| 392 | /** | |
| 393 | * Encodes a String using the Metaphone algorithm. | |
| 394 | * | |
| 395 | * @param str String object to encode | |
| 396 | * @return The metaphone code corresponding to the String supplied | |
| 397 | */ | |
| 398 | @Override | |
| 399 | public String encode(final String str) { | |
| 400 | 7 | return metaphone(str); |
| 401 | } | |
| 402 | ||
| 403 | /** | |
| 404 | * Tests is the metaphones of two strings are identical. | |
| 405 | * | |
| 406 | * @param str1 First of two strings to compare | |
| 407 | * @param str2 Second of two strings to compare | |
| 408 | * @return {@code true} if the metaphones of these strings are identical, | |
| 409 | * {@code false} otherwise. | |
| 410 | */ | |
| 411 | public boolean isMetaphoneEqual(final String str1, final String str2) { | |
| 412 | 6862 | return metaphone(str1).equals(metaphone(str2)); |
| 413 | } | |
| 414 | ||
| 415 | /** | |
| 416 | * Returns the maxCodeLen. | |
| 417 | * @return int | |
| 418 | */ | |
| 419 | 149728 | public int getMaxCodeLen() { return this.maxCodeLen; } |
| 420 | ||
| 421 | /** | |
| 422 | * Sets the maxCodeLen. | |
| 423 | * @param maxCodeLen The maxCodeLen to set | |
| 424 | */ | |
| 425 | 1 | public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } |
| 426 | ||
| 427 | } |