Coverage Report - org.apache.commons.codec.language.Metaphone
 
Classes in this File Line Coverage Branch Coverage Complexity
Metaphone
98%
144/146
84%
146/172
9.333
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.commons.codec.language;
 19  
 
 20  
 import org.apache.commons.codec.EncoderException;
 21  
 import org.apache.commons.codec.StringEncoder;
 22  
 
 23  
 /**
 24  
  * Encodes a string into a Metaphone value.
 25  
  * <p>
 26  
  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
 27  
  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
 28  
  * <p>
 29  
  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
 30  
  * p 39.</CITE>
 31  
  * <p>
 32  
  * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
 33  
  * </p>
 34  
  * <ul>
 35  
  * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </li>
 36  
  * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </li>
 37  
  * </ul>
 38  
  * <p>
 39  
  * They have had undocumented changes from the originally published algorithm.
 40  
  * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
 41  
  * <p>
 42  
  * This class is conditionally thread-safe.
 43  
  * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
 44  
  * but is not volatile, and accesses are not synchronized.
 45  
  * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
 46  
  * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
 47  
  * after initial setup.
 48  
  *
 49  
  * @version $Id: Metaphone.java 1477596 2013-04-30 12:42:29Z ggregory $
 50  
  */
 51  
 public class Metaphone implements StringEncoder {
 52  
 
 53  
     /**
 54  
      * Five values in the English language
 55  
      */
 56  
     private static final String VOWELS = "AEIOU";
 57  
 
 58  
     /**
 59  
      * Variable used in Metaphone algorithm
 60  
      */
 61  
     private static final String FRONTV = "EIY";
 62  
 
 63  
     /**
 64  
      * Variable used in Metaphone algorithm
 65  
      */
 66  
     private static final String VARSON = "CSPTG";
 67  
 
 68  
     /**
 69  
      * The max code length for metaphone is 4
 70  
      */
 71  34
     private int maxCodeLen = 4;
 72  
 
 73  
     /**
 74  
      * Creates an instance of the Metaphone encoder
 75  
      */
 76  
     public Metaphone() {
 77  34
         super();
 78  34
     }
 79  
 
 80  
     /**
 81  
      * Find the metaphone value of a String. This is similar to the
 82  
      * soundex algorithm, but better at finding similar sounding words.
 83  
      * All input is converted to upper case.
 84  
      * Limitations: Input format is expected to be a single ASCII word
 85  
      * with only characters in the A - Z range, no punctuation or numbers.
 86  
      *
 87  
      * @param txt String to find the metaphone code for
 88  
      * @return A metaphone code corresponding to the String supplied
 89  
      */
 90  
     public String metaphone(final String txt) {
 91  13774
         boolean hard = false;
 92  13774
         if (txt == null || txt.length() == 0) {
 93  2
             return "";
 94  
         }
 95  
         // single character is itself
 96  13772
         if (txt.length() == 1) {
 97  8
             return txt.toUpperCase(java.util.Locale.ENGLISH);
 98  
         }
 99  
 
 100  13764
         final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
 101  
 
 102  13764
         final StringBuilder local = new StringBuilder(40); // manipulate
 103  13764
         final StringBuilder code = new StringBuilder(10); //   output
 104  
         // handle initial 2 characters exceptions
 105  13764
         switch(inwd[0]) {
 106  
         case 'K':
 107  
         case 'G':
 108  
         case 'P': /* looking for KN, etc*/
 109  3901
             if (inwd[1] == 'N') {
 110  17
                 local.append(inwd, 1, inwd.length - 1);
 111  
             } else {
 112  3884
                 local.append(inwd);
 113  
             }
 114  3884
             break;
 115  
         case 'A': /* looking for AE */
 116  64
             if (inwd[1] == 'E') {
 117  1
                 local.append(inwd, 1, inwd.length - 1);
 118  
             } else {
 119  63
                 local.append(inwd);
 120  
             }
 121  63
             break;
 122  
         case 'W': /* looking for WR or WH */
 123  318
             if (inwd[1] == 'R') {   // WR -> R
 124  3
                 local.append(inwd, 1, inwd.length - 1);
 125  3
                 break;
 126  
             }
 127  315
             if (inwd[1] == 'H') {
 128  38
                 local.append(inwd, 1, inwd.length - 1);
 129  38
                 local.setCharAt(0, 'W'); // WH -> W
 130  
             } else {
 131  277
                 local.append(inwd);
 132  
             }
 133  277
             break;
 134  
         case 'X': /* initial X becomes S */
 135  28
             inwd[0] = 'S';
 136  28
             local.append(inwd);
 137  28
             break;
 138  
         default:
 139  9453
             local.append(inwd);
 140  
         } // now local has working string with initials fixed
 141  
 
 142  13764
         final int wdsz = local.length();
 143  13764
         int n = 0;
 144  
 
 145  81745
         while (code.length() < this.getMaxCodeLen() &&
 146  
                n < wdsz ) { // max code size of 4 works well
 147  67981
             final char symb = local.charAt(n);
 148  
             // remove duplicate letters except C
 149  67981
             if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
 150  5607
                 n++;
 151  
             } else { // not dup
 152  62374
                 switch(symb) {
 153  
                 case 'A':
 154  
                 case 'E':
 155  
                 case 'I':
 156  
                 case 'O':
 157  
                 case 'U':
 158  30521
                     if (n == 0) {
 159  70
                         code.append(symb);
 160  
                     }
 161  
                     break; // only use vowel if leading char
 162  
                 case 'B':
 163  65
                     if ( isPreviousChar(local, n, 'M') &&
 164  
                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
 165  3
                         break;
 166  
                     }
 167  62
                     code.append(symb);
 168  62
                     break;
 169  
                 case 'C': // lots of C special cases
 170  
                     /* discard if SCI, SCE or SCY */
 171  1982
                     if ( isPreviousChar(local, n, 'S') &&
 172  
                          !isLastChar(wdsz, n) &&
 173  
                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
 174  3
                         break;
 175  
                     }
 176  1979
                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
 177  1
                         code.append('X');
 178  1
                         break;
 179  
                     }
 180  1978
                     if (!isLastChar(wdsz, n) &&
 181  
                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
 182  82
                         code.append('S');
 183  82
                         break; // CI,CE,CY -> S
 184  
                     }
 185  1896
                     if (isPreviousChar(local, n, 'S') &&
 186  
                         isNextChar(local, n, 'H') ) { // SCH->sk
 187  2
                         code.append('K');
 188  2
                         break;
 189  
                     }
 190  1894
                     if (isNextChar(local, n, 'H')) { // detect CH
 191  4
                         if (n == 0 &&
 192  
                             wdsz >= 3 &&
 193  
                             isVowel(local,2) ) { // CH consonant -> K consonant
 194  1
                             code.append('K');
 195  
                         } else {
 196  3
                             code.append('X'); // CHvowel -> X
 197  
                         }
 198  
                     } else {
 199  1890
                         code.append('K');
 200  
                     }
 201  1890
                     break;
 202  
                 case 'D':
 203  445
                     if (!isLastChar(wdsz, n + 1) &&
 204  
                         isNextChar(local, n, 'G') &&
 205  
                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
 206  3
                         code.append('J'); n += 2;
 207  
                     } else {
 208  442
                         code.append('T');
 209  
                     }
 210  442
                     break;
 211  
                 case 'G': // GH silent at end or before consonant
 212  1705
                     if (isLastChar(wdsz, n + 1) &&
 213  
                         isNextChar(local, n, 'H')) {
 214  1
                         break;
 215  
                     }
 216  1704
                     if (!isLastChar(wdsz, n + 1) &&
 217  
                         isNextChar(local,n,'H') &&
 218  
                         !isVowel(local,n+2)) {
 219  19
                         break;
 220  
                     }
 221  1685
                     if (n > 0 &&
 222  
                         ( regionMatch(local, n, "GN") ||
 223  
                           regionMatch(local, n, "GNED") ) ) {
 224  0
                         break; // silent G
 225  
                     }
 226  1684
                     if (isPreviousChar(local, n, 'G')) {
 227  
                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
 228  0
                         hard = true;
 229  
                     } else {
 230  1684
                         hard = false;
 231  
                     }
 232  1684
                     if (!isLastChar(wdsz, n) &&
 233  
                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
 234  
                         !hard) {
 235  1547
                         code.append('J');
 236  
                     } else {
 237  137
                         code.append('K');
 238  
                     }
 239  137
                     break;
 240  
                 case 'H':
 241  654
                     if (isLastChar(wdsz, n)) {
 242  205
                         break; // terminal H
 243  
                     }
 244  449
                     if (n > 0 &&
 245  
                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
 246  27
                         break;
 247  
                     }
 248  422
                     if (isVowel(local,n+1)) {
 249  1
                         code.append('H'); // Hvowel
 250  
                     }
 251  
                     break;
 252  
                 case 'F':
 253  
                 case 'J':
 254  
                 case 'L':
 255  
                 case 'M':
 256  
                 case 'N':
 257  
                 case 'R':
 258  20128
                     code.append(symb);
 259  20128
                     break;
 260  
                 case 'K':
 261  1963
                     if (n > 0) { // not initial
 262  5
                         if (!isPreviousChar(local, n, 'C')) {
 263  2
                             code.append(symb);
 264  
                         }
 265  
                     } else {
 266  1958
                         code.append(symb); // initial K
 267  
                     }
 268  1958
                     break;
 269  
                 case 'P':
 270  245
                     if (isNextChar(local,n,'H')) {
 271  
                         // PH -> F
 272  1
                         code.append('F');
 273  
                     } else {
 274  244
                         code.append(symb);
 275  
                     }
 276  244
                     break;
 277  
                 case 'Q':
 278  3
                     code.append('K');
 279  3
                     break;
 280  
                 case 'S':
 281  673
                     if (regionMatch(local,n,"SH") ||
 282  
                         regionMatch(local,n,"SIO") ||
 283  
                         regionMatch(local,n,"SIA")) {
 284  4
                         code.append('X');
 285  
                     } else {
 286  669
                         code.append('S');
 287  
                     }
 288  669
                     break;
 289  
                 case 'T':
 290  640
                     if (regionMatch(local,n,"TIA") ||
 291  
                         regionMatch(local,n,"TIO")) {
 292  2
                         code.append('X');
 293  2
                         break;
 294  
                     }
 295  638
                     if (regionMatch(local,n,"TCH")) {
 296  
                         // Silent if in "TCH"
 297  2
                         break;
 298  
                     }
 299  
                     // substitute numeral 0 for TH (resembles theta after all)
 300  636
                     if (regionMatch(local,n,"TH")) {
 301  2
                         code.append('0');
 302  
                     } else {
 303  634
                         code.append('T');
 304  
                     }
 305  634
                     break;
 306  
                 case 'V':
 307  1
                     code.append('F'); break;
 308  
                 case 'W':
 309  
                 case 'Y': // silent if not followed by vowel
 310  2847
                     if (!isLastChar(wdsz,n) &&
 311  
                         isVowel(local,n+1)) {
 312  314
                         code.append(symb);
 313  
                     }
 314  
                     break;
 315  
                 case 'X':
 316  6
                     code.append('K');
 317  6
                     code.append('S');
 318  6
                     break;
 319  
                 case 'Z':
 320  139
                     code.append('S');
 321  139
                     break;
 322  
                 default:
 323  
                     // do nothing
 324  
                     break;
 325  
                 } // end switch
 326  62374
                 n++;
 327  
             } // end else from symb != 'C'
 328  67981
             if (code.length() > this.getMaxCodeLen()) {
 329  2
                 code.setLength(this.getMaxCodeLen());
 330  
             }
 331  67981
         }
 332  13764
         return code.toString();
 333  
     }
 334  
 
 335  
     private boolean isVowel(final StringBuilder string, final int index) {
 336  1132
         return VOWELS.indexOf(string.charAt(index)) >= 0;
 337  
     }
 338  
 
 339  
     private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
 340  71631
         boolean matches = false;
 341  71631
         if( index > 0 &&
 342  
             index < string.length() ) {
 343  54295
             matches = string.charAt(index - 1) == c;
 344  
         }
 345  71631
         return matches;
 346  
     }
 347  
 
 348  
     private boolean isNextChar(final StringBuilder string, final int index, final char c) {
 349  4062
         boolean matches = false;
 350  4062
         if( index >= 0 &&
 351  
             index < string.length() - 1 ) {
 352  4035
             matches = string.charAt(index + 1) == c;
 353  
         }
 354  4062
         return matches;
 355  
     }
 356  
 
 357  
     private boolean regionMatch(final StringBuilder string, final int index, final String test) {
 358  6549
         boolean matches = false;
 359  6549
         if( index >= 0 &&
 360  
             index + test.length() - 1 < string.length() ) {
 361  5282
             final String substring = string.substring( index, index + test.length());
 362  5282
             matches = substring.equals( test );
 363  
         }
 364  6549
         return matches;
 365  
     }
 366  
 
 367  
     private boolean isLastChar(final int wdsz, final int n) {
 368  11025
         return n + 1 == wdsz;
 369  
     }
 370  
 
 371  
 
 372  
     /**
 373  
      * Encodes an Object using the metaphone algorithm.  This method
 374  
      * is provided in order to satisfy the requirements of the
 375  
      * Encoder interface, and will throw an EncoderException if the
 376  
      * supplied object is not of type java.lang.String.
 377  
      *
 378  
      * @param obj Object to encode
 379  
      * @return An object (or type java.lang.String) containing the
 380  
      *         metaphone code which corresponds to the String supplied.
 381  
      * @throws EncoderException if the parameter supplied is not
 382  
      *                          of type java.lang.String
 383  
      */
 384  
     @Override
 385  
     public Object encode(final Object obj) throws EncoderException {
 386  4
         if (!(obj instanceof String)) {
 387  1
             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
 388  
         }
 389  3
         return metaphone((String) obj);
 390  
     }
 391  
 
 392  
     /**
 393  
      * Encodes a String using the Metaphone algorithm.
 394  
      *
 395  
      * @param str String object to encode
 396  
      * @return The metaphone code corresponding to the String supplied
 397  
      */
 398  
     @Override
 399  
     public String encode(final String str) {
 400  7
         return metaphone(str);
 401  
     }
 402  
 
 403  
     /**
 404  
      * Tests is the metaphones of two strings are identical.
 405  
      *
 406  
      * @param str1 First of two strings to compare
 407  
      * @param str2 Second of two strings to compare
 408  
      * @return {@code true} if the metaphones of these strings are identical,
 409  
      *        {@code false} otherwise.
 410  
      */
 411  
     public boolean isMetaphoneEqual(final String str1, final String str2) {
 412  6862
         return metaphone(str1).equals(metaphone(str2));
 413  
     }
 414  
 
 415  
     /**
 416  
      * Returns the maxCodeLen.
 417  
      * @return int
 418  
      */
 419  149728
     public int getMaxCodeLen() { return this.maxCodeLen; }
 420  
 
 421  
     /**
 422  
      * Sets the maxCodeLen.
 423  
      * @param maxCodeLen The maxCodeLen to set
 424  
      */
 425  1
     public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
 426  
 
 427  
 }