001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.codec.language; 018 019 import java.util.Locale; 020 021 import org.apache.commons.codec.EncoderException; 022 import org.apache.commons.codec.StringEncoder; 023 024 /** 025 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977. 026 * 027 * This class is immutable and thread-safe. 028 * 029 * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a> 030 * @since 1.8 031 */ 032 public class MatchRatingApproachEncoder implements StringEncoder { 033 034 private static final String SPACE = " "; 035 036 private static final String EMPTY = ""; 037 038 /** 039 * Constants used mainly for the min rating value. 040 */ 041 private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8, 042 ELEVEN = 11, TWELVE = 12; 043 044 /** 045 * The plain letter equivalent of the accented letters. 046 */ 047 private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave 048 "AaEeIiOoUuYy" + // acute 049 "AaEeIiOoUuYy" + // circumflex 050 "AaOoNn" + // tilde 051 "AaEeIiOoUuYy" + // umlaut 052 "Aa" + // ring 053 "Cc" + // cedilla 054 "OoUu"; // double acute 055 056 /** 057 * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc... 058 */ 059 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" + 060 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" + 061 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" + 062 "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" + 063 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" + 064 "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171"; 065 066 private static final String[] DOUBLE_CONSONANT = 067 new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", 068 "TT", "VV", "WW", "XX", "YY", "ZZ" }; 069 070 /** 071 * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any 072 * spaces. 073 * 074 * <h2>API Usage</h2> 075 * <p> 076 * Consider this method private, it is package protected for unit testing only. 077 * </p> 078 * 079 * @param name 080 * The name to be cleaned 081 * @return The cleaned name 082 */ 083 String cleanName(final String name) { 084 String upperName = name.toUpperCase(Locale.ENGLISH); 085 086 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; 087 for (final String str : charsToTrim) { 088 upperName = upperName.replaceAll(str, EMPTY); 089 } 090 091 upperName = removeAccents(upperName); 092 upperName = upperName.replaceAll("\\s+", EMPTY); 093 094 return upperName; 095 } 096 097 /** 098 * Encodes an Object using the Match Rating Approach algo. Method is here to satisfy the requirements of the 099 * Encoder interface Throws an EncoderException if input object is not of type java.lang.String. 100 * 101 * @param pObject 102 * Object to encode 103 * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the 104 * String supplied. 105 * @throws EncoderException 106 * if the parameter supplied is not of type java.lang.String 107 */ 108 @Override 109 public final Object encode(final Object pObject) throws EncoderException { 110 if (!(pObject instanceof String)) { 111 throw new EncoderException( 112 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String"); 113 } 114 return encode((String) pObject); 115 } 116 117 /** 118 * Encodes a String using the Match Rating Approach (MRA) algorithm. 119 * 120 * @param name 121 * String object to encode 122 * @return The MRA code corresponding to the String supplied 123 */ 124 @Override 125 public final String encode(String name) { 126 // Bulletproof for trivial input - NINO 127 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) { 128 return EMPTY; 129 } 130 131 // Preprocessing 132 name = cleanName(name); 133 134 // BEGIN: Actual encoding part of the algorithm... 135 // 1. Delete all vowels unless the vowel begins the word 136 name = removeVowels(name); 137 138 // 2. Remove second consonant from any double consonant 139 name = removeDoubleConsonants(name); 140 141 // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters 142 name = getFirst3Last3(name); 143 144 return name; 145 } 146 147 /** 148 * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns the name. 149 * 150 * <h2>API Usage</h2> 151 * <p> 152 * Consider this method private, it is package protected for unit testing only. 153 * </p> 154 * 155 * @param name 156 * The string to get the substrings from 157 * @return Annexed first & last 3 letters of input word. 158 */ 159 String getFirst3Last3(final String name) { 160 final int nameLength = name.length(); 161 162 if (nameLength > SIX) { 163 final String firstThree = name.substring(0, THREE); 164 final String lastThree = name.substring(nameLength - THREE, nameLength); 165 return firstThree + lastThree; 166 } else { 167 return name; 168 } 169 } 170 171 /** 172 * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the 173 * min rating. Values strictly from documentation. 174 * 175 * <h2>API Usage</h2> 176 * <p> 177 * Consider this method private, it is package protected for unit testing only. 178 * </p> 179 * 180 * @param sumLength 181 * The length of 2 strings sent down 182 * @return The min rating value 183 */ 184 int getMinRating(final int sumLength) { 185 int minRating = 0; 186 187 if (sumLength <= FOUR) { 188 minRating = FIVE; 189 } else if (sumLength >= FIVE && sumLength <= SEVEN) { 190 minRating = FOUR; 191 } else if (sumLength >= EIGHT && sumLength <= ELEVEN) { 192 minRating = THREE; 193 } else if (sumLength == TWELVE) { 194 minRating = TWO; 195 } else { 196 minRating = ONE; // docs said little here. 197 } 198 199 return minRating; 200 } 201 202 /** 203 * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the 204 * strings are cleaned in the same way as {@link #encode(String)}. 205 * 206 * @param name1 207 * First of the 2 strings (names) to compare 208 * @param name2 209 * Second of the 2 names to compare 210 * @return <code>true</code> if the encodings are identical <code>false</code> otherwise. 211 */ 212 public boolean isEncodeEquals(String name1, String name2) { 213 // Bulletproof for trivial input - NINO 214 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) { 215 return false; 216 } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) { 217 return false; 218 } else if (name1.length() == 1 || name2.length() == 1) { 219 return false; 220 } else if (name1.equalsIgnoreCase(name2)) { 221 return true; 222 } 223 224 // Preprocessing 225 name1 = cleanName(name1); 226 name2 = cleanName(name2); 227 228 // Actual MRA Algorithm 229 230 // 1. Remove vowels 231 name1 = removeVowels(name1); 232 name2 = removeVowels(name2); 233 234 // 2. Remove double consonants 235 name1 = removeDoubleConsonants(name1); 236 name2 = removeDoubleConsonants(name2); 237 238 // 3. Reduce down to 3 letters 239 name1 = getFirst3Last3(name1); 240 name2 = getFirst3Last3(name2); 241 242 // 4. Check for length difference - if 3 or greater then no similarity 243 // comparison is done 244 if (Math.abs(name1.length() - name2.length()) >= THREE) { 245 return false; 246 } 247 248 // 5. Obtain the minimum rating value by calculating the length sum of the 249 // encoded Strings and sending it down. 250 final int sumLength = Math.abs(name1.length() + name2.length()); 251 int minRating = 0; 252 minRating = getMinRating(sumLength); 253 254 // 6. Process the encoded Strings from left to right and remove any 255 // identical characters found from both Strings respectively. 256 final int count = leftToRightThenRightToLeftProcessing(name1, name2); 257 258 // 7. Each PNI item that has a similarity rating equal to or greater than 259 // the min is considered to be a good candidate match 260 return count >= minRating; 261 262 } 263 264 /** 265 * Determines if a letter is a vowel. 266 * 267 * <h2>API Usage</h2> 268 * <p> 269 * Consider this method private, it is package protected for unit testing only. 270 * </p> 271 * 272 * @param letter 273 * The letter under investiagtion 274 * @return True if a vowel, else false 275 */ 276 boolean isVowel(final String letter) { 277 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") || 278 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U"); 279 } 280 281 /** 282 * Processes the names from left to right (first) then right to left removing identical letters in same positions. 283 * Then subtracts the longer string that remains from 6 and returns this. 284 * 285 * <h2>API Usage</h2> 286 * <p> 287 * Consider this method private, it is package protected for unit testing only. 288 * </p> 289 * 290 * @param name1 291 * name2 292 * @return 293 */ 294 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) { 295 final char[] name1Char = name1.toCharArray(); 296 final char[] name2Char = name2.toCharArray(); 297 298 final int name1Size = name1.length() - 1; 299 final int name2Size = name2.length() - 1; 300 301 String name1LtRStart = EMPTY; 302 String name1LtREnd = EMPTY; 303 304 String name2RtLStart = EMPTY; 305 String name2RtLEnd = EMPTY; 306 307 for (int i = 0; i < name1Char.length; i++) { 308 if (i > name2Size) { 309 break; 310 } 311 312 name1LtRStart = name1.substring(i, i + 1); 313 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1); 314 315 name2RtLStart = name2.substring(i, i + 1); 316 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1); 317 318 // Left to right... 319 if (name1LtRStart.equals(name2RtLStart)) { 320 name1Char[i] = ' '; 321 name2Char[i] = ' '; 322 } 323 324 // Right to left... 325 if (name1LtREnd.equals(name2RtLEnd)) { 326 name1Char[name1Size - i] = ' '; 327 name2Char[name2Size - i] = ' '; 328 } 329 } 330 331 // Char arrays -> string & remove extraneous space 332 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY); 333 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY); 334 335 // Final bit - subtract longest string from 6 and return this int value 336 if (strA.length() > strB.length()) { 337 return Math.abs(SIX - strA.length()); 338 } else { 339 return Math.abs(SIX - strB.length()); 340 } 341 } 342 343 /** 344 * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved. 345 * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29 346 * 347 * @param accentedWord 348 * The word that may have accents in it. 349 * @return De-accented word 350 */ 351 String removeAccents(final String accentedWord) { 352 if (accentedWord == null) { 353 return null; 354 } 355 356 final StringBuilder sb = new StringBuilder(); 357 final int n = accentedWord.length(); 358 359 for (int i = 0; i < n; i++) { 360 final char c = accentedWord.charAt(i); 361 final int pos = UNICODE.indexOf(c); 362 if (pos > -1) { 363 sb.append(PLAIN_ASCII.charAt(pos)); 364 } else { 365 sb.append(c); 366 } 367 } 368 369 return sb.toString(); 370 } 371 372 /** 373 * Replaces any double consonant pair with the single letter equivalent. 374 * 375 * <h2>API Usage</h2> 376 * <p> 377 * Consider this method private, it is package protected for unit testing only. 378 * </p> 379 * 380 * @param name 381 * String to have double consonants removed 382 * @return Single consonant word 383 */ 384 String removeDoubleConsonants(final String name) { 385 String replacedName = name.toUpperCase(); 386 for (final String dc : DOUBLE_CONSONANT) { 387 if (replacedName.contains(dc)) { 388 final String singleLetter = dc.substring(0, 1); 389 replacedName = replacedName.replace(dc, singleLetter); 390 } 391 } 392 return replacedName; 393 } 394 395 /** 396 * Deletes all vowels unless the vowel begins the word. 397 * 398 * <h2>API Usage</h2> 399 * <p> 400 * Consider this method private, it is package protected for unit testing only. 401 * </p> 402 * 403 * @param name 404 * The name to have vowels removed 405 * @return De-voweled word 406 */ 407 String removeVowels(String name) { 408 // Extract first letter 409 final String firstLetter = name.substring(0, 1); 410 411 name = name.replaceAll("A", EMPTY); 412 name = name.replaceAll("E", EMPTY); 413 name = name.replaceAll("I", EMPTY); 414 name = name.replaceAll("O", EMPTY); 415 name = name.replaceAll("U", EMPTY); 416 417 name = name.replaceAll("\\s{2,}\\b", SPACE); 418 419 // return isVowel(firstLetter) ? (firstLetter + name) : name; 420 if (isVowel(firstLetter)) { 421 return firstLetter + name; 422 } else { 423 return name; 424 } 425 } 426 }