1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.codec.language;
18
19 import java.util.Locale;
20
21 import org.apache.commons.codec.EncoderException;
22 import org.apache.commons.codec.StringEncoder;
23
24 /**
25 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
26 *
27 * This class is immutable and thread-safe.
28 *
29 * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
30 * @since 1.8
31 */
32 public class MatchRatingApproachEncoder implements StringEncoder {
33
34 private static final String SPACE = " ";
35
36 private static final String EMPTY = "";
37
38 /**
39 * Constants used mainly for the min rating value.
40 */
41 private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7,
42 ELEVEN = 11, TWELVE = 12;
43
44 /**
45 * The plain letter equivalent of the accented letters.
46 */
47 private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
48 "AaEeIiOoUuYy" + // acute
49 "AaEeIiOoUuYy" + // circumflex
50 "AaOoNn" + // tilde
51 "AaEeIiOoUuYy" + // umlaut
52 "Aa" + // ring
53 "Cc" + // cedilla
54 "OoUu"; // double acute
55
56 /**
57 * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
58 */
59 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
60 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
61 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
62 "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
63 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
64 "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
65
66 private static final String[] DOUBLE_CONSONANT =
67 new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
68 "TT", "VV", "WW", "XX", "YY", "ZZ" };
69
70 /**
71 * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
72 * spaces.
73 *
74 * <h2>API Usage</h2>
75 * <p>
76 * Consider this method private, it is package protected for unit testing only.
77 * </p>
78 *
79 * @param name
80 * The name to be cleaned
81 * @return The cleaned name
82 */
83 String cleanName(final String name) {
84 String upperName = name.toUpperCase(Locale.ENGLISH);
85
86 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
87 for (final String str : charsToTrim) {
88 upperName = upperName.replaceAll(str, EMPTY);
89 }
90
91 upperName = removeAccents(upperName);
92 upperName = upperName.replaceAll("\\s+", EMPTY);
93
94 return upperName;
95 }
96
97 /**
98 * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
99 * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
100 *
101 * @param pObject
102 * Object to encode
103 * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
104 * String supplied.
105 * @throws EncoderException
106 * if the parameter supplied is not of type java.lang.String
107 */
108 @Override
109 public final Object encode(final Object pObject) throws EncoderException {
110 if (!(pObject instanceof String)) {
111 throw new EncoderException(
112 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
113 }
114 return encode((String) pObject);
115 }
116
117 /**
118 * Encodes a String using the Match Rating Approach (MRA) algorithm.
119 *
120 * @param name
121 * String object to encode
122 * @return The MRA code corresponding to the String supplied
123 */
124 @Override
125 public final String encode(String name) {
126 // Bulletproof for trivial input - NINO
127 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
128 return EMPTY;
129 }
130
131 // Preprocessing
132 name = cleanName(name);
133
134 // BEGIN: Actual encoding part of the algorithm...
135 // 1. Delete all vowels unless the vowel begins the word
136 name = removeVowels(name);
137
138 // 2. Remove second consonant from any double consonant
139 name = removeDoubleConsonants(name);
140
141 // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
142 name = getFirst3Last3(name);
143
144 return name;
145 }
146
147 /**
148 * Gets the first and last 3 letters of a name (if > 6 characters) Else just returns the name.
149 *
150 * <h2>API Usage</h2>
151 * <p>
152 * Consider this method private, it is package protected for unit testing only.
153 * </p>
154 *
155 * @param name
156 * The string to get the substrings from
157 * @return Annexed first and last 3 letters of input word.
158 */
159 String getFirst3Last3(final String name) {
160 final int nameLength = name.length();
161
162 if (nameLength > SIX) {
163 final String firstThree = name.substring(0, THREE);
164 final String lastThree = name.substring(nameLength - THREE, nameLength);
165 return firstThree + lastThree;
166 }
167 return name;
168 }
169
170 /**
171 * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
172 * min rating. Values strictly from documentation.
173 *
174 * <h2>API Usage</h2>
175 * <p>
176 * Consider this method private, it is package protected for unit testing only.
177 * </p>
178 *
179 * @param sumLength
180 * The length of 2 strings sent down
181 * @return The min rating value
182 */
183 int getMinRating(final int sumLength) {
184 int minRating = 0;
185
186 if (sumLength <= FOUR) {
187 minRating = FIVE;
188 } else if (sumLength <= SEVEN) { // aready know it is at least 5
189 minRating = FOUR;
190 } else if (sumLength <= ELEVEN) { // aready know it is at least 8
191 minRating = THREE;
192 } else if (sumLength == TWELVE) {
193 minRating = TWO;
194 } else {
195 minRating = ONE; // docs said little here.
196 }
197
198 return minRating;
199 }
200
201 /**
202 * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
203 * strings are cleaned in the same way as {@link #encode(String)}.
204 *
205 * @param name1
206 * First of the 2 strings (names) to compare
207 * @param name2
208 * Second of the 2 names to compare
209 * @return <code>true</code> if the encodings are identical <code>false</code> otherwise.
210 */
211 public boolean isEncodeEquals(String name1, String name2) {
212 // Bulletproof for trivial input - NINO
213 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
214 return false;
215 } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
216 return false;
217 } else if (name1.length() == 1 || name2.length() == 1) {
218 return false;
219 } else if (name1.equalsIgnoreCase(name2)) {
220 return true;
221 }
222
223 // Preprocessing
224 name1 = cleanName(name1);
225 name2 = cleanName(name2);
226
227 // Actual MRA Algorithm
228
229 // 1. Remove vowels
230 name1 = removeVowels(name1);
231 name2 = removeVowels(name2);
232
233 // 2. Remove double consonants
234 name1 = removeDoubleConsonants(name1);
235 name2 = removeDoubleConsonants(name2);
236
237 // 3. Reduce down to 3 letters
238 name1 = getFirst3Last3(name1);
239 name2 = getFirst3Last3(name2);
240
241 // 4. Check for length difference - if 3 or greater then no similarity
242 // comparison is done
243 if (Math.abs(name1.length() - name2.length()) >= THREE) {
244 return false;
245 }
246
247 // 5. Obtain the minimum rating value by calculating the length sum of the
248 // encoded Strings and sending it down.
249 final int sumLength = Math.abs(name1.length() + name2.length());
250 int minRating = 0;
251 minRating = getMinRating(sumLength);
252
253 // 6. Process the encoded Strings from left to right and remove any
254 // identical characters found from both Strings respectively.
255 final int count = leftToRightThenRightToLeftProcessing(name1, name2);
256
257 // 7. Each PNI item that has a similarity rating equal to or greater than
258 // the min is considered to be a good candidate match
259 return count >= minRating;
260
261 }
262
263 /**
264 * Determines if a letter is a vowel.
265 *
266 * <h2>API Usage</h2>
267 * <p>
268 * Consider this method private, it is package protected for unit testing only.
269 * </p>
270 *
271 * @param letter
272 * The letter under investiagtion
273 * @return True if a vowel, else false
274 */
275 boolean isVowel(final String letter) {
276 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
277 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
278 }
279
280 /**
281 * Processes the names from left to right (first) then right to left removing identical letters in same positions.
282 * Then subtracts the longer string that remains from 6 and returns this.
283 *
284 * <h2>API Usage</h2>
285 * <p>
286 * Consider this method private, it is package protected for unit testing only.
287 * </p>
288 *
289 * @param name1
290 * name2
291 * @return the length as above
292 */
293 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
294 final char[] name1Char = name1.toCharArray();
295 final char[] name2Char = name2.toCharArray();
296
297 final int name1Size = name1.length() - 1;
298 final int name2Size = name2.length() - 1;
299
300 String name1LtRStart = EMPTY;
301 String name1LtREnd = EMPTY;
302
303 String name2RtLStart = EMPTY;
304 String name2RtLEnd = EMPTY;
305
306 for (int i = 0; i < name1Char.length; i++) {
307 if (i > name2Size) {
308 break;
309 }
310
311 name1LtRStart = name1.substring(i, i + 1);
312 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
313
314 name2RtLStart = name2.substring(i, i + 1);
315 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
316
317 // Left to right...
318 if (name1LtRStart.equals(name2RtLStart)) {
319 name1Char[i] = ' ';
320 name2Char[i] = ' ';
321 }
322
323 // Right to left...
324 if (name1LtREnd.equals(name2RtLEnd)) {
325 name1Char[name1Size - i] = ' ';
326 name2Char[name2Size - i] = ' ';
327 }
328 }
329
330 // Char arrays -> string & remove extraneous space
331 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
332 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
333
334 // Final bit - subtract longest string from 6 and return this int value
335 if (strA.length() > strB.length()) {
336 return Math.abs(SIX - strA.length());
337 }
338 return Math.abs(SIX - strB.length());
339 }
340
341 /**
342 * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
343 * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
344 *
345 * @param accentedWord
346 * The word that may have accents in it.
347 * @return De-accented word
348 */
349 String removeAccents(final String accentedWord) {
350 if (accentedWord == null) {
351 return null;
352 }
353
354 final StringBuilder sb = new StringBuilder();
355 final int n = accentedWord.length();
356
357 for (int i = 0; i < n; i++) {
358 final char c = accentedWord.charAt(i);
359 final int pos = UNICODE.indexOf(c);
360 if (pos > -1) {
361 sb.append(PLAIN_ASCII.charAt(pos));
362 } else {
363 sb.append(c);
364 }
365 }
366
367 return sb.toString();
368 }
369
370 /**
371 * Replaces any double consonant pair with the single letter equivalent.
372 *
373 * <h2>API Usage</h2>
374 * <p>
375 * Consider this method private, it is package protected for unit testing only.
376 * </p>
377 *
378 * @param name
379 * String to have double consonants removed
380 * @return Single consonant word
381 */
382 String removeDoubleConsonants(final String name) {
383 String replacedName = name.toUpperCase(Locale.ENGLISH);
384 for (final String dc : DOUBLE_CONSONANT) {
385 if (replacedName.contains(dc)) {
386 final String singleLetter = dc.substring(0, 1);
387 replacedName = replacedName.replace(dc, singleLetter);
388 }
389 }
390 return replacedName;
391 }
392
393 /**
394 * Deletes all vowels unless the vowel begins the word.
395 *
396 * <h2>API Usage</h2>
397 * <p>
398 * Consider this method private, it is package protected for unit testing only.
399 * </p>
400 *
401 * @param name
402 * The name to have vowels removed
403 * @return De-voweled word
404 */
405 String removeVowels(String name) {
406 // Extract first letter
407 final String firstLetter = name.substring(0, 1);
408
409 name = name.replaceAll("A", EMPTY);
410 name = name.replaceAll("E", EMPTY);
411 name = name.replaceAll("I", EMPTY);
412 name = name.replaceAll("O", EMPTY);
413 name = name.replaceAll("U", EMPTY);
414
415 name = name.replaceAll("\\s{2,}\\b", SPACE);
416
417 // return isVowel(firstLetter) ? (firstLetter + name) : name;
418 if (isVowel(firstLetter)) {
419 return firstLetter + name;
420 }
421 return name;
422 }
423 }