1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.codec.language;
18
19 import java.util.Locale;
20
21 import org.apache.commons.codec.EncoderException;
22 import org.apache.commons.codec.StringEncoder;
23
24 /**
25 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
26 * <p>
27 * This class is immutable and thread-safe.
28 * </p>
29 *
30 * @see <a href="https://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
31 * @since 1.8
32 */
33 public class MatchRatingApproachEncoder implements StringEncoder {
34
35 private static final String SPACE = " ";
36
37 private static final String EMPTY = "";
38
39 /**
40 * The plain letter equivalent of the accented letters.
41 */
42 private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
43 "AaEeIiOoUuYy" + // acute
44 "AaEeIiOoUuYy" + // circumflex
45 "AaOoNn" + // tilde
46 "AaEeIiOoUuYy" + // umlaut
47 "Aa" + // ring
48 "Cc" + // cedilla
49 "OoUu"; // double acute
50
51 /**
52 * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
53 */
54 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
55 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
56 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
57 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF\u00C5\u00E5\u00C7\u00E7\u0150\u0151\u0170\u0171";
58
59 /**
60 * Double consonants.
61 */
62 private static final String[] DOUBLE_CONSONANT =
63 { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
64 "TT", "VV", "WW", "XX", "YY", "ZZ" };
65
66 /**
67 * Constructs a new instance.
68 */
69 public MatchRatingApproachEncoder() {
70 // empty
71 }
72
73 /**
74 * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
75 * spaces.
76 *
77 * <h2>API Usage</h2>
78 * <p>
79 * Consider this method private, it is package protected for unit testing only.
80 * </p>
81 *
82 * @param name
83 * The name to be cleaned.
84 * @return The cleaned name.
85 */
86 String cleanName(final String name) {
87 String upperName = name.toUpperCase(Locale.ENGLISH);
88
89 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
90 for (final String str : charsToTrim) {
91 upperName = upperName.replaceAll(str, EMPTY);
92 }
93
94 upperName = removeAccents(upperName);
95 return upperName.replaceAll("\\s+", EMPTY);
96 }
97
98 /**
99 * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
100 * Encoder interface Throws an EncoderException if input object is not of type {@link String}.
101 *
102 * @param object
103 * Object to encode.
104 * @return An object (or type {@link String}) containing the Match Rating Approach code which corresponds to the
105 * String supplied.
106 * @throws EncoderException
107 * if the parameter supplied is not of type {@link String}.
108 */
109 @Override
110 public final Object encode(final Object object) throws EncoderException {
111 if (!(object instanceof String)) {
112 throw new EncoderException(
113 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
114 }
115 return encode((String) object);
116 }
117
118 /**
119 * Encodes a String using the Match Rating Approach (MRA) algorithm.
120 *
121 * @param name
122 * String object to encode.
123 * @return The MRA code corresponding to the String supplied.
124 */
125 @Override
126 public final String encode(String name) {
127 // Bulletproof for trivial input - NINO
128 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
129 return EMPTY;
130 }
131
132 // Preprocessing
133 name = cleanName(name);
134
135 // Bulletproof if name becomes empty after cleanName(name)
136 if (SPACE.equals(name) || name.isEmpty()) {
137 return EMPTY;
138 }
139
140 // BEGIN: Actual encoding part of the algorithm...
141 // 1. Delete all vowels unless the vowel begins the word
142 name = removeVowels(name);
143
144 // Bulletproof if name becomes empty after removeVowels(name)
145 if (SPACE.equals(name) || name.isEmpty()) {
146 return EMPTY;
147 }
148
149 // 2. Remove second consonant from any double consonant
150 name = removeDoubleConsonants(name);
151
152 return getFirst3Last3(name);
153 }
154
155 /**
156 * Gets the first and last 3 letters of a name (if > 6 characters) Else just returns the name.
157 *
158 * <h2>API Usage</h2>
159 * <p>
160 * Consider this method private, it is package protected for unit testing only.
161 * </p>
162 *
163 * @param name
164 * The string to get the substrings from.
165 * @return Annexed first and last 3 letters of input word.
166 */
167 String getFirst3Last3(final String name) {
168 final int nameLength = name.length();
169
170 if (nameLength > 6) {
171 final String firstThree = name.substring(0, 3);
172 final String lastThree = name.substring(nameLength - 3, nameLength);
173 return firstThree + lastThree;
174 }
175 return name;
176 }
177
178 /**
179 * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
180 * min rating. Values strictly from documentation.
181 *
182 * <h2>API Usage</h2>
183 * <p>
184 * Consider this method private, it is package protected for unit testing only.
185 * </p>
186 *
187 * @param sumLength
188 * The length of 2 strings sent down.
189 * @return The min rating value.
190 */
191 int getMinRating(final int sumLength) {
192 int minRating = 0;
193
194 if (sumLength <= 4) {
195 minRating = 5;
196 } else if (sumLength <= 7) { // already know it is at least 5
197 minRating = 4;
198 } else if (sumLength <= 11) { // already know it is at least 8
199 minRating = 3;
200 } else if (sumLength == 12) {
201 minRating = 2;
202 } else {
203 minRating = 1; // docs said little here.
204 }
205
206 return minRating;
207 }
208
209 /**
210 * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
211 * strings are cleaned in the same way as {@link #encode(String)}.
212 *
213 * @param name1
214 * First of the 2 strings (names) to compare.
215 * @param name2
216 * Second of the 2 names to compare.
217 * @return {@code true} if the encodings are identical {@code false} otherwise.
218 */
219 public boolean isEncodeEquals(String name1, String name2) {
220 // Bulletproof for trivial input - NINO
221 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
222 return false;
223 }
224 if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
225 return false;
226 }
227 if (name1.length() == 1 || name2.length() == 1) {
228 return false;
229 }
230 if (name1.equalsIgnoreCase(name2)) {
231 return true;
232 }
233
234 // Preprocessing
235 name1 = cleanName(name1);
236 name2 = cleanName(name2);
237
238 // Actual MRA Algorithm
239
240 // 1. Remove vowels
241 name1 = removeVowels(name1);
242 name2 = removeVowels(name2);
243
244 // 2. Remove double consonants
245 name1 = removeDoubleConsonants(name1);
246 name2 = removeDoubleConsonants(name2);
247
248 // 3. Reduce down to 3 letters
249 name1 = getFirst3Last3(name1);
250 name2 = getFirst3Last3(name2);
251
252 // 4. Check for length difference - if 3 or greater, then no similarity
253 // comparison is done
254 if (Math.abs(name1.length() - name2.length()) >= 3) {
255 return false;
256 }
257
258 // 5. Obtain the minimum rating value by calculating the length sum of the
259 // encoded Strings and sending it down.
260 final int sumLength = Math.abs(name1.length() + name2.length());
261 final int minRating = getMinRating(sumLength);
262
263 // 6. Process the encoded Strings from left to right and remove any
264 // identical characters found from both Strings respectively.
265 final int count = leftToRightThenRightToLeftProcessing(name1, name2);
266
267 // 7. Each PNI item that has a similarity rating equal to or greater than
268 // the min is considered to be a good candidate match
269 return count >= minRating;
270
271 }
272
273 /**
274 * Determines if a letter is a vowel.
275 *
276 * <h2>API Usage</h2>
277 * <p>
278 * Consider this method private, it is package protected for unit testing only.
279 * </p>
280 *
281 * @param letter
282 * The letter under investigation
283 * @return True if a vowel, else false
284 */
285 boolean isVowel(final String letter) {
286 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
287 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
288 }
289
290 /**
291 * Processes the names from left to right (first) then right to left removing identical letters in same positions.
292 * Then subtracts the longer string that remains from 6 and returns this.
293 *
294 * <h2>API Usage</h2>
295 * <p>
296 * Consider this method private, it is package protected for unit testing only.
297 * </p>
298 *
299 * @param name1 first name.
300 * @param name1 second name.
301 * @return the length as above.
302 */
303 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
304 final char[] name1Char = name1.toCharArray();
305 final char[] name2Char = name2.toCharArray();
306
307 final int name1Size = name1.length() - 1;
308 final int name2Size = name2.length() - 1;
309
310 String name1LtRStart = EMPTY;
311 String name1LtREnd = EMPTY;
312
313 String name2RtLStart = EMPTY;
314 String name2RtLEnd = EMPTY;
315
316 for (int i = 0; i < name1Char.length; i++) {
317 if (i > name2Size) {
318 break;
319 }
320
321 name1LtRStart = name1.substring(i, i + 1);
322 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
323
324 name2RtLStart = name2.substring(i, i + 1);
325 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
326
327 // Left to right...
328 if (name1LtRStart.equals(name2RtLStart)) {
329 name1Char[i] = ' ';
330 name2Char[i] = ' ';
331 }
332
333 // Right to left...
334 if (name1LtREnd.equals(name2RtLEnd)) {
335 name1Char[name1Size - i] = ' ';
336 name2Char[name2Size - i] = ' ';
337 }
338 }
339
340 // Char arrays -> string & remove extraneous space
341 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
342 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
343
344 // Final bit - subtract the longest string from 6 and return this int value
345 if (strA.length() > strB.length()) {
346 return Math.abs(6 - strA.length());
347 }
348 return Math.abs(6 - strB.length());
349 }
350
351 /**
352 * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved.
353 * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
354 *
355 * @param accentedWord
356 * The word that may have accents in it.
357 * @return De-accented word.
358 */
359 String removeAccents(final String accentedWord) {
360 if (accentedWord == null) {
361 return null;
362 }
363
364 final StringBuilder sb = new StringBuilder();
365 final int n = accentedWord.length();
366
367 for (int i = 0; i < n; i++) {
368 final char c = accentedWord.charAt(i);
369 final int pos = UNICODE.indexOf(c);
370 if (pos > -1) {
371 sb.append(PLAIN_ASCII.charAt(pos));
372 } else {
373 sb.append(c);
374 }
375 }
376
377 return sb.toString();
378 }
379
380 /**
381 * Replaces any double consonant pair with the single letter equivalent.
382 *
383 * <h2>API Usage</h2>
384 * <p>
385 * Consider this method private, it is package protected for unit testing only.
386 * </p>
387 *
388 * @param name
389 * String to have double consonants removed.
390 * @return Single consonant word.
391 */
392 String removeDoubleConsonants(final String name) {
393 String replacedName = name.toUpperCase(Locale.ENGLISH);
394 for (final String dc : DOUBLE_CONSONANT) {
395 if (replacedName.contains(dc)) {
396 final String singleLetter = dc.substring(0, 1);
397 replacedName = replacedName.replace(dc, singleLetter);
398 }
399 }
400 return replacedName;
401 }
402
403 /**
404 * Deletes all vowels unless the vowel begins the word.
405 *
406 * <h2>API Usage</h2>
407 * <p>
408 * Consider this method private, it is package protected for unit testing only.
409 * </p>
410 *
411 * @param name
412 * The name to have vowels removed.
413 * @return De-voweled word.
414 */
415 String removeVowels(String name) {
416 // Extract first letter
417 final String firstLetter = name.substring(0, 1);
418
419 name = name.replace("A", EMPTY);
420 name = name.replace("E", EMPTY);
421 name = name.replace("I", EMPTY);
422 name = name.replace("O", EMPTY);
423 name = name.replace("U", EMPTY);
424
425 name = name.replaceAll("\\s{2,}\\b", SPACE);
426
427 // return isVowel(firstLetter) ? (firstLetter + name) : name;
428 if (isVowel(firstLetter)) {
429 return firstLetter + name;
430 }
431 return name;
432 }
433 }