View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import java.util.Locale;
20  
21  import org.apache.commons.codec.EncoderException;
22  import org.apache.commons.codec.StringEncoder;
23  
24  /**
25   * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
26   * <p>
27   * This class is immutable and thread-safe.
28   * </p>
29   *
30   * @see <a href="https://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
31   * @since 1.8
32   */
33  public class MatchRatingApproachEncoder implements StringEncoder {
34  
35      private static final String SPACE = " ";
36  
37      private static final String EMPTY = "";
38  
39      /**
40       * The plain letter equivalent of the accented letters.
41       */
42      private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
43              "AaEeIiOoUuYy" + // acute
44              "AaEeIiOoUuYy" + // circumflex
45              "AaOoNn" + // tilde
46              "AaEeIiOoUuYy" + // umlaut
47              "Aa" + // ring
48              "Cc" + // cedilla
49              "OoUu"; // double acute
50  
51      /**
52       * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
53       */
54      private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
55              "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
56              "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
57              "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF\u00C5\u00E5\u00C7\u00E7\u0150\u0151\u0170\u0171";
58  
59      /**
60       * Double consonants.
61       */
62      private static final String[] DOUBLE_CONSONANT =
63              { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
64                     "TT", "VV", "WW", "XX", "YY", "ZZ" };
65  
66      /**
67       * Constructs a new instance.
68       */
69      public MatchRatingApproachEncoder() {
70          // empty
71      }
72  
73      /**
74       * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
75       * spaces.
76       *
77       * <h2>API Usage</h2>
78       * <p>
79       * Consider this method private, it is package protected for unit testing only.
80       * </p>
81       *
82       * @param name
83       *            The name to be cleaned.
84       * @return The cleaned name.
85       */
86      String cleanName(final String name) {
87          String upperName = name.toUpperCase(Locale.ENGLISH);
88  
89          final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
90          for (final String str : charsToTrim) {
91              upperName = upperName.replaceAll(str, EMPTY);
92          }
93  
94          upperName = removeAccents(upperName);
95          return upperName.replaceAll("\\s+", EMPTY);
96      }
97  
98      /**
99       * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
100      * Encoder interface Throws an EncoderException if input object is not of type {@link String}.
101      *
102      * @param object
103      *            Object to encode.
104      * @return An object (or type {@link String}) containing the Match Rating Approach code which corresponds to the
105      *         String supplied.
106      * @throws EncoderException
107      *             if the parameter supplied is not of type {@link String}.
108      */
109     @Override
110     public final Object encode(final Object object) throws EncoderException {
111         if (!(object instanceof String)) {
112             throw new EncoderException(
113                     "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
114         }
115         return encode((String) object);
116     }
117 
118     /**
119      * Encodes a String using the Match Rating Approach (MRA) algorithm.
120      *
121      * @param name
122      *            String object to encode.
123      * @return The MRA code corresponding to the String supplied.
124      */
125     @Override
126     public final String encode(String name) {
127         // Bulletproof for trivial input - NINO
128         if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
129             return EMPTY;
130         }
131 
132         // Preprocessing
133         name = cleanName(name);
134 
135         // Bulletproof if name becomes empty after cleanName(name)
136         if (SPACE.equals(name) || name.isEmpty()) {
137             return EMPTY;
138         }
139 
140         // BEGIN: Actual encoding part of the algorithm...
141         // 1. Delete all vowels unless the vowel begins the word
142         name = removeVowels(name);
143 
144         // Bulletproof if name becomes empty after removeVowels(name)
145         if (SPACE.equals(name) || name.isEmpty()) {
146             return EMPTY;
147         }
148 
149         // 2. Remove second consonant from any double consonant
150         name = removeDoubleConsonants(name);
151 
152         return getFirst3Last3(name);
153     }
154 
155     /**
156      * Gets the first and last 3 letters of a name (if &gt; 6 characters) Else just returns the name.
157      *
158      * <h2>API Usage</h2>
159      * <p>
160      * Consider this method private, it is package protected for unit testing only.
161      * </p>
162      *
163      * @param name
164      *            The string to get the substrings from.
165      * @return Annexed first and last 3 letters of input word.
166      */
167     String getFirst3Last3(final String name) {
168         final int nameLength = name.length();
169 
170         if (nameLength > 6) {
171             final String firstThree = name.substring(0, 3);
172             final String lastThree = name.substring(nameLength - 3, nameLength);
173             return firstThree + lastThree;
174         }
175         return name;
176     }
177 
178     /**
179      * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
180      * min rating. Values strictly from documentation.
181      *
182      * <h2>API Usage</h2>
183      * <p>
184      * Consider this method private, it is package protected for unit testing only.
185      * </p>
186      *
187      * @param sumLength
188      *            The length of 2 strings sent down.
189      * @return The min rating value.
190      */
191     int getMinRating(final int sumLength) {
192         int minRating = 0;
193 
194         if (sumLength <= 4) {
195             minRating = 5;
196         } else if (sumLength <= 7) { // already know it is at least 5
197             minRating = 4;
198         } else if (sumLength <= 11) { // already know it is at least 8
199             minRating = 3;
200         } else if (sumLength == 12) {
201             minRating = 2;
202         } else {
203             minRating = 1; // docs said little here.
204         }
205 
206         return minRating;
207     }
208 
209     /**
210      * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
211      * strings are cleaned in the same way as {@link #encode(String)}.
212      *
213      * @param name1
214      *            First of the 2 strings (names) to compare.
215      * @param name2
216      *            Second of the 2 names to compare.
217      * @return {@code true} if the encodings are identical {@code false} otherwise.
218      */
219     public boolean isEncodeEquals(String name1, String name2) {
220         // Bulletproof for trivial input - NINO
221         if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
222             return false;
223         }
224         if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
225             return false;
226         }
227         if (name1.length() == 1 || name2.length() == 1) {
228             return false;
229         }
230         if (name1.equalsIgnoreCase(name2)) {
231             return true;
232         }
233 
234         // Preprocessing
235         name1 = cleanName(name1);
236         name2 = cleanName(name2);
237 
238         // Actual MRA Algorithm
239 
240         // 1. Remove vowels
241         name1 = removeVowels(name1);
242         name2 = removeVowels(name2);
243 
244         // 2. Remove double consonants
245         name1 = removeDoubleConsonants(name1);
246         name2 = removeDoubleConsonants(name2);
247 
248         // 3. Reduce down to 3 letters
249         name1 = getFirst3Last3(name1);
250         name2 = getFirst3Last3(name2);
251 
252         // 4. Check for length difference - if 3 or greater, then no similarity
253         // comparison is done
254         if (Math.abs(name1.length() - name2.length()) >= 3) {
255             return false;
256         }
257 
258         // 5. Obtain the minimum rating value by calculating the length sum of the
259         // encoded Strings and sending it down.
260         final int sumLength = Math.abs(name1.length() + name2.length());
261         final int minRating = getMinRating(sumLength);
262 
263         // 6. Process the encoded Strings from left to right and remove any
264         // identical characters found from both Strings respectively.
265         final int count = leftToRightThenRightToLeftProcessing(name1, name2);
266 
267         // 7. Each PNI item that has a similarity rating equal to or greater than
268         // the min is considered to be a good candidate match
269         return count >= minRating;
270 
271     }
272 
273     /**
274      * Determines if a letter is a vowel.
275      *
276      * <h2>API Usage</h2>
277      * <p>
278      * Consider this method private, it is package protected for unit testing only.
279      * </p>
280      *
281      * @param letter
282      *            The letter under investigation
283      * @return True if a vowel, else false
284      */
285     boolean isVowel(final String letter) {
286         return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
287                letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
288     }
289 
290     /**
291      * Processes the names from left to right (first) then right to left removing identical letters in same positions.
292      * Then subtracts the longer string that remains from 6 and returns this.
293      *
294      * <h2>API Usage</h2>
295      * <p>
296      * Consider this method private, it is package protected for unit testing only.
297      * </p>
298      *
299      * @param name1 first name.
300      * @param name1 second name.
301      * @return the length as above.
302      */
303     int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
304         final char[] name1Char = name1.toCharArray();
305         final char[] name2Char = name2.toCharArray();
306 
307         final int name1Size = name1.length() - 1;
308         final int name2Size = name2.length() - 1;
309 
310         String name1LtRStart = EMPTY;
311         String name1LtREnd = EMPTY;
312 
313         String name2RtLStart = EMPTY;
314         String name2RtLEnd = EMPTY;
315 
316         for (int i = 0; i < name1Char.length; i++) {
317             if (i > name2Size) {
318                 break;
319             }
320 
321             name1LtRStart = name1.substring(i, i + 1);
322             name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
323 
324             name2RtLStart = name2.substring(i, i + 1);
325             name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
326 
327             // Left to right...
328             if (name1LtRStart.equals(name2RtLStart)) {
329                 name1Char[i] = ' ';
330                 name2Char[i] = ' ';
331             }
332 
333             // Right to left...
334             if (name1LtREnd.equals(name2RtLEnd)) {
335                 name1Char[name1Size - i] = ' ';
336                 name2Char[name2Size - i] = ' ';
337             }
338         }
339 
340         // Char arrays -> string & remove extraneous space
341         final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
342         final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
343 
344         // Final bit - subtract the longest string from 6 and return this int value
345         if (strA.length() > strB.length()) {
346             return Math.abs(6 - strA.length());
347         }
348         return Math.abs(6 - strB.length());
349     }
350 
351     /**
352      * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved.
353      * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
354      *
355      * @param accentedWord
356      *            The word that may have accents in it.
357      * @return De-accented word.
358      */
359     String removeAccents(final String accentedWord) {
360         if (accentedWord == null) {
361             return null;
362         }
363 
364         final StringBuilder sb = new StringBuilder();
365         final int n = accentedWord.length();
366 
367         for (int i = 0; i < n; i++) {
368             final char c = accentedWord.charAt(i);
369             final int pos = UNICODE.indexOf(c);
370             if (pos > -1) {
371                 sb.append(PLAIN_ASCII.charAt(pos));
372             } else {
373                 sb.append(c);
374             }
375         }
376 
377         return sb.toString();
378     }
379 
380     /**
381      * Replaces any double consonant pair with the single letter equivalent.
382      *
383      * <h2>API Usage</h2>
384      * <p>
385      * Consider this method private, it is package protected for unit testing only.
386      * </p>
387      *
388      * @param name
389      *            String to have double consonants removed.
390      * @return Single consonant word.
391      */
392     String removeDoubleConsonants(final String name) {
393         String replacedName = name.toUpperCase(Locale.ENGLISH);
394         for (final String dc : DOUBLE_CONSONANT) {
395             if (replacedName.contains(dc)) {
396                 final String singleLetter = dc.substring(0, 1);
397                 replacedName = replacedName.replace(dc, singleLetter);
398             }
399         }
400         return replacedName;
401     }
402 
403     /**
404      * Deletes all vowels unless the vowel begins the word.
405      *
406      * <h2>API Usage</h2>
407      * <p>
408      * Consider this method private, it is package protected for unit testing only.
409      * </p>
410      *
411      * @param name
412      *            The name to have vowels removed.
413      * @return De-voweled word.
414      */
415     String removeVowels(String name) {
416         // Extract first letter
417         final String firstLetter = name.substring(0, 1);
418 
419         name = name.replace("A", EMPTY);
420         name = name.replace("E", EMPTY);
421         name = name.replace("I", EMPTY);
422         name = name.replace("O", EMPTY);
423         name = name.replace("U", EMPTY);
424 
425         name = name.replaceAll("\\s{2,}\\b", SPACE);
426 
427         // return isVowel(firstLetter) ? (firstLetter + name) : name;
428         if (isVowel(firstLetter)) {
429             return firstLetter + name;
430         }
431         return name;
432     }
433 }