View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import java.util.Locale;
20  
21  import org.apache.commons.codec.EncoderException;
22  import org.apache.commons.codec.StringEncoder;
23  
24  /**
25   * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
26   *
27   * This class is immutable and thread-safe.
28   *
29   * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
30   * @since 1.8
31   */
32  public class MatchRatingApproachEncoder implements StringEncoder {
33  
34      private static final String SPACE = " ";
35  
36      private static final String EMPTY = "";
37  
38      /**
39       * Constants used mainly for the min rating value.
40       */
41      private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
42                               ELEVEN = 11, TWELVE = 12;
43  
44      /**
45       * The plain letter equivalent of the accented letters.
46       */
47      private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
48              "AaEeIiOoUuYy" + // acute
49              "AaEeIiOoUuYy" + // circumflex
50              "AaOoNn" + // tilde
51              "AaEeIiOoUuYy" + // umlaut
52              "Aa" + // ring
53              "Cc" + // cedilla
54              "OoUu"; // double acute
55  
56      /**
57       * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
58       */
59      private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
60              "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
61              "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
62              "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
63              "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
64              "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
65  
66      private static final String[] DOUBLE_CONSONANT =
67              new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
68                             "TT", "VV", "WW", "XX", "YY", "ZZ" };
69  
70      /**
71       * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
72       * spaces.
73       *
74       * <h2>API Usage</h2>
75       * <p>
76       * Consider this method private, it is package protected for unit testing only.
77       * </p>
78       *
79       * @param name
80       *            The name to be cleaned
81       * @return The cleaned name
82       */
83      String cleanName(final String name) {
84          String upperName = name.toUpperCase(Locale.ENGLISH);
85  
86          final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
87          for (final String str : charsToTrim) {
88              upperName = upperName.replaceAll(str, EMPTY);
89          }
90  
91          upperName = removeAccents(upperName);
92          upperName = upperName.replaceAll("\\s+", EMPTY);
93  
94          return upperName;
95      }
96  
97      /**
98       * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
99       * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
100      *
101      * @param pObject
102      *            Object to encode
103      * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
104      *         String supplied.
105      * @throws EncoderException
106      *             if the parameter supplied is not of type java.lang.String
107      */
108     @Override
109     public final Object encode(final Object pObject) throws EncoderException {
110         if (!(pObject instanceof String)) {
111             throw new EncoderException(
112                     "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
113         }
114         return encode((String) pObject);
115     }
116 
117     /**
118      * Encodes a String using the Match Rating Approach (MRA) algorithm.
119      *
120      * @param name
121      *            String object to encode
122      * @return The MRA code corresponding to the String supplied
123      */
124     @Override
125     public final String encode(String name) {
126         // Bulletproof for trivial input - NINO
127         if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
128             return EMPTY;
129         }
130 
131         // Preprocessing
132         name = cleanName(name);
133 
134         // BEGIN: Actual encoding part of the algorithm...
135         // 1. Delete all vowels unless the vowel begins the word
136         name = removeVowels(name);
137 
138         // 2. Remove second consonant from any double consonant
139         name = removeDoubleConsonants(name);
140 
141         // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
142         name = getFirst3Last3(name);
143 
144         return name;
145     }
146 
147     /**
148      * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns the name.
149      *
150      * <h2>API Usage</h2>
151      * <p>
152      * Consider this method private, it is package protected for unit testing only.
153      * </p>
154      *
155      * @param name
156      *            The string to get the substrings from
157      * @return Annexed first & last 3 letters of input word.
158      */
159     String getFirst3Last3(final String name) {
160         final int nameLength = name.length();
161 
162         if (nameLength > SIX) {
163             final String firstThree = name.substring(0, THREE);
164             final String lastThree = name.substring(nameLength - THREE, nameLength);
165             return firstThree + lastThree;
166         } else {
167             return name;
168         }
169     }
170 
171     /**
172      * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
173      * min rating. Values strictly from documentation.
174      *
175      * <h2>API Usage</h2>
176      * <p>
177      * Consider this method private, it is package protected for unit testing only.
178      * </p>
179      *
180      * @param sumLength
181      *            The length of 2 strings sent down
182      * @return The min rating value
183      */
184     int getMinRating(final int sumLength) {
185         int minRating = 0;
186 
187         if (sumLength <= FOUR) {
188             minRating = FIVE;
189         } else if (sumLength >= FIVE && sumLength <= SEVEN) {
190             minRating = FOUR;
191         } else if (sumLength >= EIGHT && sumLength <= ELEVEN) {
192             minRating = THREE;
193         } else if (sumLength == TWELVE) {
194             minRating = TWO;
195         } else {
196             minRating = ONE; // docs said little here.
197         }
198 
199         return minRating;
200     }
201 
202     /**
203      * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
204      * strings are cleaned in the same way as {@link #encode(String)}.
205      *
206      * @param name1
207      *            First of the 2 strings (names) to compare
208      * @param name2
209      *            Second of the 2 names to compare
210      * @return <code>true</code> if the encodings are identical <code>false</code> otherwise.
211      */
212     public boolean isEncodeEquals(String name1, String name2) {
213         // Bulletproof for trivial input - NINO
214         if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
215             return false;
216         } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
217             return false;
218         } else if (name1.length() == 1 || name2.length() == 1) {
219             return false;
220         } else if (name1.equalsIgnoreCase(name2)) {
221             return true;
222         }
223 
224         // Preprocessing
225         name1 = cleanName(name1);
226         name2 = cleanName(name2);
227 
228         // Actual MRA Algorithm
229 
230         // 1. Remove vowels
231         name1 = removeVowels(name1);
232         name2 = removeVowels(name2);
233 
234         // 2. Remove double consonants
235         name1 = removeDoubleConsonants(name1);
236         name2 = removeDoubleConsonants(name2);
237 
238         // 3. Reduce down to 3 letters
239         name1 = getFirst3Last3(name1);
240         name2 = getFirst3Last3(name2);
241 
242         // 4. Check for length difference - if 3 or greater then no similarity
243         // comparison is done
244         if (Math.abs(name1.length() - name2.length()) >= THREE) {
245             return false;
246         }
247 
248         // 5. Obtain the minimum rating value by calculating the length sum of the
249         // encoded Strings and sending it down.
250         final int sumLength = Math.abs(name1.length() + name2.length());
251         int minRating = 0;
252         minRating = getMinRating(sumLength);
253 
254         // 6. Process the encoded Strings from left to right and remove any
255         // identical characters found from both Strings respectively.
256         final int count = leftToRightThenRightToLeftProcessing(name1, name2);
257 
258         // 7. Each PNI item that has a similarity rating equal to or greater than
259         // the min is considered to be a good candidate match
260         return count >= minRating;
261 
262     }
263 
264     /**
265      * Determines if a letter is a vowel.
266      *
267      * <h2>API Usage</h2>
268      * <p>
269      * Consider this method private, it is package protected for unit testing only.
270      * </p>
271      *
272      * @param letter
273      *            The letter under investiagtion
274      * @return True if a vowel, else false
275      */
276     boolean isVowel(final String letter) {
277         return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
278                letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
279     }
280 
281     /**
282      * Processes the names from left to right (first) then right to left removing identical letters in same positions.
283      * Then subtracts the longer string that remains from 6 and returns this.
284      *
285      * <h2>API Usage</h2>
286      * <p>
287      * Consider this method private, it is package protected for unit testing only.
288      * </p>
289      *
290      * @param name1
291      *            name2
292      * @return
293      */
294     int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
295         final char[] name1Char = name1.toCharArray();
296         final char[] name2Char = name2.toCharArray();
297 
298         final int name1Size = name1.length() - 1;
299         final int name2Size = name2.length() - 1;
300 
301         String name1LtRStart = EMPTY;
302         String name1LtREnd = EMPTY;
303 
304         String name2RtLStart = EMPTY;
305         String name2RtLEnd = EMPTY;
306 
307         for (int i = 0; i < name1Char.length; i++) {
308             if (i > name2Size) {
309                 break;
310             }
311 
312             name1LtRStart = name1.substring(i, i + 1);
313             name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
314 
315             name2RtLStart = name2.substring(i, i + 1);
316             name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
317 
318             // Left to right...
319             if (name1LtRStart.equals(name2RtLStart)) {
320                 name1Char[i] = ' ';
321                 name2Char[i] = ' ';
322             }
323 
324             // Right to left...
325             if (name1LtREnd.equals(name2RtLEnd)) {
326                 name1Char[name1Size - i] = ' ';
327                 name2Char[name2Size - i] = ' ';
328             }
329         }
330 
331         // Char arrays -> string & remove extraneous space
332         final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
333         final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
334 
335         // Final bit - subtract longest string from 6 and return this int value
336         if (strA.length() > strB.length()) {
337             return Math.abs(SIX - strA.length());
338         } else {
339             return Math.abs(SIX - strB.length());
340         }
341     }
342 
343     /**
344      * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
345      * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
346      *
347      * @param accentedWord
348      *            The word that may have accents in it.
349      * @return De-accented word
350      */
351     String removeAccents(final String accentedWord) {
352         if (accentedWord == null) {
353             return null;
354         }
355 
356         final StringBuilder sb = new StringBuilder();
357         final int n = accentedWord.length();
358 
359         for (int i = 0; i < n; i++) {
360             final char c = accentedWord.charAt(i);
361             final int pos = UNICODE.indexOf(c);
362             if (pos > -1) {
363                 sb.append(PLAIN_ASCII.charAt(pos));
364             } else {
365                 sb.append(c);
366             }
367         }
368 
369         return sb.toString();
370     }
371 
372     /**
373      * Replaces any double consonant pair with the single letter equivalent.
374      *
375      * <h2>API Usage</h2>
376      * <p>
377      * Consider this method private, it is package protected for unit testing only.
378      * </p>
379      *
380      * @param name
381      *            String to have double consonants removed
382      * @return Single consonant word
383      */
384     String removeDoubleConsonants(final String name) {
385         String replacedName = name.toUpperCase();
386         for (final String dc : DOUBLE_CONSONANT) {
387             if (replacedName.contains(dc)) {
388                 final String singleLetter = dc.substring(0, 1);
389                 replacedName = replacedName.replace(dc, singleLetter);
390             }
391         }
392         return replacedName;
393     }
394 
395     /**
396      * Deletes all vowels unless the vowel begins the word.
397      *
398      * <h2>API Usage</h2>
399      * <p>
400      * Consider this method private, it is package protected for unit testing only.
401      * </p>
402      *
403      * @param name
404      *            The name to have vowels removed
405      * @return De-voweled word
406      */
407     String removeVowels(String name) {
408         // Extract first letter
409         final String firstLetter = name.substring(0, 1);
410 
411         name = name.replaceAll("A", EMPTY);
412         name = name.replaceAll("E", EMPTY);
413         name = name.replaceAll("I", EMPTY);
414         name = name.replaceAll("O", EMPTY);
415         name = name.replaceAll("U", EMPTY);
416 
417         name = name.replaceAll("\\s{2,}\\b", SPACE);
418 
419         // return isVowel(firstLetter) ? (firstLetter + name) : name;
420         if (isVowel(firstLetter)) {
421             return firstLetter + name;
422         } else {
423             return name;
424         }
425     }
426 }