Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.Locale;
020
021import org.apache.commons.codec.EncoderException;
022import org.apache.commons.codec.StringEncoder;
023
024/**
025 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
026 * <p>
027 * This class is immutable and thread-safe.
028 * </p>
029 *
030 * @see <a href="https://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
031 * @since 1.8
032 */
033public class MatchRatingApproachEncoder implements StringEncoder {
034
035    private static final String SPACE = " ";
036
037    private static final String EMPTY = "";
038
039    /**
040     * The plain letter equivalent of the accented letters.
041     */
042    private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
043            "AaEeIiOoUuYy" + // acute
044            "AaEeIiOoUuYy" + // circumflex
045            "AaOoNn" + // tilde
046            "AaEeIiOoUuYy" + // umlaut
047            "Aa" + // ring
048            "Cc" + // cedilla
049            "OoUu"; // double acute
050
051    /**
052     * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
053     */
054    private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
055            "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
056            "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
057            "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF\u00C5\u00E5\u00C7\u00E7\u0150\u0151\u0170\u0171";
058
059    /**
060     * Double consonants.
061     */
062    private static final String[] DOUBLE_CONSONANT =
063            { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
064                   "TT", "VV", "WW", "XX", "YY", "ZZ" };
065
066    /**
067     * Constructs a new instance.
068     */
069    public MatchRatingApproachEncoder() {
070        // empty
071    }
072
073    /**
074     * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
075     * spaces.
076     *
077     * <h2>API Usage</h2>
078     * <p>
079     * Consider this method private, it is package protected for unit testing only.
080     * </p>
081     *
082     * @param name
083     *            The name to be cleaned.
084     * @return The cleaned name.
085     */
086    String cleanName(final String name) {
087        String upperName = name.toUpperCase(Locale.ENGLISH);
088
089        final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
090        for (final String str : charsToTrim) {
091            upperName = upperName.replaceAll(str, EMPTY);
092        }
093
094        upperName = removeAccents(upperName);
095        return upperName.replaceAll("\\s+", EMPTY);
096    }
097
098    /**
099     * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
100     * Encoder interface Throws an EncoderException if input object is not of type {@link String}.
101     *
102     * @param object
103     *            Object to encode.
104     * @return An object (or type {@link String}) containing the Match Rating Approach code which corresponds to the
105     *         String supplied.
106     * @throws EncoderException
107     *             if the parameter supplied is not of type {@link String}.
108     */
109    @Override
110    public final Object encode(final Object object) throws EncoderException {
111        if (!(object instanceof String)) {
112            throw new EncoderException(
113                    "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
114        }
115        return encode((String) object);
116    }
117
118    /**
119     * Encodes a String using the Match Rating Approach (MRA) algorithm.
120     *
121     * @param name
122     *            String object to encode.
123     * @return The MRA code corresponding to the String supplied.
124     */
125    @Override
126    public final String encode(String name) {
127        // Bulletproof for trivial input - NINO
128        if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
129            return EMPTY;
130        }
131
132        // Preprocessing
133        name = cleanName(name);
134
135        // Bulletproof if name becomes empty after cleanName(name)
136        if (SPACE.equals(name) || name.isEmpty()) {
137            return EMPTY;
138        }
139
140        // BEGIN: Actual encoding part of the algorithm...
141        // 1. Delete all vowels unless the vowel begins the word
142        name = removeVowels(name);
143
144        // Bulletproof if name becomes empty after removeVowels(name)
145        if (SPACE.equals(name) || name.isEmpty()) {
146            return EMPTY;
147        }
148
149        // 2. Remove second consonant from any double consonant
150        name = removeDoubleConsonants(name);
151
152        return getFirst3Last3(name);
153    }
154
155    /**
156     * Gets the first and last 3 letters of a name (if &gt; 6 characters) Else just returns the name.
157     *
158     * <h2>API Usage</h2>
159     * <p>
160     * Consider this method private, it is package protected for unit testing only.
161     * </p>
162     *
163     * @param name
164     *            The string to get the substrings from.
165     * @return Annexed first and last 3 letters of input word.
166     */
167    String getFirst3Last3(final String name) {
168        final int nameLength = name.length();
169
170        if (nameLength > 6) {
171            final String firstThree = name.substring(0, 3);
172            final String lastThree = name.substring(nameLength - 3, nameLength);
173            return firstThree + lastThree;
174        }
175        return name;
176    }
177
178    /**
179     * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
180     * min rating. Values strictly from documentation.
181     *
182     * <h2>API Usage</h2>
183     * <p>
184     * Consider this method private, it is package protected for unit testing only.
185     * </p>
186     *
187     * @param sumLength
188     *            The length of 2 strings sent down.
189     * @return The min rating value.
190     */
191    int getMinRating(final int sumLength) {
192        int minRating = 0;
193
194        if (sumLength <= 4) {
195            minRating = 5;
196        } else if (sumLength <= 7) { // already know it is at least 5
197            minRating = 4;
198        } else if (sumLength <= 11) { // already know it is at least 8
199            minRating = 3;
200        } else if (sumLength == 12) {
201            minRating = 2;
202        } else {
203            minRating = 1; // docs said little here.
204        }
205
206        return minRating;
207    }
208
209    /**
210     * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
211     * strings are cleaned in the same way as {@link #encode(String)}.
212     *
213     * @param name1
214     *            First of the 2 strings (names) to compare.
215     * @param name2
216     *            Second of the 2 names to compare.
217     * @return {@code true} if the encodings are identical {@code false} otherwise.
218     */
219    public boolean isEncodeEquals(String name1, String name2) {
220        // Bulletproof for trivial input - NINO
221        if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
222            return false;
223        }
224        if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
225            return false;
226        }
227        if (name1.length() == 1 || name2.length() == 1) {
228            return false;
229        }
230        if (name1.equalsIgnoreCase(name2)) {
231            return true;
232        }
233
234        // Preprocessing
235        name1 = cleanName(name1);
236        name2 = cleanName(name2);
237
238        // Actual MRA Algorithm
239
240        // 1. Remove vowels
241        name1 = removeVowels(name1);
242        name2 = removeVowels(name2);
243
244        // 2. Remove double consonants
245        name1 = removeDoubleConsonants(name1);
246        name2 = removeDoubleConsonants(name2);
247
248        // 3. Reduce down to 3 letters
249        name1 = getFirst3Last3(name1);
250        name2 = getFirst3Last3(name2);
251
252        // 4. Check for length difference - if 3 or greater, then no similarity
253        // comparison is done
254        if (Math.abs(name1.length() - name2.length()) >= 3) {
255            return false;
256        }
257
258        // 5. Obtain the minimum rating value by calculating the length sum of the
259        // encoded Strings and sending it down.
260        final int sumLength = Math.abs(name1.length() + name2.length());
261        final int minRating = getMinRating(sumLength);
262
263        // 6. Process the encoded Strings from left to right and remove any
264        // identical characters found from both Strings respectively.
265        final int count = leftToRightThenRightToLeftProcessing(name1, name2);
266
267        // 7. Each PNI item that has a similarity rating equal to or greater than
268        // the min is considered to be a good candidate match
269        return count >= minRating;
270
271    }
272
273    /**
274     * Determines if a letter is a vowel.
275     *
276     * <h2>API Usage</h2>
277     * <p>
278     * Consider this method private, it is package protected for unit testing only.
279     * </p>
280     *
281     * @param letter
282     *            The letter under investigation
283     * @return True if a vowel, else false
284     */
285    boolean isVowel(final String letter) {
286        return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
287               letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
288    }
289
290    /**
291     * Processes the names from left to right (first) then right to left removing identical letters in same positions.
292     * Then subtracts the longer string that remains from 6 and returns this.
293     *
294     * <h2>API Usage</h2>
295     * <p>
296     * Consider this method private, it is package protected for unit testing only.
297     * </p>
298     *
299     * @param name1 first name.
300     * @param name1 second name.
301     * @return the length as above.
302     */
303    int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
304        final char[] name1Char = name1.toCharArray();
305        final char[] name2Char = name2.toCharArray();
306
307        final int name1Size = name1.length() - 1;
308        final int name2Size = name2.length() - 1;
309
310        String name1LtRStart = EMPTY;
311        String name1LtREnd = EMPTY;
312
313        String name2RtLStart = EMPTY;
314        String name2RtLEnd = EMPTY;
315
316        for (int i = 0; i < name1Char.length; i++) {
317            if (i > name2Size) {
318                break;
319            }
320
321            name1LtRStart = name1.substring(i, i + 1);
322            name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
323
324            name2RtLStart = name2.substring(i, i + 1);
325            name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
326
327            // Left to right...
328            if (name1LtRStart.equals(name2RtLStart)) {
329                name1Char[i] = ' ';
330                name2Char[i] = ' ';
331            }
332
333            // Right to left...
334            if (name1LtREnd.equals(name2RtLEnd)) {
335                name1Char[name1Size - i] = ' ';
336                name2Char[name2Size - i] = ' ';
337            }
338        }
339
340        // Char arrays -> string & remove extraneous space
341        final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
342        final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
343
344        // Final bit - subtract the longest string from 6 and return this int value
345        if (strA.length() > strB.length()) {
346            return Math.abs(6 - strA.length());
347        }
348        return Math.abs(6 - strB.length());
349    }
350
351    /**
352     * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved.
353     * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
354     *
355     * @param accentedWord
356     *            The word that may have accents in it.
357     * @return De-accented word.
358     */
359    String removeAccents(final String accentedWord) {
360        if (accentedWord == null) {
361            return null;
362        }
363
364        final StringBuilder sb = new StringBuilder();
365        final int n = accentedWord.length();
366
367        for (int i = 0; i < n; i++) {
368            final char c = accentedWord.charAt(i);
369            final int pos = UNICODE.indexOf(c);
370            if (pos > -1) {
371                sb.append(PLAIN_ASCII.charAt(pos));
372            } else {
373                sb.append(c);
374            }
375        }
376
377        return sb.toString();
378    }
379
380    /**
381     * Replaces any double consonant pair with the single letter equivalent.
382     *
383     * <h2>API Usage</h2>
384     * <p>
385     * Consider this method private, it is package protected for unit testing only.
386     * </p>
387     *
388     * @param name
389     *            String to have double consonants removed.
390     * @return Single consonant word.
391     */
392    String removeDoubleConsonants(final String name) {
393        String replacedName = name.toUpperCase(Locale.ENGLISH);
394        for (final String dc : DOUBLE_CONSONANT) {
395            if (replacedName.contains(dc)) {
396                final String singleLetter = dc.substring(0, 1);
397                replacedName = replacedName.replace(dc, singleLetter);
398            }
399        }
400        return replacedName;
401    }
402
403    /**
404     * Deletes all vowels unless the vowel begins the word.
405     *
406     * <h2>API Usage</h2>
407     * <p>
408     * Consider this method private, it is package protected for unit testing only.
409     * </p>
410     *
411     * @param name
412     *            The name to have vowels removed.
413     * @return De-voweled word.
414     */
415    String removeVowels(String name) {
416        // Extract first letter
417        final String firstLetter = name.substring(0, 1);
418
419        name = name.replace("A", EMPTY);
420        name = name.replace("E", EMPTY);
421        name = name.replace("I", EMPTY);
422        name = name.replace("O", EMPTY);
423        name = name.replace("U", EMPTY);
424
425        name = name.replaceAll("\\s{2,}\\b", SPACE);
426
427        // return isVowel(firstLetter) ? (firstLetter + name) : name;
428        if (isVowel(firstLetter)) {
429            return firstLetter + name;
430        }
431        return name;
432    }
433}