Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.Locale;
020
021import org.apache.commons.codec.EncoderException;
022import org.apache.commons.codec.StringEncoder;
023
024/**
025 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
026 *
027 * This class is immutable and thread-safe.
028 *
029 * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
030 * @since 1.8
031 */
032public class MatchRatingApproachEncoder implements StringEncoder {
033
034    private static final String SPACE = " ";
035
036    private static final String EMPTY = "";
037
038    /**
039     * Constants used mainly for the min rating value.
040     */
041    private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
042                             ELEVEN = 11, TWELVE = 12;
043
044    /**
045     * The plain letter equivalent of the accented letters.
046     */
047    private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
048            "AaEeIiOoUuYy" + // acute
049            "AaEeIiOoUuYy" + // circumflex
050            "AaOoNn" + // tilde
051            "AaEeIiOoUuYy" + // umlaut
052            "Aa" + // ring
053            "Cc" + // cedilla
054            "OoUu"; // double acute
055
056    /**
057     * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
058     */
059    private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
060            "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
061            "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
062            "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
063            "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
064            "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
065
066    private static final String[] DOUBLE_CONSONANT =
067            new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
068                           "TT", "VV", "WW", "XX", "YY", "ZZ" };
069
070    /**
071     * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
072     * spaces.
073     *
074     * <h2>API Usage</h2>
075     * <p>
076     * Consider this method private, it is package protected for unit testing only.
077     * </p>
078     *
079     * @param name
080     *            The name to be cleaned
081     * @return The cleaned name
082     */
083    String cleanName(final String name) {
084        String upperName = name.toUpperCase(Locale.ENGLISH);
085
086        final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
087        for (final String str : charsToTrim) {
088            upperName = upperName.replaceAll(str, EMPTY);
089        }
090
091        upperName = removeAccents(upperName);
092        upperName = upperName.replaceAll("\\s+", EMPTY);
093
094        return upperName;
095    }
096
097    /**
098     * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
099     * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
100     *
101     * @param pObject
102     *            Object to encode
103     * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
104     *         String supplied.
105     * @throws EncoderException
106     *             if the parameter supplied is not of type java.lang.String
107     */
108    @Override
109    public final Object encode(final Object pObject) throws EncoderException {
110        if (!(pObject instanceof String)) {
111            throw new EncoderException(
112                    "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
113        }
114        return encode((String) pObject);
115    }
116
117    /**
118     * Encodes a String using the Match Rating Approach (MRA) algorithm.
119     *
120     * @param name
121     *            String object to encode
122     * @return The MRA code corresponding to the String supplied
123     */
124    @Override
125    public final String encode(String name) {
126        // Bulletproof for trivial input - NINO
127        if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
128            return EMPTY;
129        }
130
131        // Preprocessing
132        name = cleanName(name);
133
134        // BEGIN: Actual encoding part of the algorithm...
135        // 1. Delete all vowels unless the vowel begins the word
136        name = removeVowels(name);
137
138        // 2. Remove second consonant from any double consonant
139        name = removeDoubleConsonants(name);
140
141        // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
142        name = getFirst3Last3(name);
143
144        return name;
145    }
146
147    /**
148     * Gets the first and last 3 letters of a name (if &gt; 6 characters) Else just returns the name.
149     *
150     * <h2>API Usage</h2>
151     * <p>
152     * Consider this method private, it is package protected for unit testing only.
153     * </p>
154     *
155     * @param name
156     *            The string to get the substrings from
157     * @return Annexed first and last 3 letters of input word.
158     */
159    String getFirst3Last3(final String name) {
160        final int nameLength = name.length();
161
162        if (nameLength > SIX) {
163            final String firstThree = name.substring(0, THREE);
164            final String lastThree = name.substring(nameLength - THREE, nameLength);
165            return firstThree + lastThree;
166        } else {
167            return name;
168        }
169    }
170
171    /**
172     * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
173     * min rating. Values strictly from documentation.
174     *
175     * <h2>API Usage</h2>
176     * <p>
177     * Consider this method private, it is package protected for unit testing only.
178     * </p>
179     *
180     * @param sumLength
181     *            The length of 2 strings sent down
182     * @return The min rating value
183     */
184    int getMinRating(final int sumLength) {
185        int minRating = 0;
186
187        if (sumLength <= FOUR) {
188            minRating = FIVE;
189        } else if (sumLength >= FIVE && sumLength <= SEVEN) {
190            minRating = FOUR;
191        } else if (sumLength >= EIGHT && sumLength <= ELEVEN) {
192            minRating = THREE;
193        } else if (sumLength == TWELVE) {
194            minRating = TWO;
195        } else {
196            minRating = ONE; // docs said little here.
197        }
198
199        return minRating;
200    }
201
202    /**
203     * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
204     * strings are cleaned in the same way as {@link #encode(String)}.
205     *
206     * @param name1
207     *            First of the 2 strings (names) to compare
208     * @param name2
209     *            Second of the 2 names to compare
210     * @return <code>true</code> if the encodings are identical <code>false</code> otherwise.
211     */
212    public boolean isEncodeEquals(String name1, String name2) {
213        // Bulletproof for trivial input - NINO
214        if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
215            return false;
216        } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
217            return false;
218        } else if (name1.length() == 1 || name2.length() == 1) {
219            return false;
220        } else if (name1.equalsIgnoreCase(name2)) {
221            return true;
222        }
223
224        // Preprocessing
225        name1 = cleanName(name1);
226        name2 = cleanName(name2);
227
228        // Actual MRA Algorithm
229
230        // 1. Remove vowels
231        name1 = removeVowels(name1);
232        name2 = removeVowels(name2);
233
234        // 2. Remove double consonants
235        name1 = removeDoubleConsonants(name1);
236        name2 = removeDoubleConsonants(name2);
237
238        // 3. Reduce down to 3 letters
239        name1 = getFirst3Last3(name1);
240        name2 = getFirst3Last3(name2);
241
242        // 4. Check for length difference - if 3 or greater then no similarity
243        // comparison is done
244        if (Math.abs(name1.length() - name2.length()) >= THREE) {
245            return false;
246        }
247
248        // 5. Obtain the minimum rating value by calculating the length sum of the
249        // encoded Strings and sending it down.
250        final int sumLength = Math.abs(name1.length() + name2.length());
251        int minRating = 0;
252        minRating = getMinRating(sumLength);
253
254        // 6. Process the encoded Strings from left to right and remove any
255        // identical characters found from both Strings respectively.
256        final int count = leftToRightThenRightToLeftProcessing(name1, name2);
257
258        // 7. Each PNI item that has a similarity rating equal to or greater than
259        // the min is considered to be a good candidate match
260        return count >= minRating;
261
262    }
263
264    /**
265     * Determines if a letter is a vowel.
266     *
267     * <h2>API Usage</h2>
268     * <p>
269     * Consider this method private, it is package protected for unit testing only.
270     * </p>
271     *
272     * @param letter
273     *            The letter under investiagtion
274     * @return True if a vowel, else false
275     */
276    boolean isVowel(final String letter) {
277        return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
278               letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
279    }
280
281    /**
282     * Processes the names from left to right (first) then right to left removing identical letters in same positions.
283     * Then subtracts the longer string that remains from 6 and returns this.
284     *
285     * <h2>API Usage</h2>
286     * <p>
287     * Consider this method private, it is package protected for unit testing only.
288     * </p>
289     *
290     * @param name1
291     *            name2
292     * @return
293     */
294    int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
295        final char[] name1Char = name1.toCharArray();
296        final char[] name2Char = name2.toCharArray();
297
298        final int name1Size = name1.length() - 1;
299        final int name2Size = name2.length() - 1;
300
301        String name1LtRStart = EMPTY;
302        String name1LtREnd = EMPTY;
303
304        String name2RtLStart = EMPTY;
305        String name2RtLEnd = EMPTY;
306
307        for (int i = 0; i < name1Char.length; i++) {
308            if (i > name2Size) {
309                break;
310            }
311
312            name1LtRStart = name1.substring(i, i + 1);
313            name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
314
315            name2RtLStart = name2.substring(i, i + 1);
316            name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
317
318            // Left to right...
319            if (name1LtRStart.equals(name2RtLStart)) {
320                name1Char[i] = ' ';
321                name2Char[i] = ' ';
322            }
323
324            // Right to left...
325            if (name1LtREnd.equals(name2RtLEnd)) {
326                name1Char[name1Size - i] = ' ';
327                name2Char[name2Size - i] = ' ';
328            }
329        }
330
331        // Char arrays -> string & remove extraneous space
332        final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
333        final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
334
335        // Final bit - subtract longest string from 6 and return this int value
336        if (strA.length() > strB.length()) {
337            return Math.abs(SIX - strA.length());
338        } else {
339            return Math.abs(SIX - strB.length());
340        }
341    }
342
343    /**
344     * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
345     * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
346     *
347     * @param accentedWord
348     *            The word that may have accents in it.
349     * @return De-accented word
350     */
351    String removeAccents(final String accentedWord) {
352        if (accentedWord == null) {
353            return null;
354        }
355
356        final StringBuilder sb = new StringBuilder();
357        final int n = accentedWord.length();
358
359        for (int i = 0; i < n; i++) {
360            final char c = accentedWord.charAt(i);
361            final int pos = UNICODE.indexOf(c);
362            if (pos > -1) {
363                sb.append(PLAIN_ASCII.charAt(pos));
364            } else {
365                sb.append(c);
366            }
367        }
368
369        return sb.toString();
370    }
371
372    /**
373     * Replaces any double consonant pair with the single letter equivalent.
374     *
375     * <h2>API Usage</h2>
376     * <p>
377     * Consider this method private, it is package protected for unit testing only.
378     * </p>
379     *
380     * @param name
381     *            String to have double consonants removed
382     * @return Single consonant word
383     */
384    String removeDoubleConsonants(final String name) {
385        String replacedName = name.toUpperCase();
386        for (final String dc : DOUBLE_CONSONANT) {
387            if (replacedName.contains(dc)) {
388                final String singleLetter = dc.substring(0, 1);
389                replacedName = replacedName.replace(dc, singleLetter);
390            }
391        }
392        return replacedName;
393    }
394
395    /**
396     * Deletes all vowels unless the vowel begins the word.
397     *
398     * <h2>API Usage</h2>
399     * <p>
400     * Consider this method private, it is package protected for unit testing only.
401     * </p>
402     *
403     * @param name
404     *            The name to have vowels removed
405     * @return De-voweled word
406     */
407    String removeVowels(String name) {
408        // Extract first letter
409        final String firstLetter = name.substring(0, 1);
410
411        name = name.replaceAll("A", EMPTY);
412        name = name.replaceAll("E", EMPTY);
413        name = name.replaceAll("I", EMPTY);
414        name = name.replaceAll("O", EMPTY);
415        name = name.replaceAll("U", EMPTY);
416
417        name = name.replaceAll("\\s{2,}\\b", SPACE);
418
419        // return isVowel(firstLetter) ? (firstLetter + name) : name;
420        if (isVowel(firstLetter)) {
421            return firstLetter + name;
422        } else {
423            return name;
424        }
425    }
426}