001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.codec.language;
018    
019    import java.util.Locale;
020    
021    import org.apache.commons.codec.EncoderException;
022    import org.apache.commons.codec.StringEncoder;
023    
024    /**
025     * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
026     *
027     * This class is immutable and thread-safe.
028     *
029     * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
030     * @since 1.8
031     */
032    public class MatchRatingApproachEncoder implements StringEncoder {
033    
034        private static final String SPACE = " ";
035    
036        private static final String EMPTY = "";
037    
038        /**
039         * Constants used mainly for the min rating value.
040         */
041        private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
042                                 ELEVEN = 11, TWELVE = 12;
043    
044        /**
045         * The plain letter equivalent of the accented letters.
046         */
047        private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
048                "AaEeIiOoUuYy" + // acute
049                "AaEeIiOoUuYy" + // circumflex
050                "AaOoNn" + // tilde
051                "AaEeIiOoUuYy" + // umlaut
052                "Aa" + // ring
053                "Cc" + // cedilla
054                "OoUu"; // double acute
055    
056        /**
057         * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
058         */
059        private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
060                "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
061                "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
062                "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
063                "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
064                "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
065    
066        private static final String[] DOUBLE_CONSONANT =
067                new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
068                               "TT", "VV", "WW", "XX", "YY", "ZZ" };
069    
070        /**
071         * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
072         * spaces.
073         *
074         * <h2>API Usage</h2>
075         * <p>
076         * Consider this method private, it is package protected for unit testing only.
077         * </p>
078         *
079         * @param name
080         *            The name to be cleaned
081         * @return The cleaned name
082         */
083        String cleanName(final String name) {
084            String upperName = name.toUpperCase(Locale.ENGLISH);
085    
086            final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
087            for (final String str : charsToTrim) {
088                upperName = upperName.replaceAll(str, EMPTY);
089            }
090    
091            upperName = removeAccents(upperName);
092            upperName = upperName.replaceAll("\\s+", EMPTY);
093    
094            return upperName;
095        }
096    
097        /**
098         * Encodes an Object using the Match Rating Approach algo. Method is here to satisfy the requirements of the
099         * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
100         *
101         * @param pObject
102         *            Object to encode
103         * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
104         *         String supplied.
105         * @throws EncoderException
106         *             if the parameter supplied is not of type java.lang.String
107         */
108        @Override
109        public final Object encode(final Object pObject) throws EncoderException {
110            if (!(pObject instanceof String)) {
111                throw new EncoderException(
112                        "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
113            }
114            return encode((String) pObject);
115        }
116    
117        /**
118         * Encodes a String using the Match Rating Approach (MRA) algorithm.
119         *
120         * @param name
121         *            String object to encode
122         * @return The MRA code corresponding to the String supplied
123         */
124        @Override
125        public final String encode(String name) {
126            // Bulletproof for trivial input - NINO
127            if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
128                return EMPTY;
129            }
130    
131            // Preprocessing
132            name = cleanName(name);
133    
134            // BEGIN: Actual encoding part of the algorithm...
135            // 1. Delete all vowels unless the vowel begins the word
136            name = removeVowels(name);
137    
138            // 2. Remove second consonant from any double consonant
139            name = removeDoubleConsonants(name);
140    
141            // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
142            name = getFirst3Last3(name);
143    
144            return name;
145        }
146    
147        /**
148         * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns the name.
149         *
150         * <h2>API Usage</h2>
151         * <p>
152         * Consider this method private, it is package protected for unit testing only.
153         * </p>
154         *
155         * @param name
156         *            The string to get the substrings from
157         * @return Annexed first & last 3 letters of input word.
158         */
159        String getFirst3Last3(final String name) {
160            final int nameLength = name.length();
161    
162            if (nameLength > SIX) {
163                final String firstThree = name.substring(0, THREE);
164                final String lastThree = name.substring(nameLength - THREE, nameLength);
165                return firstThree + lastThree;
166            } else {
167                return name;
168            }
169        }
170    
171        /**
172         * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
173         * min rating. Values strictly from documentation.
174         *
175         * <h2>API Usage</h2>
176         * <p>
177         * Consider this method private, it is package protected for unit testing only.
178         * </p>
179         *
180         * @param sumLength
181         *            The length of 2 strings sent down
182         * @return The min rating value
183         */
184        int getMinRating(final int sumLength) {
185            int minRating = 0;
186    
187            if (sumLength <= FOUR) {
188                minRating = FIVE;
189            } else if (sumLength >= FIVE && sumLength <= SEVEN) {
190                minRating = FOUR;
191            } else if (sumLength >= EIGHT && sumLength <= ELEVEN) {
192                minRating = THREE;
193            } else if (sumLength == TWELVE) {
194                minRating = TWO;
195            } else {
196                minRating = ONE; // docs said little here.
197            }
198    
199            return minRating;
200        }
201    
202        /**
203         * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
204         * strings are cleaned in the same way as {@link #encode(String)}.
205         *
206         * @param name1
207         *            First of the 2 strings (names) to compare
208         * @param name2
209         *            Second of the 2 names to compare
210         * @return <code>true</code> if the encodings are identical <code>false</code> otherwise.
211         */
212        public boolean isEncodeEquals(String name1, String name2) {
213            // Bulletproof for trivial input - NINO
214            if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
215                return false;
216            } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
217                return false;
218            } else if (name1.length() == 1 || name2.length() == 1) {
219                return false;
220            } else if (name1.equalsIgnoreCase(name2)) {
221                return true;
222            }
223    
224            // Preprocessing
225            name1 = cleanName(name1);
226            name2 = cleanName(name2);
227    
228            // Actual MRA Algorithm
229    
230            // 1. Remove vowels
231            name1 = removeVowels(name1);
232            name2 = removeVowels(name2);
233    
234            // 2. Remove double consonants
235            name1 = removeDoubleConsonants(name1);
236            name2 = removeDoubleConsonants(name2);
237    
238            // 3. Reduce down to 3 letters
239            name1 = getFirst3Last3(name1);
240            name2 = getFirst3Last3(name2);
241    
242            // 4. Check for length difference - if 3 or greater then no similarity
243            // comparison is done
244            if (Math.abs(name1.length() - name2.length()) >= THREE) {
245                return false;
246            }
247    
248            // 5. Obtain the minimum rating value by calculating the length sum of the
249            // encoded Strings and sending it down.
250            final int sumLength = Math.abs(name1.length() + name2.length());
251            int minRating = 0;
252            minRating = getMinRating(sumLength);
253    
254            // 6. Process the encoded Strings from left to right and remove any
255            // identical characters found from both Strings respectively.
256            final int count = leftToRightThenRightToLeftProcessing(name1, name2);
257    
258            // 7. Each PNI item that has a similarity rating equal to or greater than
259            // the min is considered to be a good candidate match
260            return count >= minRating;
261    
262        }
263    
264        /**
265         * Determines if a letter is a vowel.
266         *
267         * <h2>API Usage</h2>
268         * <p>
269         * Consider this method private, it is package protected for unit testing only.
270         * </p>
271         *
272         * @param letter
273         *            The letter under investiagtion
274         * @return True if a vowel, else false
275         */
276        boolean isVowel(final String letter) {
277            return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
278                   letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
279        }
280    
281        /**
282         * Processes the names from left to right (first) then right to left removing identical letters in same positions.
283         * Then subtracts the longer string that remains from 6 and returns this.
284         *
285         * <h2>API Usage</h2>
286         * <p>
287         * Consider this method private, it is package protected for unit testing only.
288         * </p>
289         *
290         * @param name1
291         *            name2
292         * @return
293         */
294        int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
295            final char[] name1Char = name1.toCharArray();
296            final char[] name2Char = name2.toCharArray();
297    
298            final int name1Size = name1.length() - 1;
299            final int name2Size = name2.length() - 1;
300    
301            String name1LtRStart = EMPTY;
302            String name1LtREnd = EMPTY;
303    
304            String name2RtLStart = EMPTY;
305            String name2RtLEnd = EMPTY;
306    
307            for (int i = 0; i < name1Char.length; i++) {
308                if (i > name2Size) {
309                    break;
310                }
311    
312                name1LtRStart = name1.substring(i, i + 1);
313                name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
314    
315                name2RtLStart = name2.substring(i, i + 1);
316                name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
317    
318                // Left to right...
319                if (name1LtRStart.equals(name2RtLStart)) {
320                    name1Char[i] = ' ';
321                    name2Char[i] = ' ';
322                }
323    
324                // Right to left...
325                if (name1LtREnd.equals(name2RtLEnd)) {
326                    name1Char[name1Size - i] = ' ';
327                    name2Char[name2Size - i] = ' ';
328                }
329            }
330    
331            // Char arrays -> string & remove extraneous space
332            final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
333            final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
334    
335            // Final bit - subtract longest string from 6 and return this int value
336            if (strA.length() > strB.length()) {
337                return Math.abs(SIX - strA.length());
338            } else {
339                return Math.abs(SIX - strB.length());
340            }
341        }
342    
343        /**
344         * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
345         * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
346         *
347         * @param accentedWord
348         *            The word that may have accents in it.
349         * @return De-accented word
350         */
351        String removeAccents(final String accentedWord) {
352            if (accentedWord == null) {
353                return null;
354            }
355    
356            final StringBuilder sb = new StringBuilder();
357            final int n = accentedWord.length();
358    
359            for (int i = 0; i < n; i++) {
360                final char c = accentedWord.charAt(i);
361                final int pos = UNICODE.indexOf(c);
362                if (pos > -1) {
363                    sb.append(PLAIN_ASCII.charAt(pos));
364                } else {
365                    sb.append(c);
366                }
367            }
368    
369            return sb.toString();
370        }
371    
372        /**
373         * Replaces any double consonant pair with the single letter equivalent.
374         *
375         * <h2>API Usage</h2>
376         * <p>
377         * Consider this method private, it is package protected for unit testing only.
378         * </p>
379         *
380         * @param name
381         *            String to have double consonants removed
382         * @return Single consonant word
383         */
384        String removeDoubleConsonants(final String name) {
385            String replacedName = name.toUpperCase();
386            for (final String dc : DOUBLE_CONSONANT) {
387                if (replacedName.contains(dc)) {
388                    final String singleLetter = dc.substring(0, 1);
389                    replacedName = replacedName.replace(dc, singleLetter);
390                }
391            }
392            return replacedName;
393        }
394    
395        /**
396         * Deletes all vowels unless the vowel begins the word.
397         *
398         * <h2>API Usage</h2>
399         * <p>
400         * Consider this method private, it is package protected for unit testing only.
401         * </p>
402         *
403         * @param name
404         *            The name to have vowels removed
405         * @return De-voweled word
406         */
407        String removeVowels(String name) {
408            // Extract first letter
409            final String firstLetter = name.substring(0, 1);
410    
411            name = name.replaceAll("A", EMPTY);
412            name = name.replaceAll("E", EMPTY);
413            name = name.replaceAll("I", EMPTY);
414            name = name.replaceAll("O", EMPTY);
415            name = name.replaceAll("U", EMPTY);
416    
417            name = name.replaceAll("\\s{2,}\\b", SPACE);
418    
419            // return isVowel(firstLetter) ? (firstLetter + name) : name;
420            if (isVowel(firstLetter)) {
421                return firstLetter + name;
422            } else {
423                return name;
424            }
425        }
426    }