MatchRatingApproachEncoder.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.codec.language;

  18. import java.util.Locale;

  19. import org.apache.commons.codec.EncoderException;
  20. import org.apache.commons.codec.StringEncoder;

  21. /**
  22.  * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
  23.  *
  24.  * This class is immutable and thread-safe.
  25.  *
  26.  * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
  27.  * @since 1.8
  28.  */
  29. public class MatchRatingApproachEncoder implements StringEncoder {

  30.     private static final String SPACE = " ";

  31.     private static final String EMPTY = "";

  32.     /**
  33.      * Constants used mainly for the min rating value.
  34.      */
  35.     private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7,
  36.                              ELEVEN = 11, TWELVE = 12;

  37.     /**
  38.      * The plain letter equivalent of the accented letters.
  39.      */
  40.     private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
  41.             "AaEeIiOoUuYy" + // acute
  42.             "AaEeIiOoUuYy" + // circumflex
  43.             "AaOoNn" + // tilde
  44.             "AaEeIiOoUuYy" + // umlaut
  45.             "Aa" + // ring
  46.             "Cc" + // cedilla
  47.             "OoUu"; // double acute

  48.     /**
  49.      * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
  50.      */
  51.     private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
  52.             "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
  53.             "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
  54.             "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
  55.             "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
  56.             "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";

  57.     private static final String[] DOUBLE_CONSONANT =
  58.             new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
  59.                            "TT", "VV", "WW", "XX", "YY", "ZZ" };

  60.     /**
  61.      * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
  62.      * spaces.
  63.      *
  64.      * <h2>API Usage</h2>
  65.      * <p>
  66.      * Consider this method private, it is package protected for unit testing only.
  67.      * </p>
  68.      *
  69.      * @param name
  70.      *            The name to be cleaned
  71.      * @return The cleaned name
  72.      */
  73.     String cleanName(final String name) {
  74.         String upperName = name.toUpperCase(Locale.ENGLISH);

  75.         final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
  76.         for (final String str : charsToTrim) {
  77.             upperName = upperName.replaceAll(str, EMPTY);
  78.         }

  79.         upperName = removeAccents(upperName);
  80.         upperName = upperName.replaceAll("\\s+", EMPTY);

  81.         return upperName;
  82.     }

  83.     /**
  84.      * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
  85.      * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
  86.      *
  87.      * @param pObject
  88.      *            Object to encode
  89.      * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
  90.      *         String supplied.
  91.      * @throws EncoderException
  92.      *             if the parameter supplied is not of type java.lang.String
  93.      */
  94.     @Override
  95.     public final Object encode(final Object pObject) throws EncoderException {
  96.         if (!(pObject instanceof String)) {
  97.             throw new EncoderException(
  98.                     "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
  99.         }
  100.         return encode((String) pObject);
  101.     }

  102.     /**
  103.      * Encodes a String using the Match Rating Approach (MRA) algorithm.
  104.      *
  105.      * @param name
  106.      *            String object to encode
  107.      * @return The MRA code corresponding to the String supplied
  108.      */
  109.     @Override
  110.     public final String encode(String name) {
  111.         // Bulletproof for trivial input - NINO
  112.         if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
  113.             return EMPTY;
  114.         }

  115.         // Preprocessing
  116.         name = cleanName(name);

  117.         // BEGIN: Actual encoding part of the algorithm...
  118.         // 1. Delete all vowels unless the vowel begins the word
  119.         name = removeVowels(name);

  120.         // 2. Remove second consonant from any double consonant
  121.         name = removeDoubleConsonants(name);

  122.         // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
  123.         name = getFirst3Last3(name);

  124.         return name;
  125.     }

  126.     /**
  127.      * Gets the first and last 3 letters of a name (if &gt; 6 characters) Else just returns the name.
  128.      *
  129.      * <h2>API Usage</h2>
  130.      * <p>
  131.      * Consider this method private, it is package protected for unit testing only.
  132.      * </p>
  133.      *
  134.      * @param name
  135.      *            The string to get the substrings from
  136.      * @return Annexed first and last 3 letters of input word.
  137.      */
  138.     String getFirst3Last3(final String name) {
  139.         final int nameLength = name.length();

  140.         if (nameLength > SIX) {
  141.             final String firstThree = name.substring(0, THREE);
  142.             final String lastThree = name.substring(nameLength - THREE, nameLength);
  143.             return firstThree + lastThree;
  144.         }
  145.         return name;
  146.     }

  147.     /**
  148.      * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
  149.      * min rating. Values strictly from documentation.
  150.      *
  151.      * <h2>API Usage</h2>
  152.      * <p>
  153.      * Consider this method private, it is package protected for unit testing only.
  154.      * </p>
  155.      *
  156.      * @param sumLength
  157.      *            The length of 2 strings sent down
  158.      * @return The min rating value
  159.      */
  160.     int getMinRating(final int sumLength) {
  161.         int minRating = 0;

  162.         if (sumLength <= FOUR) {
  163.             minRating = FIVE;
  164.         } else if (sumLength <= SEVEN) { // aready know it is at least 5
  165.             minRating = FOUR;
  166.         } else if (sumLength <= ELEVEN) { // aready know it is at least 8
  167.             minRating = THREE;
  168.         } else if (sumLength == TWELVE) {
  169.             minRating = TWO;
  170.         } else {
  171.             minRating = ONE; // docs said little here.
  172.         }

  173.         return minRating;
  174.     }

  175.     /**
  176.      * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
  177.      * strings are cleaned in the same way as {@link #encode(String)}.
  178.      *
  179.      * @param name1
  180.      *            First of the 2 strings (names) to compare
  181.      * @param name2
  182.      *            Second of the 2 names to compare
  183.      * @return <code>true</code> if the encodings are identical <code>false</code> otherwise.
  184.      */
  185.     public boolean isEncodeEquals(String name1, String name2) {
  186.         // Bulletproof for trivial input - NINO
  187.         if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
  188.             return false;
  189.         } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
  190.             return false;
  191.         } else if (name1.length() == 1 || name2.length() == 1) {
  192.             return false;
  193.         } else if (name1.equalsIgnoreCase(name2)) {
  194.             return true;
  195.         }

  196.         // Preprocessing
  197.         name1 = cleanName(name1);
  198.         name2 = cleanName(name2);

  199.         // Actual MRA Algorithm

  200.         // 1. Remove vowels
  201.         name1 = removeVowels(name1);
  202.         name2 = removeVowels(name2);

  203.         // 2. Remove double consonants
  204.         name1 = removeDoubleConsonants(name1);
  205.         name2 = removeDoubleConsonants(name2);

  206.         // 3. Reduce down to 3 letters
  207.         name1 = getFirst3Last3(name1);
  208.         name2 = getFirst3Last3(name2);

  209.         // 4. Check for length difference - if 3 or greater then no similarity
  210.         // comparison is done
  211.         if (Math.abs(name1.length() - name2.length()) >= THREE) {
  212.             return false;
  213.         }

  214.         // 5. Obtain the minimum rating value by calculating the length sum of the
  215.         // encoded Strings and sending it down.
  216.         final int sumLength = Math.abs(name1.length() + name2.length());
  217.         int minRating = 0;
  218.         minRating = getMinRating(sumLength);

  219.         // 6. Process the encoded Strings from left to right and remove any
  220.         // identical characters found from both Strings respectively.
  221.         final int count = leftToRightThenRightToLeftProcessing(name1, name2);

  222.         // 7. Each PNI item that has a similarity rating equal to or greater than
  223.         // the min is considered to be a good candidate match
  224.         return count >= minRating;

  225.     }

  226.     /**
  227.      * Determines if a letter is a vowel.
  228.      *
  229.      * <h2>API Usage</h2>
  230.      * <p>
  231.      * Consider this method private, it is package protected for unit testing only.
  232.      * </p>
  233.      *
  234.      * @param letter
  235.      *            The letter under investiagtion
  236.      * @return True if a vowel, else false
  237.      */
  238.     boolean isVowel(final String letter) {
  239.         return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
  240.                letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
  241.     }

  242.     /**
  243.      * Processes the names from left to right (first) then right to left removing identical letters in same positions.
  244.      * Then subtracts the longer string that remains from 6 and returns this.
  245.      *
  246.      * <h2>API Usage</h2>
  247.      * <p>
  248.      * Consider this method private, it is package protected for unit testing only.
  249.      * </p>
  250.      *
  251.      * @param name1
  252.      *            name2
  253.      * @return the length as above
  254.      */
  255.     int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
  256.         final char[] name1Char = name1.toCharArray();
  257.         final char[] name2Char = name2.toCharArray();

  258.         final int name1Size = name1.length() - 1;
  259.         final int name2Size = name2.length() - 1;

  260.         String name1LtRStart = EMPTY;
  261.         String name1LtREnd = EMPTY;

  262.         String name2RtLStart = EMPTY;
  263.         String name2RtLEnd = EMPTY;

  264.         for (int i = 0; i < name1Char.length; i++) {
  265.             if (i > name2Size) {
  266.                 break;
  267.             }

  268.             name1LtRStart = name1.substring(i, i + 1);
  269.             name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);

  270.             name2RtLStart = name2.substring(i, i + 1);
  271.             name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);

  272.             // Left to right...
  273.             if (name1LtRStart.equals(name2RtLStart)) {
  274.                 name1Char[i] = ' ';
  275.                 name2Char[i] = ' ';
  276.             }

  277.             // Right to left...
  278.             if (name1LtREnd.equals(name2RtLEnd)) {
  279.                 name1Char[name1Size - i] = ' ';
  280.                 name2Char[name2Size - i] = ' ';
  281.             }
  282.         }

  283.         // Char arrays -> string & remove extraneous space
  284.         final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
  285.         final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);

  286.         // Final bit - subtract longest string from 6 and return this int value
  287.         if (strA.length() > strB.length()) {
  288.             return Math.abs(SIX - strA.length());
  289.         }
  290.         return Math.abs(SIX - strB.length());
  291.     }

  292.     /**
  293.      * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
  294.      * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
  295.      *
  296.      * @param accentedWord
  297.      *            The word that may have accents in it.
  298.      * @return De-accented word
  299.      */
  300.     String removeAccents(final String accentedWord) {
  301.         if (accentedWord == null) {
  302.             return null;
  303.         }

  304.         final StringBuilder sb = new StringBuilder();
  305.         final int n = accentedWord.length();

  306.         for (int i = 0; i < n; i++) {
  307.             final char c = accentedWord.charAt(i);
  308.             final int pos = UNICODE.indexOf(c);
  309.             if (pos > -1) {
  310.                 sb.append(PLAIN_ASCII.charAt(pos));
  311.             } else {
  312.                 sb.append(c);
  313.             }
  314.         }

  315.         return sb.toString();
  316.     }

  317.     /**
  318.      * Replaces any double consonant pair with the single letter equivalent.
  319.      *
  320.      * <h2>API Usage</h2>
  321.      * <p>
  322.      * Consider this method private, it is package protected for unit testing only.
  323.      * </p>
  324.      *
  325.      * @param name
  326.      *            String to have double consonants removed
  327.      * @return Single consonant word
  328.      */
  329.     String removeDoubleConsonants(final String name) {
  330.         String replacedName = name.toUpperCase(Locale.ENGLISH);
  331.         for (final String dc : DOUBLE_CONSONANT) {
  332.             if (replacedName.contains(dc)) {
  333.                 final String singleLetter = dc.substring(0, 1);
  334.                 replacedName = replacedName.replace(dc, singleLetter);
  335.             }
  336.         }
  337.         return replacedName;
  338.     }

  339.     /**
  340.      * Deletes all vowels unless the vowel begins the word.
  341.      *
  342.      * <h2>API Usage</h2>
  343.      * <p>
  344.      * Consider this method private, it is package protected for unit testing only.
  345.      * </p>
  346.      *
  347.      * @param name
  348.      *            The name to have vowels removed
  349.      * @return De-voweled word
  350.      */
  351.     String removeVowels(String name) {
  352.         // Extract first letter
  353.         final String firstLetter = name.substring(0, 1);

  354.         name = name.replaceAll("A", EMPTY);
  355.         name = name.replaceAll("E", EMPTY);
  356.         name = name.replaceAll("I", EMPTY);
  357.         name = name.replaceAll("O", EMPTY);
  358.         name = name.replaceAll("U", EMPTY);

  359.         name = name.replaceAll("\\s{2,}\\b", SPACE);

  360.         // return isVowel(firstLetter) ? (firstLetter + name) : name;
  361.         if (isVowel(firstLetter)) {
  362.             return firstLetter + name;
  363.         }
  364.         return name;
  365.     }
  366. }