MatchRatingApproachEncoder.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.codec.language;

  18. import java.util.Locale;

  19. import org.apache.commons.codec.EncoderException;
  20. import org.apache.commons.codec.StringEncoder;

  21. /**
  22.  * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
  23.  * <p>
  24.  * This class is immutable and thread-safe.
  25.  * </p>
  26.  *
  27.  * @see <a href="https://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
  28.  * @since 1.8
  29.  */
  30. public class MatchRatingApproachEncoder implements StringEncoder {

  31.     private static final String SPACE = " ";

  32.     private static final String EMPTY = "";

  33.     /**
  34.      * The plain letter equivalent of the accented letters.
  35.      */
  36.     private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
  37.             "AaEeIiOoUuYy" + // acute
  38.             "AaEeIiOoUuYy" + // circumflex
  39.             "AaOoNn" + // tilde
  40.             "AaEeIiOoUuYy" + // umlaut
  41.             "Aa" + // ring
  42.             "Cc" + // cedilla
  43.             "OoUu"; // double acute

  44.     /**
  45.      * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
  46.      */
  47.     private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
  48.             "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
  49.             "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
  50.             "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
  51.             "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
  52.             "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";

  53.     private static final String[] DOUBLE_CONSONANT =
  54.             { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
  55.                    "TT", "VV", "WW", "XX", "YY", "ZZ" };

  56.     /**
  57.      * Constructs a new instance.
  58.      */
  59.     public MatchRatingApproachEncoder() {
  60.         // empty
  61.     }

  62.     /**
  63.      * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
  64.      * spaces.
  65.      *
  66.      * <h2>API Usage</h2>
  67.      * <p>
  68.      * Consider this method private, it is package protected for unit testing only.
  69.      * </p>
  70.      *
  71.      * @param name
  72.      *            The name to be cleaned
  73.      * @return The cleaned name
  74.      */
  75.     String cleanName(final String name) {
  76.         String upperName = name.toUpperCase(Locale.ENGLISH);

  77.         final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
  78.         for (final String str : charsToTrim) {
  79.             upperName = upperName.replaceAll(str, EMPTY);
  80.         }

  81.         upperName = removeAccents(upperName);
  82.         return upperName.replaceAll("\\s+", EMPTY);
  83.     }

  84.     /**
  85.      * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
  86.      * Encoder interface Throws an EncoderException if input object is not of type {@link String}.
  87.      *
  88.      * @param pObject
  89.      *            Object to encode
  90.      * @return An object (or type {@link String}) containing the Match Rating Approach code which corresponds to the
  91.      *         String supplied.
  92.      * @throws EncoderException
  93.      *             if the parameter supplied is not of type {@link String}
  94.      */
  95.     @Override
  96.     public final Object encode(final Object pObject) throws EncoderException {
  97.         if (!(pObject instanceof String)) {
  98.             throw new EncoderException(
  99.                     "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
  100.         }
  101.         return encode((String) pObject);
  102.     }

  103.     /**
  104.      * Encodes a String using the Match Rating Approach (MRA) algorithm.
  105.      *
  106.      * @param name
  107.      *            String object to encode
  108.      * @return The MRA code corresponding to the String supplied
  109.      */
  110.     @Override
  111.     public final String encode(String name) {
  112.         // Bulletproof for trivial input - NINO
  113.         if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
  114.             return EMPTY;
  115.         }

  116.         // Preprocessing
  117.         name = cleanName(name);

  118.         // Bulletproof if name becomes empty after cleanName(name)
  119.         if (SPACE.equals(name) || name.isEmpty()) {
  120.             return EMPTY;
  121.         }

  122.         // BEGIN: Actual encoding part of the algorithm...
  123.         // 1. Delete all vowels unless the vowel begins the word
  124.         name = removeVowels(name);

  125.         // Bulletproof if name becomes empty after removeVowels(name)
  126.         if (SPACE.equals(name) || name.isEmpty()) {
  127.             return EMPTY;
  128.         }

  129.         // 2. Remove second consonant from any double consonant
  130.         name = removeDoubleConsonants(name);

  131.         return getFirst3Last3(name);
  132.     }

  133.     /**
  134.      * Gets the first and last 3 letters of a name (if &gt; 6 characters) Else just returns the name.
  135.      *
  136.      * <h2>API Usage</h2>
  137.      * <p>
  138.      * Consider this method private, it is package protected for unit testing only.
  139.      * </p>
  140.      *
  141.      * @param name
  142.      *            The string to get the substrings from
  143.      * @return Annexed first and last 3 letters of input word.
  144.      */
  145.     String getFirst3Last3(final String name) {
  146.         final int nameLength = name.length();

  147.         if (nameLength > 6) {
  148.             final String firstThree = name.substring(0, 3);
  149.             final String lastThree = name.substring(nameLength - 3, nameLength);
  150.             return firstThree + lastThree;
  151.         }
  152.         return name;
  153.     }

  154.     /**
  155.      * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
  156.      * min rating. Values strictly from documentation.
  157.      *
  158.      * <h2>API Usage</h2>
  159.      * <p>
  160.      * Consider this method private, it is package protected for unit testing only.
  161.      * </p>
  162.      *
  163.      * @param sumLength
  164.      *            The length of 2 strings sent down
  165.      * @return The min rating value
  166.      */
  167.     int getMinRating(final int sumLength) {
  168.         int minRating = 0;

  169.         if (sumLength <= 4) {
  170.             minRating = 5;
  171.         } else if (sumLength <= 7) { // already know it is at least 5
  172.             minRating = 4;
  173.         } else if (sumLength <= 11) { // already know it is at least 8
  174.             minRating = 3;
  175.         } else if (sumLength == 12) {
  176.             minRating = 2;
  177.         } else {
  178.             minRating = 1; // docs said little here.
  179.         }

  180.         return minRating;
  181.     }

  182.     /**
  183.      * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
  184.      * strings are cleaned in the same way as {@link #encode(String)}.
  185.      *
  186.      * @param name1
  187.      *            First of the 2 strings (names) to compare
  188.      * @param name2
  189.      *            Second of the 2 names to compare
  190.      * @return {@code true} if the encodings are identical {@code false} otherwise.
  191.      */
  192.     public boolean isEncodeEquals(String name1, String name2) {
  193.         // Bulletproof for trivial input - NINO
  194.         if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
  195.             return false;
  196.         }
  197.         if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
  198.             return false;
  199.         }
  200.         if (name1.length() == 1 || name2.length() == 1) {
  201.             return false;
  202.         }
  203.         if (name1.equalsIgnoreCase(name2)) {
  204.             return true;
  205.         }

  206.         // Preprocessing
  207.         name1 = cleanName(name1);
  208.         name2 = cleanName(name2);

  209.         // Actual MRA Algorithm

  210.         // 1. Remove vowels
  211.         name1 = removeVowels(name1);
  212.         name2 = removeVowels(name2);

  213.         // 2. Remove double consonants
  214.         name1 = removeDoubleConsonants(name1);
  215.         name2 = removeDoubleConsonants(name2);

  216.         // 3. Reduce down to 3 letters
  217.         name1 = getFirst3Last3(name1);
  218.         name2 = getFirst3Last3(name2);

  219.         // 4. Check for length difference - if 3 or greater, then no similarity
  220.         // comparison is done
  221.         if (Math.abs(name1.length() - name2.length()) >= 3) {
  222.             return false;
  223.         }

  224.         // 5. Obtain the minimum rating value by calculating the length sum of the
  225.         // encoded Strings and sending it down.
  226.         final int sumLength = Math.abs(name1.length() + name2.length());
  227.         final int minRating = getMinRating(sumLength);

  228.         // 6. Process the encoded Strings from left to right and remove any
  229.         // identical characters found from both Strings respectively.
  230.         final int count = leftToRightThenRightToLeftProcessing(name1, name2);

  231.         // 7. Each PNI item that has a similarity rating equal to or greater than
  232.         // the min is considered to be a good candidate match
  233.         return count >= minRating;

  234.     }

  235.     /**
  236.      * Determines if a letter is a vowel.
  237.      *
  238.      * <h2>API Usage</h2>
  239.      * <p>
  240.      * Consider this method private, it is package protected for unit testing only.
  241.      * </p>
  242.      *
  243.      * @param letter
  244.      *            The letter under investigation
  245.      * @return True if a vowel, else false
  246.      */
  247.     boolean isVowel(final String letter) {
  248.         return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
  249.                letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
  250.     }

  251.     /**
  252.      * Processes the names from left to right (first) then right to left removing identical letters in same positions.
  253.      * Then subtracts the longer string that remains from 6 and returns this.
  254.      *
  255.      * <h2>API Usage</h2>
  256.      * <p>
  257.      * Consider this method private, it is package protected for unit testing only.
  258.      * </p>
  259.      *
  260.      * @param name1
  261.      *            name2
  262.      * @return the length as above
  263.      */
  264.     int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
  265.         final char[] name1Char = name1.toCharArray();
  266.         final char[] name2Char = name2.toCharArray();

  267.         final int name1Size = name1.length() - 1;
  268.         final int name2Size = name2.length() - 1;

  269.         String name1LtRStart = EMPTY;
  270.         String name1LtREnd = EMPTY;

  271.         String name2RtLStart = EMPTY;
  272.         String name2RtLEnd = EMPTY;

  273.         for (int i = 0; i < name1Char.length; i++) {
  274.             if (i > name2Size) {
  275.                 break;
  276.             }

  277.             name1LtRStart = name1.substring(i, i + 1);
  278.             name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);

  279.             name2RtLStart = name2.substring(i, i + 1);
  280.             name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);

  281.             // Left to right...
  282.             if (name1LtRStart.equals(name2RtLStart)) {
  283.                 name1Char[i] = ' ';
  284.                 name2Char[i] = ' ';
  285.             }

  286.             // Right to left...
  287.             if (name1LtREnd.equals(name2RtLEnd)) {
  288.                 name1Char[name1Size - i] = ' ';
  289.                 name2Char[name2Size - i] = ' ';
  290.             }
  291.         }

  292.         // Char arrays -> string & remove extraneous space
  293.         final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
  294.         final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);

  295.         // Final bit - subtract the longest string from 6 and return this int value
  296.         if (strA.length() > strB.length()) {
  297.             return Math.abs(6 - strA.length());
  298.         }
  299.         return Math.abs(6 - strB.length());
  300.     }

  301.     /**
  302.      * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved.
  303.      * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
  304.      *
  305.      * @param accentedWord
  306.      *            The word that may have accents in it.
  307.      * @return De-accented word
  308.      */
  309.     String removeAccents(final String accentedWord) {
  310.         if (accentedWord == null) {
  311.             return null;
  312.         }

  313.         final StringBuilder sb = new StringBuilder();
  314.         final int n = accentedWord.length();

  315.         for (int i = 0; i < n; i++) {
  316.             final char c = accentedWord.charAt(i);
  317.             final int pos = UNICODE.indexOf(c);
  318.             if (pos > -1) {
  319.                 sb.append(PLAIN_ASCII.charAt(pos));
  320.             } else {
  321.                 sb.append(c);
  322.             }
  323.         }

  324.         return sb.toString();
  325.     }

  326.     /**
  327.      * Replaces any double consonant pair with the single letter equivalent.
  328.      *
  329.      * <h2>API Usage</h2>
  330.      * <p>
  331.      * Consider this method private, it is package protected for unit testing only.
  332.      * </p>
  333.      *
  334.      * @param name
  335.      *            String to have double consonants removed
  336.      * @return Single consonant word
  337.      */
  338.     String removeDoubleConsonants(final String name) {
  339.         String replacedName = name.toUpperCase(Locale.ENGLISH);
  340.         for (final String dc : DOUBLE_CONSONANT) {
  341.             if (replacedName.contains(dc)) {
  342.                 final String singleLetter = dc.substring(0, 1);
  343.                 replacedName = replacedName.replace(dc, singleLetter);
  344.             }
  345.         }
  346.         return replacedName;
  347.     }

  348.     /**
  349.      * Deletes all vowels unless the vowel begins the word.
  350.      *
  351.      * <h2>API Usage</h2>
  352.      * <p>
  353.      * Consider this method private, it is package protected for unit testing only.
  354.      * </p>
  355.      *
  356.      * @param name
  357.      *            The name to have vowels removed
  358.      * @return De-voweled word
  359.      */
  360.     String removeVowels(String name) {
  361.         // Extract first letter
  362.         final String firstLetter = name.substring(0, 1);

  363.         name = name.replace("A", EMPTY);
  364.         name = name.replace("E", EMPTY);
  365.         name = name.replace("I", EMPTY);
  366.         name = name.replace("O", EMPTY);
  367.         name = name.replace("U", EMPTY);

  368.         name = name.replaceAll("\\s{2,}\\b", SPACE);

  369.         // return isVowel(firstLetter) ? (firstLetter + name) : name;
  370.         if (isVowel(firstLetter)) {
  371.             return firstLetter + name;
  372.         }
  373.         return name;
  374.     }
  375. }