JaroWinklerDistance.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.Arrays;

  19. /**
  20.  * A similarity algorithm indicating the percentage of matched characters between two character sequences.
  21.  *
  22.  * <p>
  23.  * The Jaro measure is the weighted sum of percentage of matched characters
  24.  * from each file and transposed characters. Winkler increased this measure
  25.  * for matching initial characters.
  26.  * </p>
  27.  *
  28.  * <p>
  29.  * This implementation is based on the Jaro Winkler similarity algorithm
  30.  * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
  31.  * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
  32.  * </p>
  33.  *
  34.  * <p>
  35.  * This code has been adapted from Apache Commons Lang 3.3.
  36.  * </p>
  37.  *
  38.  * @since 1.0
  39.  */
  40. public class JaroWinklerDistance implements SimilarityScore<Double> {

  41.     /**
  42.      * The default prefix length limit set to four.
  43.      */
  44.     private static final int PREFIX_LENGTH_LIMIT = 4;
  45.     /**
  46.      * Represents a failed index search.
  47.      */
  48.     public static final int INDEX_NOT_FOUND = -1;

  49.     /**
  50.      * Find the Jaro Winkler Distance which indicates the similarity score
  51.      * between two CharSequences.
  52.      *
  53.      * <pre>
  54.      * distance.apply(null, null)          = IllegalArgumentException
  55.      * distance.apply("","")               = 0.0
  56.      * distance.apply("","a")              = 0.0
  57.      * distance.apply("aaapppp", "")       = 0.0
  58.      * distance.apply("frog", "fog")       = 0.93
  59.      * distance.apply("fly", "ant")        = 0.0
  60.      * distance.apply("elephant", "hippo") = 0.44
  61.      * distance.apply("hippo", "elephant") = 0.44
  62.      * distance.apply("hippo", "zzzzzzzz") = 0.0
  63.      * distance.apply("hello", "hallo")    = 0.88
  64.      * distance.apply("ABC Corporation", "ABC Corp") = 0.93
  65.      * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
  66.      * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
  67.      * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA")    = 0.88
  68.      * </pre>
  69.      *
  70.      * @param left the first String, must not be null
  71.      * @param right the second String, must not be null
  72.      * @return result distance
  73.      * @throws IllegalArgumentException if either String input {@code null}
  74.      */
  75.     @Override
  76.     public Double apply(final CharSequence left, final CharSequence right) {
  77.         final double defaultScalingFactor = 0.1;
  78.         final double percentageRoundValue = 100.0;

  79.         if (left == null || right == null) {
  80.             throw new IllegalArgumentException("Strings must not be null");
  81.         }

  82.         int[] mtp = matches(left, right);
  83.         double m = mtp[0];
  84.         if (m == 0) {
  85.             return 0D;
  86.         }
  87.         double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
  88.         double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
  89.         return Math.round(jw * percentageRoundValue) / percentageRoundValue;
  90.     }

  91.     /**
  92.      * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array.
  93.      *
  94.      * @param first the first string to be matched
  95.      * @param second the second string to be machted
  96.      * @return mtp array containing: matches, transpositions, prefix, and max length
  97.      */
  98.     protected static int[] matches(final CharSequence first, final CharSequence second) {
  99.         CharSequence max, min;
  100.         if (first.length() > second.length()) {
  101.             max = first;
  102.             min = second;
  103.         } else {
  104.             max = second;
  105.             min = first;
  106.         }
  107.         int range = Math.max(max.length() / 2 - 1, 0);
  108.         int[] matchIndexes = new int[min.length()];
  109.         Arrays.fill(matchIndexes, -1);
  110.         boolean[] matchFlags = new boolean[max.length()];
  111.         int matches = 0;
  112.         for (int mi = 0; mi < min.length(); mi++) {
  113.             char c1 = min.charAt(mi);
  114.             for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
  115.                 if (!matchFlags[xi] && c1 == max.charAt(xi)) {
  116.                     matchIndexes[mi] = xi;
  117.                     matchFlags[xi] = true;
  118.                     matches++;
  119.                     break;
  120.                 }
  121.             }
  122.         }
  123.         char[] ms1 = new char[matches];
  124.         char[] ms2 = new char[matches];
  125.         for (int i = 0, si = 0; i < min.length(); i++) {
  126.             if (matchIndexes[i] != -1) {
  127.                 ms1[si] = min.charAt(i);
  128.                 si++;
  129.             }
  130.         }
  131.         for (int i = 0, si = 0; i < max.length(); i++) {
  132.             if (matchFlags[i]) {
  133.                 ms2[si] = max.charAt(i);
  134.                 si++;
  135.             }
  136.         }
  137.         int transpositions = 0;
  138.         for (int mi = 0; mi < ms1.length; mi++) {
  139.             if (ms1[mi] != ms2[mi]) {
  140.                 transpositions++;
  141.             }
  142.         }
  143.         int prefix = 0;
  144.         for (int mi = 0; mi < min.length(); mi++) {
  145.             if (first.charAt(mi) == second.charAt(mi)) {
  146.                 prefix++;
  147.             } else {
  148.                 break;
  149.             }
  150.         }
  151.         return new int[] { matches, transpositions / 2, prefix, max.length() };
  152.     }

  153. }