JaroWinklerSimilarity.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.Arrays;
  19. import java.util.Objects;

  20. /**
  21.  * A similarity algorithm indicating the percentage of matched characters between two character sequences.
  22.  *
  23.  * <p>
  24.  * The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters. Winkler increased this measure for
  25.  * matching initial characters.
  26.  * </p>
  27.  * <p>
  28.  * This implementation is based on the Jaro Winkler similarity algorithm from <a href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
  29.  * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
  30.  * </p>
  31.  * <p>
  32.  * This code has been adapted from Apache Commons Lang 3.3.
  33.  * </p>
  34.  *
  35.  * @since 1.7
  36.  */
  37. public class JaroWinklerSimilarity implements SimilarityScore<Double> {

  38.     /**
  39.      * Singleton instance.
  40.      */
  41.     static final JaroWinklerSimilarity INSTANCE = new JaroWinklerSimilarity();

  42.     /**
  43.      * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
  44.      *
  45.      * @param first  the first input to be matched.
  46.      * @param second the second input to be matched.
  47.      * @return mtp array containing: matches, half transpositions, and prefix.
  48.      */
  49.     protected static int[] matches(final CharSequence first, final CharSequence second) {
  50.         return matches(SimilarityInput.input(first), SimilarityInput.input(second));
  51.     }

  52.     /**
  53.      * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
  54.      *
  55.      * @param <E> The type of similarity score unit.
  56.      * @param first  the first input to be matched.
  57.      * @param second the second input to be matched.
  58.      * @return mtp array containing: matches, half transpositions, and prefix.
  59.      * @since 1.13.0
  60.      */
  61.     protected static <E> int[] matches(final SimilarityInput<E> first, final SimilarityInput<E> second) {
  62.         final SimilarityInput<E> max;
  63.         final SimilarityInput<E> min;
  64.         if (first.length() > second.length()) {
  65.             max = first;
  66.             min = second;
  67.         } else {
  68.             max = second;
  69.             min = first;
  70.         }
  71.         final int range = Math.max(max.length() / 2 - 1, 0);
  72.         final int[] matchIndexes = new int[min.length()];
  73.         Arrays.fill(matchIndexes, -1);
  74.         final boolean[] matchFlags = new boolean[max.length()];
  75.         int matches = 0;
  76.         for (int mi = 0; mi < min.length(); mi++) {
  77.             final E c1 = min.at(mi);
  78.             for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
  79.                 if (!matchFlags[xi] && c1.equals(max.at(xi))) {
  80.                     matchIndexes[mi] = xi;
  81.                     matchFlags[xi] = true;
  82.                     matches++;
  83.                     break;
  84.                 }
  85.             }
  86.         }
  87.         final Object[] ms1 = new Object[matches];
  88.         final Object[] ms2 = new Object[matches];
  89.         for (int i = 0, si = 0; i < min.length(); i++) {
  90.             if (matchIndexes[i] != -1) {
  91.                 ms1[si] = min.at(i);
  92.                 si++;
  93.             }
  94.         }
  95.         for (int i = 0, si = 0; i < max.length(); i++) {
  96.             if (matchFlags[i]) {
  97.                 ms2[si] = max.at(i);
  98.                 si++;
  99.             }
  100.         }
  101.         int halfTranspositions = 0;
  102.         for (int mi = 0; mi < ms1.length; mi++) {
  103.             if (!ms1[mi].equals(ms2[mi])) {
  104.                 halfTranspositions++;
  105.             }
  106.         }
  107.         int prefix = 0;
  108.         for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
  109.             if (!first.at(mi).equals(second.at(mi))) {
  110.                 break;
  111.             }
  112.             prefix++;
  113.         }
  114.         return new int[] { matches, halfTranspositions, prefix };
  115.     }

  116.     /**
  117.      * Creates a new instance.
  118.      */
  119.     public JaroWinklerSimilarity() {
  120.         // empty
  121.     }

  122.     /**
  123.      * Computes the Jaro Winkler Similarity between two character sequences.
  124.      *
  125.      * <pre>
  126.      * sim.apply(null, null)          = IllegalArgumentException
  127.      * sim.apply("foo", null)         = IllegalArgumentException
  128.      * sim.apply(null, "foo")         = IllegalArgumentException
  129.      * sim.apply("", "")              = 1.0
  130.      * sim.apply("foo", "foo")        = 1.0
  131.      * sim.apply("foo", "foo ")       = 0.94
  132.      * sim.apply("foo", "foo  ")      = 0.91
  133.      * sim.apply("foo", " foo ")      = 0.87
  134.      * sim.apply("foo", "  foo")      = 0.51
  135.      * sim.apply("", "a")             = 0.0
  136.      * sim.apply("aaapppp", "")       = 0.0
  137.      * sim.apply("frog", "fog")       = 0.93
  138.      * sim.apply("fly", "ant")        = 0.0
  139.      * sim.apply("elephant", "hippo") = 0.44
  140.      * sim.apply("hippo", "elephant") = 0.44
  141.      * sim.apply("hippo", "zzzzzzzz") = 0.0
  142.      * sim.apply("hello", "hallo")    = 0.88
  143.      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
  144.      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
  145.      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
  146.      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
  147.      * </pre>
  148.      *
  149.      * @param left  the first input, must not be null.
  150.      * @param right the second input, must not be null.
  151.      * @return result similarity.
  152.      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
  153.      */
  154.     @Override
  155.     public Double apply(final CharSequence left, final CharSequence right) {
  156.         return apply(SimilarityInput.input(left), SimilarityInput.input(right));
  157.     }

  158.     /**
  159.      * Computes the Jaro Winkler Similarity between two character sequences.
  160.      *
  161.      * <pre>
  162.      * sim.apply(null, null)          = IllegalArgumentException
  163.      * sim.apply("foo", null)         = IllegalArgumentException
  164.      * sim.apply(null, "foo")         = IllegalArgumentException
  165.      * sim.apply("", "")              = 1.0
  166.      * sim.apply("foo", "foo")        = 1.0
  167.      * sim.apply("foo", "foo ")       = 0.94
  168.      * sim.apply("foo", "foo  ")      = 0.91
  169.      * sim.apply("foo", " foo ")      = 0.87
  170.      * sim.apply("foo", "  foo")      = 0.51
  171.      * sim.apply("", "a")             = 0.0
  172.      * sim.apply("aaapppp", "")       = 0.0
  173.      * sim.apply("frog", "fog")       = 0.93
  174.      * sim.apply("fly", "ant")        = 0.0
  175.      * sim.apply("elephant", "hippo") = 0.44
  176.      * sim.apply("hippo", "elephant") = 0.44
  177.      * sim.apply("hippo", "zzzzzzzz") = 0.0
  178.      * sim.apply("hello", "hallo")    = 0.88
  179.      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
  180.      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
  181.      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
  182.      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
  183.      * </pre>
  184.      *
  185.      * @param <E> The type of similarity score unit.
  186.      * @param left  the first input, must not be null.
  187.      * @param right the second input, must not be null.
  188.      * @return result similarity.
  189.      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
  190.      * @since 1.13.0
  191.      */
  192.     public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
  193.         final double defaultScalingFactor = 0.1;
  194.         if (left == null || right == null) {
  195.             throw new IllegalArgumentException("CharSequences must not be null");
  196.         }
  197.         if (Objects.equals(left, right)) {
  198.             return 1d;
  199.         }
  200.         final int[] mtp = matches(left, right);
  201.         final double m = mtp[0];
  202.         if (m == 0) {
  203.             return 0d;
  204.         }
  205.         final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3;
  206.         return j < 0.7d ? j : j + defaultScalingFactor * mtp[2] * (1d - j);
  207.     }

  208. }