JaroWinklerDistance.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. /**
  19.  * Measures the Jaro-Winkler distance of two character sequences.
  20.  * It is the complementary of Jaro-Winkler similarity.
  21.  *
  22.  * @since 1.0
  23.  */
  24. public class JaroWinklerDistance implements EditDistance<Double> {

  25.     /**
  26.      * @deprecated Deprecated as of 1.7. This constant will be removed in 2.0.
  27.      */
  28.     @Deprecated
  29.     public static final int INDEX_NOT_FOUND = -1;

  30.     /**
  31.      * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
  32.      *
  33.      * @param first the first string to be matched.
  34.      * @param second the second string to be matched.
  35.      * @return array containing: matches, half transpositions, and prefix
  36.      * @deprecated Deprecated as of 1.7. This method will be removed in 2.0, and moved to a Jaro Winkler similarity
  37.      *             class. TODO see TEXT-104.
  38.      */
  39.     @Deprecated
  40.     protected static int[] matches(final CharSequence first, final CharSequence second) {
  41.         return JaroWinklerSimilarity.matches(first, second);
  42.     }

  43.     /**
  44.      * Creates a new instance.
  45.      */
  46.     public JaroWinklerDistance() {
  47.         // empty
  48.     }

  49.     /**
  50.      * Computes the Jaro Winkler Distance between two character sequences.
  51.      *
  52.      * <pre>
  53.      * distance.apply(null, null)          = IllegalArgumentException
  54.      * distance.apply("foo", null)         = IllegalArgumentException
  55.      * distance.apply(null, "foo")         = IllegalArgumentException
  56.      * distance.apply("", "")              = 0.0
  57.      * distance.apply("foo", "foo")        = 0.0
  58.      * distance.apply("foo", "foo ")       = 0.06
  59.      * distance.apply("foo", "foo  ")      = 0.09
  60.      * distance.apply("foo", " foo ")      = 0.13
  61.      * distance.apply("foo", "  foo")      = 0.49
  62.      * distance.apply("", "a")             = 1.0
  63.      * distance.apply("aaapppp", "")       = 1.0
  64.      * distance.apply("frog", "fog")       = 0.07
  65.      * distance.apply("fly", "ant")        = 1.0
  66.      * distance.apply("elephant", "hippo") = 0.56
  67.      * distance.apply("hippo", "elephant") = 0.56
  68.      * distance.apply("hippo", "zzzzzzzz") = 1.0
  69.      * distance.apply("hello", "hallo")    = 0.12
  70.      * distance.apply("ABC Corporation", "ABC Corp") = 0.09
  71.      * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.05
  72.      * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.08
  73.      * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.12
  74.      * </pre>
  75.      *
  76.      * @param left the first input, must not be null.
  77.      * @param right the second input, must not be null.
  78.      * @return result distance.
  79.      * @throws IllegalArgumentException if either CharSequence input is {@code null}
  80.      */
  81.     @Override
  82.     public Double apply(final CharSequence left, final CharSequence right) {
  83.         return apply(SimilarityInput.input(left), SimilarityInput.input(right));
  84.     }

  85.     /**
  86.      * Computes the Jaro Winkler Distance between two character sequences.
  87.      *
  88.      * <pre>
  89.      * distance.apply(null, null)          = IllegalArgumentException
  90.      * distance.apply("foo", null)         = IllegalArgumentException
  91.      * distance.apply(null, "foo")         = IllegalArgumentException
  92.      * distance.apply("", "")              = 0.0
  93.      * distance.apply("foo", "foo")        = 0.0
  94.      * distance.apply("foo", "foo ")       = 0.06
  95.      * distance.apply("foo", "foo  ")      = 0.09
  96.      * distance.apply("foo", " foo ")      = 0.13
  97.      * distance.apply("foo", "  foo")      = 0.49
  98.      * distance.apply("", "a")             = 1.0
  99.      * distance.apply("aaapppp", "")       = 1.0
  100.      * distance.apply("frog", "fog")       = 0.07
  101.      * distance.apply("fly", "ant")        = 1.0
  102.      * distance.apply("elephant", "hippo") = 0.56
  103.      * distance.apply("hippo", "elephant") = 0.56
  104.      * distance.apply("hippo", "zzzzzzzz") = 1.0
  105.      * distance.apply("hello", "hallo")    = 0.12
  106.      * distance.apply("ABC Corporation", "ABC Corp") = 0.09
  107.      * distance.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.05
  108.      * distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.08
  109.      * distance.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.12
  110.      * </pre>
  111.      *
  112.      * @param <E> The type of similarity score unit.
  113.      * @param left the first input, must not be null.
  114.      * @param right the second input, must not be null.
  115.      * @return result distance.
  116.      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
  117.      * @since 1.13.0
  118.      */
  119.     public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
  120.         if (left == null || right == null) {
  121.             throw new IllegalArgumentException("CharSequences must not be null");
  122.         }
  123.         return 1 - JaroWinklerSimilarity.INSTANCE.apply(left, right);
  124.     }
  125. }