View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Arrays;
20  import java.util.Objects;
21  
22  /**
23   * A similarity algorithm indicating the percentage of matched characters between two character sequences.
24   *
25   * <p>
26   * The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters. Winkler increased this measure for
27   * matching initial characters.
28   * </p>
29   * <p>
30   * This implementation is based on the Jaro Winkler similarity algorithm from <a href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
31   * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
32   * </p>
33   * <p>
34   * This code has been adapted from Apache Commons Lang 3.3.
35   * </p>
36   *
37   * @since 1.7
38   */
39  public class JaroWinklerSimilarity implements SimilarityScore<Double> {
40  
41      /**
42       * The singleton instance.
43       */
44      static final JaroWinklerSimilarity INSTANCE = new JaroWinklerSimilarity();
45  
46      /**
47       * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
48       *
49       * @param first  the first input to be matched.
50       * @param second the second input to be matched.
51       * @return mtp array containing: matches, half transpositions, and prefix.
52       */
53      protected static int[] matches(final CharSequence first, final CharSequence second) {
54          return matches(SimilarityInput.input(first), SimilarityInput.input(second));
55      }
56  
57      /**
58       * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
59       *
60       * @param <E> The type of similarity score unit.
61       * @param first  the first input to be matched.
62       * @param second the second input to be matched.
63       * @return mtp array containing: matches, half transpositions, and prefix.
64       * @since 1.13.0
65       */
66      protected static <E> int[] matches(final SimilarityInput<E> first, final SimilarityInput<E> second) {
67          final SimilarityInput<E> max;
68          final SimilarityInput<E> min;
69          if (first.length() > second.length()) {
70              max = first;
71              min = second;
72          } else {
73              max = second;
74              min = first;
75          }
76          final int range = Math.max(max.length() / 2 - 1, 0);
77          final int[] matchIndexes = new int[min.length()];
78          Arrays.fill(matchIndexes, -1);
79          final boolean[] matchFlags = new boolean[max.length()];
80          int matches = 0;
81          for (int mi = 0; mi < min.length(); mi++) {
82              final E c1 = min.at(mi);
83              for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
84                  if (!matchFlags[xi] && c1.equals(max.at(xi))) {
85                      matchIndexes[mi] = xi;
86                      matchFlags[xi] = true;
87                      matches++;
88                      break;
89                  }
90              }
91          }
92          final Object[] ms1 = new Object[matches];
93          final Object[] ms2 = new Object[matches];
94          for (int i = 0, si = 0; i < min.length(); i++) {
95              if (matchIndexes[i] != -1) {
96                  ms1[si] = min.at(i);
97                  si++;
98              }
99          }
100         for (int i = 0, si = 0; i < max.length(); i++) {
101             if (matchFlags[i]) {
102                 ms2[si] = max.at(i);
103                 si++;
104             }
105         }
106         int halfTranspositions = 0;
107         for (int mi = 0; mi < ms1.length; mi++) {
108             if (!ms1[mi].equals(ms2[mi])) {
109                 halfTranspositions++;
110             }
111         }
112         int prefix = 0;
113         for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
114             if (!first.at(mi).equals(second.at(mi))) {
115                 break;
116             }
117             prefix++;
118         }
119         return new int[] { matches, halfTranspositions, prefix };
120     }
121 
122     /**
123      * Creates a new instance.
124      */
125     public JaroWinklerSimilarity() {
126         // empty
127     }
128 
129     /**
130      * Computes the Jaro Winkler Similarity between two character sequences.
131      *
132      * <pre>
133      * sim.apply(null, null)          = IllegalArgumentException
134      * sim.apply("foo", null)         = IllegalArgumentException
135      * sim.apply(null, "foo")         = IllegalArgumentException
136      * sim.apply("", "")              = 1.0
137      * sim.apply("foo", "foo")        = 1.0
138      * sim.apply("foo", "foo ")       = 0.94
139      * sim.apply("foo", "foo  ")      = 0.91
140      * sim.apply("foo", " foo ")      = 0.87
141      * sim.apply("foo", "  foo")      = 0.51
142      * sim.apply("", "a")             = 0.0
143      * sim.apply("aaapppp", "")       = 0.0
144      * sim.apply("frog", "fog")       = 0.93
145      * sim.apply("fly", "ant")        = 0.0
146      * sim.apply("elephant", "hippo") = 0.44
147      * sim.apply("hippo", "elephant") = 0.44
148      * sim.apply("hippo", "zzzzzzzz") = 0.0
149      * sim.apply("hello", "hallo")    = 0.88
150      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
151      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
152      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
153      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
154      * </pre>
155      *
156      * @param left  the first input, must not be null.
157      * @param right the second input, must not be null.
158      * @return result similarity.
159      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
160      */
161     @Override
162     public Double apply(final CharSequence left, final CharSequence right) {
163         return apply(SimilarityInput.input(left), SimilarityInput.input(right));
164     }
165 
166     /**
167      * Computes the Jaro Winkler Similarity between two character sequences.
168      *
169      * <pre>
170      * sim.apply(null, null)          = IllegalArgumentException
171      * sim.apply("foo", null)         = IllegalArgumentException
172      * sim.apply(null, "foo")         = IllegalArgumentException
173      * sim.apply("", "")              = 1.0
174      * sim.apply("foo", "foo")        = 1.0
175      * sim.apply("foo", "foo ")       = 0.94
176      * sim.apply("foo", "foo  ")      = 0.91
177      * sim.apply("foo", " foo ")      = 0.87
178      * sim.apply("foo", "  foo")      = 0.51
179      * sim.apply("", "a")             = 0.0
180      * sim.apply("aaapppp", "")       = 0.0
181      * sim.apply("frog", "fog")       = 0.93
182      * sim.apply("fly", "ant")        = 0.0
183      * sim.apply("elephant", "hippo") = 0.44
184      * sim.apply("hippo", "elephant") = 0.44
185      * sim.apply("hippo", "zzzzzzzz") = 0.0
186      * sim.apply("hello", "hallo")    = 0.88
187      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
188      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
189      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
190      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
191      * </pre>
192      *
193      * @param <E> The type of similarity score unit.
194      * @param left  the first input, must not be null.
195      * @param right the second input, must not be null.
196      * @return result similarity.
197      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
198      * @since 1.13.0
199      */
200     public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
201         final double defaultScalingFactor = 0.1;
202         if (left == null || right == null) {
203             throw new IllegalArgumentException("CharSequences must not be null");
204         }
205         if (Objects.equals(left, right)) {
206             return 1d;
207         }
208         final int[] mtp = matches(left, right);
209         final double m = mtp[0];
210         if (m == 0) {
211             return 0d;
212         }
213         final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3;
214         return j < 0.7d ? j : j + defaultScalingFactor * mtp[2] * (1d - j);
215     }
216 
217 }