View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Arrays;
20  import java.util.Objects;
21  
22  /**
23   * A similarity algorithm indicating the percentage of matched characters between two character sequences.
24   *
25   * <p>
26   * The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters. Winkler increased this measure for
27   * matching initial characters.
28   * </p>
29   * <p>
30   * This implementation is based on the Jaro Winkler similarity algorithm from <a href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
31   * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
32   * </p>
33   * <p>
34   * This code has been adapted from Apache Commons Lang 3.3.
35   * </p>
36   *
37   * @since 1.7
38   */
39  public class JaroWinklerSimilarity implements SimilarityScore<Double> {
40  
41      /**
42       * Singleton instance.
43       */
44      static final JaroWinklerSimilarity INSTANCE = new JaroWinklerSimilarity();
45  
46      /**
47       * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
48       *
49       * @param first  the first input to be matched.
50       * @param second the second input to be matched.
51       * @return mtp array containing: matches, half transpositions, and prefix.
52       */
53      protected static int[] matches(final CharSequence first, final CharSequence second) {
54          return matches(SimilarityInput.input(first), SimilarityInput.input(second));
55      }
56  
57      /**
58       * Computes the Jaro-Winkler string matches, half transpositions, prefix array.
59       *
60       * @param <E> The type of similarity score unit.
61       * @param first  the first input to be matched.
62       * @param second the second input to be matched.
63       * @return mtp array containing: matches, half transpositions, and prefix.
64       * @since 1.13.0
65       */
66      protected static <E> int[] matches(final SimilarityInput<E> first, final SimilarityInput<E> second) {
67          final SimilarityInput<E> max;
68          final SimilarityInput<E> min;
69          if (first.length() > second.length()) {
70              max = first;
71              min = second;
72          } else {
73              max = second;
74              min = first;
75          }
76          final int range = Math.max(max.length() / 2 - 1, 0);
77          final int[] matchIndexes = new int[min.length()];
78          Arrays.fill(matchIndexes, -1);
79          final boolean[] matchFlags = new boolean[max.length()];
80          int matches = 0;
81          for (int mi = 0; mi < min.length(); mi++) {
82              final E c1 = min.at(mi);
83              for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
84                  if (!matchFlags[xi] && c1.equals(max.at(xi))) {
85                      matchIndexes[mi] = xi;
86                      matchFlags[xi] = true;
87                      matches++;
88                      break;
89                  }
90              }
91          }
92          final Object[] ms1 = new Object[matches];
93          final Object[] ms2 = new Object[matches];
94          for (int i = 0, si = 0; i < min.length(); i++) {
95              if (matchIndexes[i] != -1) {
96                  ms1[si] = min.at(i);
97                  si++;
98              }
99          }
100         for (int i = 0, si = 0; i < max.length(); i++) {
101             if (matchFlags[i]) {
102                 ms2[si] = max.at(i);
103                 si++;
104             }
105         }
106         int halfTranspositions = 0;
107         for (int mi = 0; mi < ms1.length; mi++) {
108             if (!ms1[mi].equals(ms2[mi])) {
109                 halfTranspositions++;
110             }
111         }
112         int prefix = 0;
113         for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
114             if (!first.at(mi).equals(second.at(mi))) {
115                 break;
116             }
117             prefix++;
118         }
119         return new int[] { matches, halfTranspositions, prefix };
120     }
121 
122     /**
123      * Computes the Jaro Winkler Similarity between two character sequences.
124      *
125      * <pre>
126      * sim.apply(null, null)          = IllegalArgumentException
127      * sim.apply("foo", null)         = IllegalArgumentException
128      * sim.apply(null, "foo")         = IllegalArgumentException
129      * sim.apply("", "")              = 1.0
130      * sim.apply("foo", "foo")        = 1.0
131      * sim.apply("foo", "foo ")       = 0.94
132      * sim.apply("foo", "foo  ")      = 0.91
133      * sim.apply("foo", " foo ")      = 0.87
134      * sim.apply("foo", "  foo")      = 0.51
135      * sim.apply("", "a")             = 0.0
136      * sim.apply("aaapppp", "")       = 0.0
137      * sim.apply("frog", "fog")       = 0.93
138      * sim.apply("fly", "ant")        = 0.0
139      * sim.apply("elephant", "hippo") = 0.44
140      * sim.apply("hippo", "elephant") = 0.44
141      * sim.apply("hippo", "zzzzzzzz") = 0.0
142      * sim.apply("hello", "hallo")    = 0.88
143      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
144      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
145      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
146      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
147      * </pre>
148      *
149      * @param left  the first input, must not be null.
150      * @param right the second input, must not be null.
151      * @return result similarity.
152      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
153      */
154     @Override
155     public Double apply(final CharSequence left, final CharSequence right) {
156         return apply(SimilarityInput.input(left), SimilarityInput.input(right));
157     }
158 
159     /**
160      * Computes the Jaro Winkler Similarity between two character sequences.
161      *
162      * <pre>
163      * sim.apply(null, null)          = IllegalArgumentException
164      * sim.apply("foo", null)         = IllegalArgumentException
165      * sim.apply(null, "foo")         = IllegalArgumentException
166      * sim.apply("", "")              = 1.0
167      * sim.apply("foo", "foo")        = 1.0
168      * sim.apply("foo", "foo ")       = 0.94
169      * sim.apply("foo", "foo  ")      = 0.91
170      * sim.apply("foo", " foo ")      = 0.87
171      * sim.apply("foo", "  foo")      = 0.51
172      * sim.apply("", "a")             = 0.0
173      * sim.apply("aaapppp", "")       = 0.0
174      * sim.apply("frog", "fog")       = 0.93
175      * sim.apply("fly", "ant")        = 0.0
176      * sim.apply("elephant", "hippo") = 0.44
177      * sim.apply("hippo", "elephant") = 0.44
178      * sim.apply("hippo", "zzzzzzzz") = 0.0
179      * sim.apply("hello", "hallo")    = 0.88
180      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
181      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
182      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
183      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
184      * </pre>
185      *
186      * @param <E> The type of similarity score unit.
187      * @param left  the first input, must not be null.
188      * @param right the second input, must not be null.
189      * @return result similarity.
190      * @throws IllegalArgumentException if either CharSequence input is {@code null}.
191      * @since 1.13.0
192      */
193     public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
194         final double defaultScalingFactor = 0.1;
195         if (left == null || right == null) {
196             throw new IllegalArgumentException("CharSequences must not be null");
197         }
198         if (Objects.equals(left, right)) {
199             return 1d;
200         }
201         final int[] mtp = matches(left, right);
202         final double m = mtp[0];
203         if (m == 0) {
204             return 0d;
205         }
206         final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3;
207         return j < 0.7d ? j : j + defaultScalingFactor * mtp[2] * (1d - j);
208     }
209 
210 }