View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Arrays;
20  
21  import org.apache.commons.lang3.StringUtils;
22  
23  /**
24   * A similarity algorithm indicating the percentage of matched characters between two character sequences.
25   *
26   * <p>
27   * The Jaro measure is the weighted sum of percentage of matched characters
28   * from each file and transposed characters. Winkler increased this measure
29   * for matching initial characters.
30   * </p>
31   *
32   * <p>
33   * This implementation is based on the Jaro Winkler similarity algorithm
34   * from <a href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
35   * https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>.
36   * </p>
37   *
38   * <p>
39   * This code has been adapted from Apache Commons Lang 3.3.
40   * </p>
41   *
42   * @since 1.7
43   */
44  public class JaroWinklerSimilarity implements SimilarityScore<Double> {
45  
46      /**
47       * Singleton instance.
48       */
49      static final JaroWinklerSimilarity INSTANCE = new JaroWinklerSimilarity();
50  
51      /**
52       * This method returns the Jaro-Winkler string matches, half transpositions, prefix array.
53       *
54       * @param first the first string to be matched
55       * @param second the second string to be matched
56       * @return mtp array containing: matches, half transpositions, and prefix
57       */
58      protected static int[] matches(final CharSequence first, final CharSequence second) {
59          final CharSequence max;
60          final CharSequence min;
61          if (first.length() > second.length()) {
62              max = first;
63              min = second;
64          } else {
65              max = second;
66              min = first;
67          }
68          final int range = Math.max(max.length() / 2 - 1, 0);
69          final int[] matchIndexes = new int[min.length()];
70          Arrays.fill(matchIndexes, -1);
71          final boolean[] matchFlags = new boolean[max.length()];
72          int matches = 0;
73          for (int mi = 0; mi < min.length(); mi++) {
74              final char c1 = min.charAt(mi);
75              for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
76                  if (!matchFlags[xi] && c1 == max.charAt(xi)) {
77                      matchIndexes[mi] = xi;
78                      matchFlags[xi] = true;
79                      matches++;
80                      break;
81                  }
82              }
83          }
84          final char[] ms1 = new char[matches];
85          final char[] ms2 = new char[matches];
86          for (int i = 0, si = 0; i < min.length(); i++) {
87              if (matchIndexes[i] != -1) {
88                  ms1[si] = min.charAt(i);
89                  si++;
90              }
91          }
92          for (int i = 0, si = 0; i < max.length(); i++) {
93              if (matchFlags[i]) {
94                  ms2[si] = max.charAt(i);
95                  si++;
96              }
97          }
98          int halfTranspositions = 0;
99          for (int mi = 0; mi < ms1.length; mi++) {
100             if (ms1[mi] != ms2[mi]) {
101                 halfTranspositions++;
102             }
103         }
104         int prefix = 0;
105         for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
106             if (first.charAt(mi) != second.charAt(mi)) {
107                 break;
108             }
109             prefix++;
110         }
111         return new int[] {matches, halfTranspositions, prefix};
112     }
113 
114     /**
115      * Computes the Jaro Winkler Similarity between two character sequences.
116      *
117      * <pre>
118      * sim.apply(null, null)          = IllegalArgumentException
119      * sim.apply("foo", null)         = IllegalArgumentException
120      * sim.apply(null, "foo")         = IllegalArgumentException
121      * sim.apply("", "")              = 1.0
122      * sim.apply("foo", "foo")        = 1.0
123      * sim.apply("foo", "foo ")       = 0.94
124      * sim.apply("foo", "foo  ")      = 0.91
125      * sim.apply("foo", " foo ")      = 0.87
126      * sim.apply("foo", "  foo")      = 0.51
127      * sim.apply("", "a")             = 0.0
128      * sim.apply("aaapppp", "")       = 0.0
129      * sim.apply("frog", "fog")       = 0.93
130      * sim.apply("fly", "ant")        = 0.0
131      * sim.apply("elephant", "hippo") = 0.44
132      * sim.apply("hippo", "elephant") = 0.44
133      * sim.apply("hippo", "zzzzzzzz") = 0.0
134      * sim.apply("hello", "hallo")    = 0.88
135      * sim.apply("ABC Corporation", "ABC Corp") = 0.91
136      * sim.apply("D N H Enterprises Inc", "D &amp; H Enterprises, Inc.") = 0.95
137      * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92
138      * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88
139      * </pre>
140      *
141      * @param left the first CharSequence, must not be null
142      * @param right the second CharSequence, must not be null
143      * @return result similarity
144      * @throws IllegalArgumentException if either CharSequence input is {@code null}
145      */
146     @Override
147     public Double apply(final CharSequence left, final CharSequence right) {
148         final double defaultScalingFactor = 0.1;
149 
150         if (left == null || right == null) {
151             throw new IllegalArgumentException("CharSequences must not be null");
152         }
153 
154         if (StringUtils.equals(left, right)) {
155             return 1d;
156         }
157 
158         final int[] mtp = matches(left, right);
159         final double m = mtp[0];
160         if (m == 0) {
161             return 0d;
162         }
163         final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3;
164         return j < 0.7d ? j : j + defaultScalingFactor * mtp[2] * (1d - j);
165     }
166 
167 }