View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Arrays;
20  
21  /**
22   * An algorithm for measuring the difference between two character sequences.
23   *
24   * <p>
25   * This is the number of changes needed to change one sequence into another,
26   * where each change is a single character modification (deletion, insertion
27   * or substitution).
28   * </p>
29   *
30   * <p>
31   * This code has been adapted from Apache Commons Lang 3.3.
32   * </p>
33   *
34   * @since 1.0
35   */
36  public class LevenshteinDistance implements EditDistance<Integer> {
37  
38      /**
39       * Default instance.
40       */
41      private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance();
42  
43      /**
44       * Threshold.
45       */
46      private final Integer threshold;
47  
48      /**
49       * <p>
50       * This returns the default instance that uses a version
51       * of the algorithm that does not use a threshold parameter.
52       * </p>
53       *
54       * @see LevenshteinDistance#getDefaultInstance()
55       */
56      public LevenshteinDistance() {
57          this(null);
58      }
59  
60      /**
61       * <p>
62       * If the threshold is not null, distance calculations will be limited to a maximum length.
63       * If the threshold is null, the unlimited version of the algorithm will be used.
64       * </p>
65       *
66       * @param threshold
67       *        If this is null then distances calculations will not be limited.
68       *        This may not be negative.
69       */
70      public LevenshteinDistance(final Integer threshold) {
71          if (threshold != null && threshold < 0) {
72              throw new IllegalArgumentException("Threshold must not be negative");
73          }
74          this.threshold = threshold;
75      }
76  
77      /**
78       * <p>Find the Levenshtein distance between two Strings.</p>
79       *
80       * <p>A higher score indicates a greater distance.</p>
81       *
82       * <p>The previous implementation of the Levenshtein distance algorithm
83       * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
84       *
85       * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
86       * which can occur when my Java implementation is used with very large strings.<br>
87       * This implementation of the Levenshtein distance algorithm
88       * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
89       *
90       * <pre>
91       * distance.apply(null, *)             = IllegalArgumentException
92       * distance.apply(*, null)             = IllegalArgumentException
93       * distance.apply("","")               = 0
94       * distance.apply("","a")              = 1
95       * distance.apply("aaapppp", "")       = 7
96       * distance.apply("frog", "fog")       = 1
97       * distance.apply("fly", "ant")        = 3
98       * distance.apply("elephant", "hippo") = 7
99       * distance.apply("hippo", "elephant") = 7
100      * distance.apply("hippo", "zzzzzzzz") = 8
101      * distance.apply("hello", "hallo")    = 1
102      * </pre>
103      *
104      * @param left the first string, must not be null
105      * @param right the second string, must not be null
106      * @return result distance, or -1
107      * @throws IllegalArgumentException if either String input {@code null}
108      */
109     @Override
110     public Integer apply(final CharSequence left, final CharSequence right) {
111         if (threshold != null) {
112             return limitedCompare(left, right, threshold);
113         }
114         return unlimitedCompare(left, right);
115     }
116 
117     /**
118      * Gets the default instance.
119      *
120      * @return the default instace
121      */
122     public static LevenshteinDistance getDefaultInstance() {
123         return DEFAULT_INSTANCE;
124     }
125 
126     /**
127      * Gets the distance threshold.
128      *
129      * @return the distance threshold
130      */
131     public Integer getThreshold() {
132         return threshold;
133     }
134 
135     /**
136      * Find the Levenshtein distance between two CharSequences if it's less than or
137      * equal to a given threshold.
138      *
139      * <p>
140      * This implementation follows from Algorithms on Strings, Trees and
141      * Sequences by Dan Gusfield and Chas Emerick's implementation of the
142      * Levenshtein distance algorithm from <a
143      * href="http://www.merriampark.com/ld.htm"
144      * >http://www.merriampark.com/ld.htm</a>;
145      * </p>
146      *
147      * <pre>
148      * limitedCompare(null, *, *)             = IllegalArgumentException
149      * limitedCompare(*, null, *)             = IllegalArgumentException
150      * limitedCompare(*, *, -1)               = IllegalArgumentException
151      * limitedCompare("","", 0)               = 0
152      * limitedCompare("aaapppp", "", 8)       = 7
153      * limitedCompare("aaapppp", "", 7)       = 7
154      * limitedCompare("aaapppp", "", 6))      = -1
155      * limitedCompare("elephant", "hippo", 7) = 7
156      * limitedCompare("elephant", "hippo", 6) = -1
157      * limitedCompare("hippo", "elephant", 7) = 7
158      * limitedCompare("hippo", "elephant", 6) = -1
159      * </pre>
160      *
161      * @param left the first string, must not be null
162      * @param right the second string, must not be null
163      * @param threshold the target threshold, must not be negative
164      * @return result distance, or -1
165      */
166     private static int limitedCompare(CharSequence left, CharSequence right, final int threshold) { // NOPMD
167         if (left == null || right == null) {
168             throw new IllegalArgumentException("Strings must not be null");
169         }
170         if (threshold < 0) {
171             throw new IllegalArgumentException("Threshold must not be negative");
172         }
173 
174         /*
175          * This implementation only computes the distance if it's less than or
176          * equal to the threshold value, returning -1 if it's greater. The
177          * advantage is performance: unbounded distance is O(nm), but a bound of
178          * k allows us to reduce it to O(km) time by only computing a diagonal
179          * stripe of width 2k + 1 of the cost table. It is also possible to use
180          * this to compute the unbounded Levenshtein distance by starting the
181          * threshold at 1 and doubling each time until the distance is found;
182          * this is O(dm), where d is the distance.
183          *
184          * One subtlety comes from needing to ignore entries on the border of
185          * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
186          * to the left of the leftmost member We must ignore the entry above the
187          * rightmost member
188          *
189          * Another subtlety comes from our stripe running off the matrix if the
190          * strings aren't of the same size. Since string s is always swapped to
191          * be the shorter of the two, the stripe will always run off to the
192          * upper right instead of the lower left of the matrix.
193          *
194          * As a concrete example, suppose s is of length 5, t is of length 7,
195          * and our threshold is 1. In this case we're going to walk a stripe of
196          * length 3. The matrix would look like so:
197          *
198          * <pre>
199          *    1 2 3 4 5
200          * 1 |#|#| | | |
201          * 2 |#|#|#| | |
202          * 3 | |#|#|#| |
203          * 4 | | |#|#|#|
204          * 5 | | | |#|#|
205          * 6 | | | | |#|
206          * 7 | | | | | |
207          * </pre>
208          *
209          * Note how the stripe leads off the table as there is no possible way
210          * to turn a string of length 5 into one of length 7 in edit distance of
211          * 1.
212          *
213          * Additionally, this implementation decreases memory usage by using two
214          * single-dimensional arrays and swapping them back and forth instead of
215          * allocating an entire n by m matrix. This requires a few minor
216          * changes, such as immediately returning when it's detected that the
217          * stripe has run off the matrix and initially filling the arrays with
218          * large values so that entries we don't compute are ignored.
219          *
220          * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
221          * some discussion.
222          */
223 
224         int n = left.length(); // length of left
225         int m = right.length(); // length of right
226 
227         // if one string is empty, the edit distance is necessarily the length
228         // of the other
229         if (n == 0) {
230             return m <= threshold ? m : -1;
231         } else if (m == 0) {
232             return n <= threshold ? n : -1;
233         }
234 
235         if (n > m) {
236             // swap the two strings to consume less memory
237             final CharSequence tmp = left;
238             left = right;
239             right = tmp;
240             n = m;
241             m = right.length();
242         }
243 
244         int[] p = new int[n + 1]; // 'previous' cost array, horizontally
245         int[] d = new int[n + 1]; // cost array, horizontally
246         int[] tempD; // placeholder to assist in swapping p and d
247 
248         // fill in starting table values
249         final int boundary = Math.min(n, threshold) + 1;
250         for (int i = 0; i < boundary; i++) {
251             p[i] = i;
252         }
253         // these fills ensure that the value above the rightmost entry of our
254         // stripe will be ignored in following loop iterations
255         Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
256         Arrays.fill(d, Integer.MAX_VALUE);
257 
258         // iterates through t
259         for (int j = 1; j <= m; j++) {
260             final char rightJ = right.charAt(j - 1); // jth character of right
261             d[0] = j;
262 
263             // compute stripe indices, constrain to array size
264             final int min = Math.max(1, j - threshold);
265             final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
266                     n, j + threshold);
267 
268             // the stripe may lead off of the table if s and t are of different
269             // sizes
270             if (min > max) {
271                 return -1;
272             }
273 
274             // ignore entry left of leftmost
275             if (min > 1) {
276                 d[min - 1] = Integer.MAX_VALUE;
277             }
278 
279             // iterates through [min, max] in s
280             for (int i = min; i <= max; i++) {
281                 if (left.charAt(i - 1) == rightJ) {
282                     // diagonally left and up
283                     d[i] = p[i - 1];
284                 } else {
285                     // 1 + minimum of cell to the left, to the top, diagonally
286                     // left and up
287                     d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
288                 }
289             }
290 
291             // copy current distance counts to 'previous row' distance counts
292             tempD = p;
293             p = d;
294             d = tempD;
295         }
296 
297         // if p[n] is greater than the threshold, there's no guarantee on it
298         // being the correct
299         // distance
300         if (p[n] <= threshold) {
301             return p[n];
302         }
303         return -1;
304     }
305 
306     /**
307      * <p>Find the Levenshtein distance between two Strings.</p>
308      *
309      * <p>A higher score indicates a greater distance.</p>
310      *
311      * <p>The previous implementation of the Levenshtein distance algorithm
312      * was from <a href="https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm">
313      * https://web.archive.org/web/20120526085419/http://www.merriampark.com/ldjava.htm</a></p>;
314      *
315      * <p>This implementation only need one single-dimensional arrays of length s.length() + 1</p>
316      *
317      * <pre>
318      * unlimitedCompare(null, *)             = IllegalArgumentException
319      * unlimitedCompare(*, null)             = IllegalArgumentException
320      * unlimitedCompare("","")               = 0
321      * unlimitedCompare("","a")              = 1
322      * unlimitedCompare("aaapppp", "")       = 7
323      * unlimitedCompare("frog", "fog")       = 1
324      * unlimitedCompare("fly", "ant")        = 3
325      * unlimitedCompare("elephant", "hippo") = 7
326      * unlimitedCompare("hippo", "elephant") = 7
327      * unlimitedCompare("hippo", "zzzzzzzz") = 8
328      * unlimitedCompare("hello", "hallo")    = 1
329      * </pre>
330      *
331      * @param left the first String, must not be null
332      * @param right the second String, must not be null
333      * @return result distance, or -1
334      * @throws IllegalArgumentException if either String input {@code null}
335      */
336     private static int unlimitedCompare(CharSequence left, CharSequence right) {
337         if (left == null || right == null) {
338             throw new IllegalArgumentException("Strings must not be null");
339         }
340 
341         /*
342            This implementation use two variable to record the previous cost counts,
343            So this implementation use less memory than previous impl.
344          */
345 
346         int n = left.length(); // length of left
347         int m = right.length(); // length of right
348 
349         if (n == 0) {
350             return m;
351         } else if (m == 0) {
352             return n;
353         }
354 
355         if (n > m) {
356             // swap the input strings to consume less memory
357             final CharSequence tmp = left;
358             left = right;
359             right = tmp;
360             n = m;
361             m = right.length();
362         }
363 
364         int[] p = new int[n + 1];
365 
366         // indexes into strings left and right
367         int i; // iterates through left
368         int j; // iterates through right
369         int upper_left;
370         int upper;
371 
372         char rightJ; // jth character of right
373         int cost; // cost
374 
375         for (i = 0; i <= n; i++) {
376             p[i] = i;
377         }
378 
379         for (j = 1; j <= m; j++) {
380             upper_left = p[0];
381             rightJ = right.charAt(j - 1);
382             p[0] = j;
383 
384             for (i = 1; i <= n; i++) {
385                 upper = p[i];
386                 cost = left.charAt(i - 1) == rightJ ? 0 : 1;
387                 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
388                 p[i] = Math.min(Math.min(p[i - 1] + 1, p[i] + 1), upper_left + cost);
389                 upper_left = upper;
390             }
391         }
392 
393         return p[n];
394     }
395 
396 }