View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Arrays;
20  
21  /**
22   * An algorithm for measuring the difference between two character sequences.
23   *
24   * <p>
25   * This is the number of changes needed to change one sequence into another,
26   * where each change is a single character modification (deletion, insertion
27   * or substitution).
28   * </p>
29   *
30   * @since 1.0
31   */
32  public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> {
33  
34      /**
35       * Singleton instance.
36       */
37      private static final LevenshteinDetailedDistance INSTANCE = new LevenshteinDetailedDistance();
38  
39      /**
40       * Finds count for each of the three [insert, delete, substitute] operations
41       * needed. This is based on the matrix formed based on the two character
42       * sequence.
43       *
44       * @param left character sequence which need to be converted from
45       * @param right character sequence which need to be converted to
46       * @param matrix two dimensional array containing
47       * @param swapped tells whether the value for left character sequence and right
48       *            character sequence were swapped to save memory
49       * @return result object containing the count of insert, delete and substitute and total count needed
50       */
51      private static LevenshteinResults findDetailedResults(final CharSequence left,
52                                                            final CharSequence right,
53                                                            final int[][] matrix,
54                                                            final boolean swapped) {
55  
56          int delCount = 0;
57          int addCount = 0;
58          int subCount = 0;
59  
60          int rowIndex = right.length();
61          int columnIndex = left.length();
62  
63          int dataAtLeft = 0;
64          int dataAtTop = 0;
65          int dataAtDiagonal = 0;
66          int data = 0;
67          boolean deleted = false;
68          boolean added = false;
69  
70          while (rowIndex >= 0 && columnIndex >= 0) {
71  
72              if (columnIndex == 0) {
73                  dataAtLeft = -1;
74              } else {
75                  dataAtLeft = matrix[rowIndex][columnIndex - 1];
76              }
77              if (rowIndex == 0) {
78                  dataAtTop = -1;
79              } else {
80                  dataAtTop = matrix[rowIndex - 1][columnIndex];
81              }
82              if (rowIndex > 0 && columnIndex > 0) {
83                  dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1];
84              } else {
85                  dataAtDiagonal = -1;
86              }
87              if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) {
88                  break;
89              }
90              data = matrix[rowIndex][columnIndex];
91  
92              // case in which the character at left and right are the same,
93              // in this case none of the counters will be incremented.
94              if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) {
95                  columnIndex--;
96                  rowIndex--;
97                  continue;
98              }
99  
100             // handling insert and delete cases.
101             deleted = false;
102             added = false;
103             if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop
104                     || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD
105                 columnIndex--;
106                 if (swapped) {
107                     addCount++;
108                     added = true;
109                 } else {
110                     delCount++;
111                     deleted = true;
112                 }
113             } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft
114                     || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD
115                 rowIndex--;
116                 if (swapped) {
117                     delCount++;
118                     deleted = true;
119                 } else {
120                     addCount++;
121                     added = true;
122                 }
123             }
124 
125             // substituted case
126             if (!added && !deleted) {
127                 subCount++;
128                 columnIndex--;
129                 rowIndex--;
130             }
131         }
132         return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount);
133     }
134 
135     /**
136      * Gets the default instance.
137      *
138      * @return The default instace
139      */
140     public static LevenshteinDetailedDistance getDefaultInstance() {
141         return INSTANCE;
142     }
143 
144     /**
145      * Finds the Levenshtein distance between two CharSequences if it's less than or
146      * equal to a given threshold.
147      *
148      * <p>
149      * This implementation follows from Algorithms on Strings, Trees and
150      * Sequences by Dan Gusfield and Chas Emerick's implementation of the
151      * Levenshtein distance algorithm from <a
152      * href="http://www.merriampark.com/ld.htm"
153      * >http://www.merriampark.com/ld.htm</a>
154      * </p>
155      *
156      * <pre>
157      * limitedCompare(null, *, *)             = IllegalArgumentException
158      * limitedCompare(*, null, *)             = IllegalArgumentException
159      * limitedCompare(*, *, -1)               = IllegalArgumentException
160      * limitedCompare("","", 0)               = 0
161      * limitedCompare("aaapppp", "", 8)       = 7
162      * limitedCompare("aaapppp", "", 7)       = 7
163      * limitedCompare("aaapppp", "", 6))      = -1
164      * limitedCompare("elephant", "hippo", 7) = 7
165      * limitedCompare("elephant", "hippo", 6) = -1
166      * limitedCompare("hippo", "elephant", 7) = 7
167      * limitedCompare("hippo", "elephant", 6) = -1
168      * </pre>
169      *
170      * @param left the first CharSequence, must not be null
171      * @param right the second CharSequence, must not be null
172      * @param threshold the target threshold, must not be negative
173      * @return result distance, or -1
174      */
175     private static LevenshteinResults limitedCompare(CharSequence left,
176                                                      CharSequence right,
177                                                      final int threshold) { //NOPMD
178         if (left == null || right == null) {
179             throw new IllegalArgumentException("CharSequences must not be null");
180         }
181         if (threshold < 0) {
182             throw new IllegalArgumentException("Threshold must not be negative");
183         }
184 
185         /*
186          * This implementation only computes the distance if it's less than or
187          * equal to the threshold value, returning -1 if it's greater. The
188          * advantage is performance: unbounded distance is O(nm), but a bound of
189          * k allows us to reduce it to O(km) time by only computing a diagonal
190          * stripe of width 2k + 1 of the cost table. It is also possible to use
191          * this to compute the unbounded Levenshtein distance by starting the
192          * threshold at 1 and doubling each time until the distance is found;
193          * this is O(dm), where d is the distance.
194          *
195          * One subtlety comes from needing to ignore entries on the border of
196          * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
197          * to the left of the leftmost member We must ignore the entry above the
198          * rightmost member
199          *
200          * Another subtlety comes from our stripe running off the matrix if the
201          * strings aren't of the same size. Since string s is always swapped to
202          * be the shorter of the two, the stripe will always run off to the
203          * upper right instead of the lower left of the matrix.
204          *
205          * As a concrete example, suppose s is of length 5, t is of length 7,
206          * and our threshold is 1. In this case we're going to walk a stripe of
207          * length 3. The matrix would look like so:
208          *
209          * <pre>
210          *    1 2 3 4 5
211          * 1 |#|#| | | |
212          * 2 |#|#|#| | |
213          * 3 | |#|#|#| |
214          * 4 | | |#|#|#|
215          * 5 | | | |#|#|
216          * 6 | | | | |#|
217          * 7 | | | | | |
218          * </pre>
219          *
220          * Note how the stripe leads off the table as there is no possible way
221          * to turn a string of length 5 into one of length 7 in edit distance of
222          * 1.
223          *
224          * Additionally, this implementation decreases memory usage by using two
225          * single-dimensional arrays and swapping them back and forth instead of
226          * allocating an entire n by m matrix. This requires a few minor
227          * changes, such as immediately returning when it's detected that the
228          * stripe has run off the matrix and initially filling the arrays with
229          * large values so that entries we don't compute are ignored.
230          *
231          * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
232          * some discussion.
233          */
234 
235         int n = left.length(); // length of left
236         int m = right.length(); // length of right
237 
238         // if one string is empty, the edit distance is necessarily the length of the other
239         if (n == 0) {
240             return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0);
241         }
242         if (m == 0) {
243             return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0);
244         }
245 
246         boolean swapped = false;
247         if (n > m) {
248             // swap the two strings to consume less memory
249             final CharSequence tmp = left;
250             left = right;
251             right = tmp;
252             n = m;
253             m = right.length();
254             swapped = true;
255         }
256 
257         int[] p = new int[n + 1]; // 'previous' cost array, horizontally
258         int[] d = new int[n + 1]; // cost array, horizontally
259         int[] tempD; // placeholder to assist in swapping p and d
260         final int[][] matrix = new int[m + 1][n + 1];
261 
262         //filling the first row and first column values in the matrix
263         for (int index = 0; index <= n; index++) {
264             matrix[0][index] = index;
265         }
266         for (int index = 0; index <= m; index++) {
267             matrix[index][0] = index;
268         }
269 
270         // fill in starting table values
271         final int boundary = Math.min(n, threshold) + 1;
272         for (int i = 0; i < boundary; i++) {
273             p[i] = i;
274         }
275         // these fills ensure that the value above the rightmost entry of our
276         // stripe will be ignored in following loop iterations
277         Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
278         Arrays.fill(d, Integer.MAX_VALUE);
279 
280         // iterates through t
281         for (int j = 1; j <= m; j++) {
282             final char rightJ = right.charAt(j - 1); // jth character of right
283             d[0] = j;
284 
285             // compute stripe indices, constrain to array size
286             final int min = Math.max(1, j - threshold);
287             final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
288                     n, j + threshold);
289 
290             // the stripe may lead off of the table if s and t are of different sizes
291             if (min > max) {
292                 return new LevenshteinResults(-1, 0, 0, 0);
293             }
294 
295             // ignore entry left of leftmost
296             if (min > 1) {
297                 d[min - 1] = Integer.MAX_VALUE;
298             }
299 
300             // iterates through [min, max] in s
301             for (int i = min; i <= max; i++) {
302                 if (left.charAt(i - 1) == rightJ) {
303                     // diagonally left and up
304                     d[i] = p[i - 1];
305                 } else {
306                     // 1 + minimum of cell to the left, to the top, diagonally left and up
307                     d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
308                 }
309                 matrix[j][i] = d[i];
310             }
311 
312             // copy current distance counts to 'previous row' distance counts
313             tempD = p;
314             p = d;
315             d = tempD;
316         }
317 
318         // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance
319         if (p[n] <= threshold) {
320             return findDetailedResults(left, right, matrix, swapped);
321         }
322         return new LevenshteinResults(-1, 0, 0, 0);
323     }
324 
325     /**
326      * Finds the Levenshtein distance between two Strings.
327      *
328      * <p>A higher score indicates a greater distance.</p>
329      *
330      * <p>The previous implementation of the Levenshtein distance algorithm
331      * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
332      *
333      * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
334      * which can occur when my Java implementation is used with very large strings.<br>
335      * This implementation of the Levenshtein distance algorithm
336      * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
337      *
338      * <pre>
339      * unlimitedCompare(null, *)             = IllegalArgumentException
340      * unlimitedCompare(*, null)             = IllegalArgumentException
341      * unlimitedCompare("","")               = 0
342      * unlimitedCompare("","a")              = 1
343      * unlimitedCompare("aaapppp", "")       = 7
344      * unlimitedCompare("frog", "fog")       = 1
345      * unlimitedCompare("fly", "ant")        = 3
346      * unlimitedCompare("elephant", "hippo") = 7
347      * unlimitedCompare("hippo", "elephant") = 7
348      * unlimitedCompare("hippo", "zzzzzzzz") = 8
349      * unlimitedCompare("hello", "hallo")    = 1
350      * </pre>
351      *
352      * @param left the first CharSequence, must not be null
353      * @param right the second CharSequence, must not be null
354      * @return result distance, or -1
355      * @throws IllegalArgumentException if either CharSequence input is {@code null}
356      */
357     private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) {
358         if (left == null || right == null) {
359             throw new IllegalArgumentException("CharSequences must not be null");
360         }
361 
362         /*
363            The difference between this impl. and the previous is that, rather
364            than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
365            we maintain two single-dimensional arrays of length s.length() + 1.  The first, d,
366            is the 'current working' distance array that maintains the newest distance cost
367            counts as we iterate through the characters of String s.  Each time we increment
368            the index of String t we are comparing, d is copied to p, the second int[].  Doing so
369            allows us to retain the previous cost counts as required by the algorithm (taking
370            the minimum of the cost count to the left, up one, and diagonally up and to the left
371            of the current cost count being calculated).  (Note that the arrays aren't really
372            copied anymore, just switched...this is clearly much better than cloning an array
373            or doing a System.arraycopy() each time  through the outer loop.)
374 
375            Effectively, the difference between the two implementations is this one does not
376            cause an out of memory condition when calculating the LD over two very large strings.
377          */
378 
379         int n = left.length(); // length of left
380         int m = right.length(); // length of right
381 
382         if (n == 0) {
383             return new LevenshteinResults(m, m, 0, 0);
384         }
385         if (m == 0) {
386             return new LevenshteinResults(n, 0, n, 0);
387         }
388         boolean swapped = false;
389         if (n > m) {
390             // swap the input strings to consume less memory
391             final CharSequence tmp = left;
392             left = right;
393             right = tmp;
394             n = m;
395             m = right.length();
396             swapped = true;
397         }
398 
399         int[] p = new int[n + 1]; // 'previous' cost array, horizontally
400         int[] d = new int[n + 1]; // cost array, horizontally
401         int[] tempD; //placeholder to assist in swapping p and d
402         final int[][] matrix = new int[m + 1][n + 1];
403 
404         // filling the first row and first column values in the matrix
405         for (int index = 0; index <= n; index++) {
406             matrix[0][index] = index;
407         }
408         for (int index = 0; index <= m; index++) {
409             matrix[index][0] = index;
410         }
411 
412         // indexes into strings left and right
413         int i; // iterates through left
414         int j; // iterates through right
415 
416         char rightJ; // jth character of right
417 
418         int cost; // cost
419         for (i = 0; i <= n; i++) {
420             p[i] = i;
421         }
422 
423         for (j = 1; j <= m; j++) {
424             rightJ = right.charAt(j - 1);
425             d[0] = j;
426 
427             for (i = 1; i <= n; i++) {
428                 cost = left.charAt(i - 1) == rightJ ? 0 : 1;
429                 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
430                 d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
431                 //filling the matrix
432                 matrix[j][i] = d[i];
433             }
434 
435             // copy current distance counts to 'previous row' distance counts
436             tempD = p;
437             p = d;
438             d = tempD;
439         }
440         return findDetailedResults(left, right, matrix, swapped);
441     }
442 
443     /**
444      * Threshold.
445      */
446     private final Integer threshold;
447 
448     /**
449      * <p>
450      * This returns the default instance that uses a version
451      * of the algorithm that does not use a threshold parameter.
452      * </p>
453      *
454      * @see LevenshteinDetailedDistance#getDefaultInstance()
455      */
456     public LevenshteinDetailedDistance() {
457         this(null);
458     }
459 
460     /**
461      * If the threshold is not null, distance calculations will be limited to a maximum length.
462      *
463      * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p>
464      *
465      * @param threshold If this is null then distances calculations will not be limited. This may not be negative.
466      */
467     public LevenshteinDetailedDistance(final Integer threshold) {
468         if (threshold != null && threshold < 0) {
469             throw new IllegalArgumentException("Threshold must not be negative");
470         }
471         this.threshold = threshold;
472     }
473 
474     /**
475      * Finds the Levenshtein distance between two Strings.
476      *
477      * <p>A higher score indicates a greater distance.</p>
478      *
479      * <p>The previous implementation of the Levenshtein distance algorithm
480      * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
481      *
482      * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
483      * which can occur when my Java implementation is used with very large strings.<br>
484      * This implementation of the Levenshtein distance algorithm
485      * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
486      *
487      * <pre>
488      * distance.apply(null, *)             = IllegalArgumentException
489      * distance.apply(*, null)             = IllegalArgumentException
490      * distance.apply("","")               = 0
491      * distance.apply("","a")              = 1
492      * distance.apply("aaapppp", "")       = 7
493      * distance.apply("frog", "fog")       = 1
494      * distance.apply("fly", "ant")        = 3
495      * distance.apply("elephant", "hippo") = 7
496      * distance.apply("hippo", "elephant") = 7
497      * distance.apply("hippo", "zzzzzzzz") = 8
498      * distance.apply("hello", "hallo")    = 1
499      * </pre>
500      *
501      * @param left the first string, must not be null
502      * @param right the second string, must not be null
503      * @return result distance, or -1
504      * @throws IllegalArgumentException if either String input {@code null}
505      */
506     @Override
507     public LevenshteinResults apply(final CharSequence left, final CharSequence right) {
508         if (threshold != null) {
509             return limitedCompare(left, right, threshold);
510         }
511         return unlimitedCompare(left, right);
512     }
513 
514     /**
515      * Gets the distance threshold.
516      *
517      * @return The distance threshold
518      */
519     public Integer getThreshold() {
520         return threshold;
521     }
522 }