Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.Arrays;
020
021/**
022 * An algorithm for measuring the difference between two character sequences.
023 *
024 * <p>
025 * This is the number of changes needed to change one sequence into another,
026 * where each change is a single character modification (deletion, insertion
027 * or substitution).
028 * </p>
029 *
030 * @since 1.0
031 */
032public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> {
033
034    /**
035     * Singleton instance.
036     */
037    private static final LevenshteinDetailedDistance INSTANCE = new LevenshteinDetailedDistance();
038
039    /**
040     * Finds count for each of the three [insert, delete, substitute] operations
041     * needed. This is based on the matrix formed based on the two character
042     * sequence.
043     *
044     * @param <E> The type of similarity score unit.
045     * @param left character sequence which need to be converted from
046     * @param right character sequence which need to be converted to
047     * @param matrix two dimensional array containing
048     * @param swapped tells whether the value for left character sequence and right
049     *            character sequence were swapped to save memory
050     * @return result object containing the count of insert, delete and substitute and total count needed
051     */
052    private static <E> LevenshteinResults findDetailedResults(final SimilarityInput<E> left,
053                                                          final SimilarityInput<E> right,
054                                                          final int[][] matrix,
055                                                          final boolean swapped) {
056
057        int delCount = 0;
058        int addCount = 0;
059        int subCount = 0;
060
061        int rowIndex = right.length();
062        int columnIndex = left.length();
063
064        int dataAtLeft = 0;
065        int dataAtTop = 0;
066        int dataAtDiagonal = 0;
067        int data = 0;
068        boolean deleted = false;
069        boolean added = false;
070
071        while (rowIndex >= 0 && columnIndex >= 0) {
072
073            if (columnIndex == 0) {
074                dataAtLeft = -1;
075            } else {
076                dataAtLeft = matrix[rowIndex][columnIndex - 1];
077            }
078            if (rowIndex == 0) {
079                dataAtTop = -1;
080            } else {
081                dataAtTop = matrix[rowIndex - 1][columnIndex];
082            }
083            if (rowIndex > 0 && columnIndex > 0) {
084                dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1];
085            } else {
086                dataAtDiagonal = -1;
087            }
088            if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) {
089                break;
090            }
091            data = matrix[rowIndex][columnIndex];
092
093            // case in which the character at left and right are the same,
094            // in this case none of the counters will be incremented.
095            if (columnIndex > 0 && rowIndex > 0 && left.at(columnIndex - 1).equals(right.at(rowIndex - 1))) {
096                columnIndex--;
097                rowIndex--;
098                continue;
099            }
100
101            // handling insert and delete cases.
102            deleted = false;
103            added = false;
104            if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop
105                    || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD
106                columnIndex--;
107                if (swapped) {
108                    addCount++;
109                    added = true;
110                } else {
111                    delCount++;
112                    deleted = true;
113                }
114            } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft
115                    || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD
116                rowIndex--;
117                if (swapped) {
118                    delCount++;
119                    deleted = true;
120                } else {
121                    addCount++;
122                    added = true;
123                }
124            }
125
126            // substituted case
127            if (!added && !deleted) {
128                subCount++;
129                columnIndex--;
130                rowIndex--;
131            }
132        }
133        return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount);
134    }
135
136    /**
137     * Gets the default instance.
138     *
139     * @return The default instace
140     */
141    public static LevenshteinDetailedDistance getDefaultInstance() {
142        return INSTANCE;
143    }
144
145    /**
146     * Finds the Levenshtein distance between two CharSequences if it's less than or
147     * equal to a given threshold.
148     *
149     * <p>
150     * This implementation follows from Algorithms on Strings, Trees and
151     * Sequences by Dan Gusfield and Chas Emerick's implementation of the
152     * Levenshtein distance algorithm from <a
153     * href="http://www.merriampark.com/ld.htm"
154     * >http://www.merriampark.com/ld.htm</a>
155     * </p>
156     *
157     * <pre>
158     * limitedCompare(null, *, *)             = IllegalArgumentException
159     * limitedCompare(*, null, *)             = IllegalArgumentException
160     * limitedCompare(*, *, -1)               = IllegalArgumentException
161     * limitedCompare("","", 0)               = 0
162     * limitedCompare("aaapppp", "", 8)       = 7
163     * limitedCompare("aaapppp", "", 7)       = 7
164     * limitedCompare("aaapppp", "", 6))      = -1
165     * limitedCompare("elephant", "hippo", 7) = 7
166     * limitedCompare("elephant", "hippo", 6) = -1
167     * limitedCompare("hippo", "elephant", 7) = 7
168     * limitedCompare("hippo", "elephant", 6) = -1
169     * </pre>
170     *
171     * @param <E> The type of similarity score unit.
172     * @param left the first CharSequence, must not be null
173     * @param right the second CharSequence, must not be null
174     * @param threshold the target threshold, must not be negative
175     * @return result distance, or -1
176     */
177    private static <E> LevenshteinResults limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) { //NOPMD
178        if (left == null || right == null) {
179            throw new IllegalArgumentException("CharSequences must not be null");
180        }
181        if (threshold < 0) {
182            throw new IllegalArgumentException("Threshold must not be negative");
183        }
184
185        /*
186         * This implementation only computes the distance if it's less than or
187         * equal to the threshold value, returning -1 if it's greater. The
188         * advantage is performance: unbounded distance is O(nm), but a bound of
189         * k allows us to reduce it to O(km) time by only computing a diagonal
190         * stripe of width 2k + 1 of the cost table. It is also possible to use
191         * this to compute the unbounded Levenshtein distance by starting the
192         * threshold at 1 and doubling each time until the distance is found;
193         * this is O(dm), where d is the distance.
194         *
195         * One subtlety comes from needing to ignore entries on the border of
196         * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry
197         * to the left of the leftmost member We must ignore the entry above the
198         * rightmost member
199         *
200         * Another subtlety comes from our stripe running off the matrix if the
201         * strings aren't of the same size. Since string s is always swapped to
202         * be the shorter of the two, the stripe will always run off to the
203         * upper right instead of the lower left of the matrix.
204         *
205         * As a concrete example, suppose s is of length 5, t is of length 7,
206         * and our threshold is 1. In this case we're going to walk a stripe of
207         * length 3. The matrix would look like so:
208         *
209         * <pre>
210         *    1 2 3 4 5
211         * 1 |#|#| | | |
212         * 2 |#|#|#| | |
213         * 3 | |#|#|#| |
214         * 4 | | |#|#|#|
215         * 5 | | | |#|#|
216         * 6 | | | | |#|
217         * 7 | | | | | |
218         * </pre>
219         *
220         * Note how the stripe leads off the table as there is no possible way
221         * to turn a string of length 5 into one of length 7 in edit distance of
222         * 1.
223         *
224         * Additionally, this implementation decreases memory usage by using two
225         * single-dimensional arrays and swapping them back and forth instead of
226         * allocating an entire n by m matrix. This requires a few minor
227         * changes, such as immediately returning when it's detected that the
228         * stripe has run off the matrix and initially filling the arrays with
229         * large values so that entries we don't compute are ignored.
230         *
231         * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for
232         * some discussion.
233         */
234
235        int n = left.length(); // length of left
236        int m = right.length(); // length of right
237
238        // if one string is empty, the edit distance is necessarily the length of the other
239        if (n == 0) {
240            return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0);
241        }
242        if (m == 0) {
243            return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0);
244        }
245
246        boolean swapped = false;
247        if (n > m) {
248            // swap the two strings to consume less memory
249            final SimilarityInput<E> tmp = left;
250            left = right;
251            right = tmp;
252            n = m;
253            m = right.length();
254            swapped = true;
255        }
256
257        int[] p = new int[n + 1]; // 'previous' cost array, horizontally
258        int[] d = new int[n + 1]; // cost array, horizontally
259        int[] tempD; // placeholder to assist in swapping p and d
260        final int[][] matrix = new int[m + 1][n + 1];
261
262        //filling the first row and first column values in the matrix
263        for (int index = 0; index <= n; index++) {
264            matrix[0][index] = index;
265        }
266        for (int index = 0; index <= m; index++) {
267            matrix[index][0] = index;
268        }
269
270        // fill in starting table values
271        final int boundary = Math.min(n, threshold) + 1;
272        for (int i = 0; i < boundary; i++) {
273            p[i] = i;
274        }
275        // these fills ensure that the value above the rightmost entry of our
276        // stripe will be ignored in following loop iterations
277        Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
278        Arrays.fill(d, Integer.MAX_VALUE);
279
280        // iterates through t
281        for (int j = 1; j <= m; j++) {
282            final E rightJ = right.at(j - 1); // jth character of right
283            d[0] = j;
284
285            // compute stripe indices, constrain to array size
286            final int min = Math.max(1, j - threshold);
287            final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(
288                    n, j + threshold);
289
290            // the stripe may lead off of the table if s and t are of different sizes
291            if (min > max) {
292                return new LevenshteinResults(-1, 0, 0, 0);
293            }
294
295            // ignore entry left of leftmost
296            if (min > 1) {
297                d[min - 1] = Integer.MAX_VALUE;
298            }
299
300            // iterates through [min, max] in s
301            for (int i = min; i <= max; i++) {
302                if (left.at(i - 1).equals(rightJ)) {
303                    // diagonally left and up
304                    d[i] = p[i - 1];
305                } else {
306                    // 1 + minimum of cell to the left, to the top, diagonally left and up
307                    d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
308                }
309                matrix[j][i] = d[i];
310            }
311
312            // copy current distance counts to 'previous row' distance counts
313            tempD = p;
314            p = d;
315            d = tempD;
316        }
317
318        // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance
319        if (p[n] <= threshold) {
320            return findDetailedResults(left, right, matrix, swapped);
321        }
322        return new LevenshteinResults(-1, 0, 0, 0);
323    }
324
325    /**
326     * Finds the Levenshtein distance between two Strings.
327     *
328     * <p>A higher score indicates a greater distance.</p>
329     *
330     * <p>The previous implementation of the Levenshtein distance algorithm
331     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
332     *
333     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
334     * which can occur when my Java implementation is used with very large strings.<br>
335     * This implementation of the Levenshtein distance algorithm
336     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
337     *
338     * <pre>
339     * unlimitedCompare(null, *)             = IllegalArgumentException
340     * unlimitedCompare(*, null)             = IllegalArgumentException
341     * unlimitedCompare("","")               = 0
342     * unlimitedCompare("","a")              = 1
343     * unlimitedCompare("aaapppp", "")       = 7
344     * unlimitedCompare("frog", "fog")       = 1
345     * unlimitedCompare("fly", "ant")        = 3
346     * unlimitedCompare("elephant", "hippo") = 7
347     * unlimitedCompare("hippo", "elephant") = 7
348     * unlimitedCompare("hippo", "zzzzzzzz") = 8
349     * unlimitedCompare("hello", "hallo")    = 1
350     * </pre>
351     *
352     * @param <E> The type of similarity score unit.
353     * @param left the first CharSequence, must not be null
354     * @param right the second CharSequence, must not be null
355     * @return result distance, or -1
356     * @throws IllegalArgumentException if either CharSequence input is {@code null}
357     */
358    private static <E> LevenshteinResults unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) {
359        if (left == null || right == null) {
360            throw new IllegalArgumentException("CharSequences must not be null");
361        }
362
363        /*
364           The difference between this impl. and the previous is that, rather
365           than creating and retaining a matrix of size s.length() + 1 by t.length() + 1,
366           we maintain two single-dimensional arrays of length s.length() + 1.  The first, d,
367           is the 'current working' distance array that maintains the newest distance cost
368           counts as we iterate through the characters of String s.  Each time we increment
369           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
370           allows us to retain the previous cost counts as required by the algorithm (taking
371           the minimum of the cost count to the left, up one, and diagonally up and to the left
372           of the current cost count being calculated).  (Note that the arrays aren't really
373           copied anymore, just switched...this is clearly much better than cloning an array
374           or doing a System.arraycopy() each time  through the outer loop.)
375
376           Effectively, the difference between the two implementations is this one does not
377           cause an out of memory condition when calculating the LD over two very large strings.
378         */
379
380        int n = left.length(); // length of left
381        int m = right.length(); // length of right
382
383        if (n == 0) {
384            return new LevenshteinResults(m, m, 0, 0);
385        }
386        if (m == 0) {
387            return new LevenshteinResults(n, 0, n, 0);
388        }
389        boolean swapped = false;
390        if (n > m) {
391            // swap the input strings to consume less memory
392            final SimilarityInput<E> tmp = left;
393            left = right;
394            right = tmp;
395            n = m;
396            m = right.length();
397            swapped = true;
398        }
399
400        int[] p = new int[n + 1]; // 'previous' cost array, horizontally
401        int[] d = new int[n + 1]; // cost array, horizontally
402        int[] tempD; //placeholder to assist in swapping p and d
403        final int[][] matrix = new int[m + 1][n + 1];
404
405        // filling the first row and first column values in the matrix
406        for (int index = 0; index <= n; index++) {
407            matrix[0][index] = index;
408        }
409        for (int index = 0; index <= m; index++) {
410            matrix[index][0] = index;
411        }
412
413        // indexes into strings left and right
414        int i; // iterates through left
415        int j; // iterates through right
416
417        E rightJ; // jth character of right
418
419        int cost; // cost
420        for (i = 0; i <= n; i++) {
421            p[i] = i;
422        }
423
424        for (j = 1; j <= m; j++) {
425            rightJ = right.at(j - 1);
426            d[0] = j;
427
428            for (i = 1; i <= n; i++) {
429                cost = left.at(i - 1).equals(rightJ) ? 0 : 1;
430                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
431                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
432                //filling the matrix
433                matrix[j][i] = d[i];
434            }
435
436            // copy current distance counts to 'previous row' distance counts
437            tempD = p;
438            p = d;
439            d = tempD;
440        }
441        return findDetailedResults(left, right, matrix, swapped);
442    }
443
444    /**
445     * Threshold.
446     */
447    private final Integer threshold;
448
449    /**
450     * <p>
451     * This returns the default instance that uses a version
452     * of the algorithm that does not use a threshold parameter.
453     * </p>
454     *
455     * @see LevenshteinDetailedDistance#getDefaultInstance()
456     * @deprecated Use {@link #getDefaultInstance()}.
457     */
458    @Deprecated
459    public LevenshteinDetailedDistance() {
460        this(null);
461    }
462
463    /**
464     * If the threshold is not null, distance calculations will be limited to a maximum length.
465     *
466     * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p>
467     *
468     * @param threshold If this is null then distances calculations will not be limited. This may not be negative.
469     */
470    public LevenshteinDetailedDistance(final Integer threshold) {
471        if (threshold != null && threshold < 0) {
472            throw new IllegalArgumentException("Threshold must not be negative");
473        }
474        this.threshold = threshold;
475    }
476
477    /**
478     * Computes the Levenshtein distance between two Strings.
479     *
480     * <p>A higher score indicates a greater distance.</p>
481     *
482     * <p>The previous implementation of the Levenshtein distance algorithm
483     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
484     *
485     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
486     * which can occur when my Java implementation is used with very large strings.<br>
487     * This implementation of the Levenshtein distance algorithm
488     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
489     *
490     * <pre>
491     * distance.apply(null, *)             = IllegalArgumentException
492     * distance.apply(*, null)             = IllegalArgumentException
493     * distance.apply("","")               = 0
494     * distance.apply("","a")              = 1
495     * distance.apply("aaapppp", "")       = 7
496     * distance.apply("frog", "fog")       = 1
497     * distance.apply("fly", "ant")        = 3
498     * distance.apply("elephant", "hippo") = 7
499     * distance.apply("hippo", "elephant") = 7
500     * distance.apply("hippo", "zzzzzzzz") = 8
501     * distance.apply("hello", "hallo")    = 1
502     * </pre>
503     *
504     * @param left the first input, must not be null
505     * @param right the second input, must not be null
506     * @return result distance, or -1
507     * @throws IllegalArgumentException if either String input {@code null}
508     */
509    @Override
510    public LevenshteinResults apply(final CharSequence left, final CharSequence right) {
511        return apply(SimilarityInput.input(left), SimilarityInput.input(right));
512    }
513
514    /**
515     * Computes the Levenshtein distance between two Strings.
516     *
517     * <p>A higher score indicates a greater distance.</p>
518     *
519     * <p>The previous implementation of the Levenshtein distance algorithm
520     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
521     *
522     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
523     * which can occur when my Java implementation is used with very large strings.<br>
524     * This implementation of the Levenshtein distance algorithm
525     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
526     *
527     * <pre>
528     * distance.apply(null, *)             = IllegalArgumentException
529     * distance.apply(*, null)             = IllegalArgumentException
530     * distance.apply("","")               = 0
531     * distance.apply("","a")              = 1
532     * distance.apply("aaapppp", "")       = 7
533     * distance.apply("frog", "fog")       = 1
534     * distance.apply("fly", "ant")        = 3
535     * distance.apply("elephant", "hippo") = 7
536     * distance.apply("hippo", "elephant") = 7
537     * distance.apply("hippo", "zzzzzzzz") = 8
538     * distance.apply("hello", "hallo")    = 1
539     * </pre>
540     *
541     * @param <E> The type of similarity score unit.
542     * @param left the first input, must not be null
543     * @param right the second input, must not be null
544     * @return result distance, or -1
545     * @throws IllegalArgumentException if either String input {@code null}
546     * @since 1.13.0
547     */
548    public <E> LevenshteinResults apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
549        if (threshold != null) {
550            return limitedCompare(left, right, threshold);
551        }
552        return unlimitedCompare(left, right);
553    }
554
555    /**
556     * Gets the distance threshold.
557     *
558     * @return The distance threshold
559     */
560    public Integer getThreshold() {
561        return threshold;
562    }
563}