001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019/**
020 * An algorithm for measuring the difference between two character sequences using the
021 * <a href="https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance">Damerau-Levenshtein Distance</a>.
022 *
023 * <p>
024 * This is the number of changes needed to change one sequence into another, where each change is a single character
025 * modification (deletion, insertion, substitution, or transposition of two adjacent characters).
026 * </p>
027 *
028 * @see <a href="https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance">Damerau-Levenshtein Distance on Wikipedia</a>
029 * @since 1.15.0
030 */
031public class DamerauLevenshteinDistance implements EditDistance<Integer> {
032
033    /**
034     * Utility function to ensure distance is valid according to threshold.
035     *
036     * @param distance  The distance value.
037     * @param threshold The threshold value.
038     * @return The distance value, or {@code -1} if distance is greater than threshold.
039     */
040    private static int clampDistance(final int distance, final int threshold) {
041        return distance > threshold ? -1 : distance;
042    }
043
044    /**
045     * Finds the Damerau-Levenshtein distance between two CharSequences if it's less than or equal to a given threshold.
046     *
047     * @param left      the first SimilarityInput, must not be null.
048     * @param right     the second SimilarityInput, must not be null.
049     * @param threshold the target threshold, must not be negative.
050     * @return result distance, or -1 if distance exceeds threshold.
051     */
052    private static <E> int limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) {
053        if (left == null || right == null) {
054            throw new IllegalArgumentException("Left/right inputs must not be null");
055        }
056
057        // Implementation based on https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
058
059        int leftLength = left.length();
060        int rightLength = right.length();
061
062        if (leftLength == 0) {
063            return clampDistance(rightLength, threshold);
064        }
065
066        if (rightLength == 0) {
067            return clampDistance(leftLength, threshold);
068        }
069
070        // Inspired by LevenshteinDistance impl; swap the input strings to consume less memory
071        if (rightLength > leftLength) {
072            final SimilarityInput<E> tmp = left;
073            left = right;
074            right = tmp;
075            leftLength = rightLength;
076            rightLength = right.length();
077        }
078
079        // If the difference between the lengths of the strings is greater than the threshold, we must at least do
080        // threshold operations so we can return early
081        if (leftLength - rightLength > threshold) {
082            return -1;
083        }
084
085        // Use three arrays of minimum possible size to reduce memory usage. This avoids having to create a 2D
086        // array of size leftLength * rightLength
087        int[] curr = new int[rightLength + 1];
088        int[] prev = new int[rightLength + 1];
089        int[] prevPrev = new int[rightLength + 1];
090        int[] temp; // Temp variable use to shuffle arrays at the end of each iteration
091
092        int rightIndex, leftIndex, cost, minCost;
093
094        // Changing empty sequence to [0..i] requires i insertions
095        for (rightIndex = 0; rightIndex <= rightLength; rightIndex++) {
096            prev[rightIndex] = rightIndex;
097        }
098
099        // Calculate how many operations it takes to change right[0..rightIndex] into left[0..leftIndex]
100        // For each iteration
101        //  - curr[i] contains the cost of changing right[0..i] into left[0..leftIndex]
102        //          (computed in current iteration)
103        //  - prev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 1]
104        //          (computed in previous iteration)
105        //  - prevPrev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 2]
106        //          (computed in iteration before previous)
107        for (leftIndex = 1; leftIndex <= leftLength; leftIndex++) {
108            // For right[0..0] we must insert leftIndex characters, which means the cost is always leftIndex
109            curr[0] = leftIndex;
110
111            minCost = Integer.MAX_VALUE;
112
113            for (rightIndex = 1; rightIndex <= rightLength; rightIndex++) {
114                cost = left.at(leftIndex - 1) == right.at(rightIndex - 1) ? 0 : 1;
115
116                // Select cheapest operation
117                curr[rightIndex] = Math.min(
118                        Math.min(
119                                prev[rightIndex] + 1, // Delete current character
120                                curr[rightIndex - 1] + 1 // Insert current character
121                        ),
122                        prev[rightIndex - 1] + cost // Replace (or no cost if same character)
123                );
124
125                // Check if adjacent characters are the same -> transpose if cheaper
126                if (leftIndex > 1
127                        && rightIndex > 1
128                        && left.at(leftIndex - 1) == right.at(rightIndex - 2)
129                        && left.at(leftIndex - 2) == right.at(rightIndex - 1)) {
130                    // Use cost here, to properly handle two subsequent equal letters
131                    curr[rightIndex] = Math.min(curr[rightIndex], prevPrev[rightIndex - 2] + cost);
132                }
133
134                minCost = Math.min(curr[rightIndex], minCost);
135            }
136
137            // If there was no total cost for this entire iteration to transform right to left[0..leftIndex], there
138            // can not be a way to do it below threshold. This is because we have no way to reduce the overall cost
139            // in later operations.
140            if (minCost > threshold) {
141                return -1;
142            }
143
144            // Rotate arrays for next iteration
145            temp = prevPrev;
146            prevPrev = prev;
147            prev = curr;
148            curr = temp;
149        }
150
151        // Prev contains the value computed in the latest iteration
152        return clampDistance(prev[rightLength], threshold);
153    }
154
155    /**
156     * Finds the Damerau-Levenshtein distance between two inputs using optimal string alignment.
157     *
158     * @param left  the first CharSequence, must not be null.
159     * @param right the second CharSequence, must not be null.
160     * @return result distance.
161     * @throws IllegalArgumentException if either CharSequence input is {@code null}.
162     */
163    private static <E> int unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) {
164        if (left == null || right == null) {
165            throw new IllegalArgumentException("Left/right inputs must not be null");
166        }
167
168        /*
169         * Implementation based on https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
170         */
171
172        int leftLength = left.length();
173        int rightLength = right.length();
174
175        if (leftLength == 0) {
176            return rightLength;
177        }
178
179        if (rightLength == 0) {
180            return leftLength;
181        }
182
183        // Inspired by LevenshteinDistance impl; swap the input strings to consume less memory
184        if (rightLength > leftLength) {
185            final SimilarityInput<E> tmp = left;
186            left = right;
187            right = tmp;
188            leftLength = rightLength;
189            rightLength = right.length();
190        }
191
192        // Use three arrays of minimum possible size to reduce memory usage. This avoids having to create a 2D
193        // array of size leftLength * rightLength
194        int[] curr = new int[rightLength + 1];
195        int[] prev = new int[rightLength + 1];
196        int[] prevPrev = new int[rightLength + 1];
197        int[] temp; // Temp variable use to shuffle arrays at the end of each iteration
198
199        int rightIndex, leftIndex, cost;
200
201        // Changing empty sequence to [0..i] requires i insertions
202        for (rightIndex = 0; rightIndex <= rightLength; rightIndex++) {
203            prev[rightIndex] = rightIndex;
204        }
205
206        // Calculate how many operations it takes to change right[0..rightIndex] into left[0..leftIndex]
207        // For each iteration
208        //  - curr[i] contains the cost of changing right[0..i] into left[0..leftIndex]
209        //          (computed in current iteration)
210        //  - prev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 1]
211        //          (computed in previous iteration)
212        //  - prevPrev[i] contains the cost of changing right[0..i] into left[0..leftIndex - 2]
213        //          (computed in iteration before previous)
214        for (leftIndex = 1; leftIndex <= leftLength; leftIndex++) {
215            // For right[0..0] we must insert leftIndex characters, which means the cost is always leftIndex
216            curr[0] = leftIndex;
217
218            for (rightIndex = 1; rightIndex <= rightLength; rightIndex++) {
219                cost = left.at(leftIndex - 1) == right.at(rightIndex - 1) ? 0 : 1;
220
221                // Select cheapest operation
222                curr[rightIndex] = Math.min(
223                        Math.min(
224                                prev[rightIndex] + 1, // Delete current character
225                                curr[rightIndex - 1] + 1 // Insert current character
226                        ),
227                        prev[rightIndex - 1] + cost // Replace (or no cost if same character)
228                );
229
230                // Check if adjacent characters are the same -> transpose if cheaper
231                if (leftIndex > 1
232                        && rightIndex > 1
233                        && left.at(leftIndex - 1) == right.at(rightIndex - 2)
234                        && left.at(leftIndex - 2) == right.at(rightIndex - 1)) {
235                    // Use cost here, to properly handle two subsequent equal letters
236                    curr[rightIndex] = Math.min(curr[rightIndex], prevPrev[rightIndex - 2] + cost);
237                }
238            }
239
240            // Rotate arrays for next iteration
241            temp = prevPrev;
242            prevPrev = prev;
243            prev = curr;
244            curr = temp;
245        }
246
247        // Prev contains the value computed in the latest iteration
248        return prev[rightLength];
249    }
250
251    /**
252     * Threshold.
253     */
254    private final Integer threshold;
255
256    /**
257     * Constructs a default instance that uses a version of the algorithm that does not use a threshold parameter.
258     */
259    public DamerauLevenshteinDistance() {
260        this(null);
261    }
262
263    /**
264     * Constructs a new instance. If the threshold is not null, distance calculations will be limited to a maximum length.
265     * If the threshold is null, the unlimited version of the algorithm will be used.
266     *
267     * @param threshold If this is null then distances calculations will not be limited. This may not be negative.
268     */
269    public DamerauLevenshteinDistance(final Integer threshold) {
270        if (threshold != null && threshold < 0) {
271            throw new IllegalArgumentException("Threshold must not be negative");
272        }
273        this.threshold = threshold;
274    }
275
276    /**
277     * Computes the Damerau-Levenshtein distance between two Strings.
278     *
279     * <p>
280     * A higher score indicates a greater distance.
281     * </p>
282     *
283     * @param left  the first input, must not be null.
284     * @param right the second input, must not be null.
285     * @return result distance, or -1 if threshold is exceeded.
286     * @throws IllegalArgumentException if either String input {@code null}.
287     */
288    @Override
289    public Integer apply(final CharSequence left, final CharSequence right) {
290        return apply(SimilarityInput.input(left), SimilarityInput.input(right));
291    }
292
293    /**
294     * Computes the Damerau-Levenshtein distance between two inputs.
295     *
296     * <p>
297     * A higher score indicates a greater distance.
298     * </p>
299     *
300     * @param <E>   The type of similarity score unit.
301     * @param left  the first input, must not be null.
302     * @param right the second input, must not be null.
303     * @return result distance, or -1 if threshold is exceeded.
304     * @throws IllegalArgumentException if either String input {@code null}.
305     * @since 1.13.0
306     */
307    public <E> Integer apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
308        if (threshold != null) {
309            return limitedCompare(left, right, threshold);
310        }
311        return unlimitedCompare(left, right);
312    }
313
314    /**
315     * Gets the distance threshold.
316     *
317     * @return The distance threshold.
318     */
319    public Integer getThreshold() {
320        return threshold;
321    }
322}