001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019import java.util.Arrays; 020 021/** 022 * An algorithm for measuring the difference between two character sequences. 023 * 024 * <p> 025 * This is the number of changes needed to change one sequence into another, 026 * where each change is a single character modification (deletion, insertion 027 * or substitution). 028 * </p> 029 * 030 * @since 1.0 031 */ 032public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> { 033 034 /** 035 * Singleton instance. 036 */ 037 private static final LevenshteinDetailedDistance INSTANCE = new LevenshteinDetailedDistance(); 038 039 /** 040 * Finds count for each of the three [insert, delete, substitute] operations 041 * needed. This is based on the matrix formed based on the two character 042 * sequence. 043 * 044 * @param <E> The type of similarity score unit. 045 * @param left character sequence which need to be converted from 046 * @param right character sequence which need to be converted to 047 * @param matrix two dimensional array containing 048 * @param swapped tells whether the value for left character sequence and right 049 * character sequence were swapped to save memory 050 * @return result object containing the count of insert, delete and substitute and total count needed 051 */ 052 private static <E> LevenshteinResults findDetailedResults(final SimilarityInput<E> left, 053 final SimilarityInput<E> right, 054 final int[][] matrix, 055 final boolean swapped) { 056 057 int delCount = 0; 058 int addCount = 0; 059 int subCount = 0; 060 061 int rowIndex = right.length(); 062 int columnIndex = left.length(); 063 064 int dataAtLeft = 0; 065 int dataAtTop = 0; 066 int dataAtDiagonal = 0; 067 int data = 0; 068 boolean deleted = false; 069 boolean added = false; 070 071 while (rowIndex >= 0 && columnIndex >= 0) { 072 073 if (columnIndex == 0) { 074 dataAtLeft = -1; 075 } else { 076 dataAtLeft = matrix[rowIndex][columnIndex - 1]; 077 } 078 if (rowIndex == 0) { 079 dataAtTop = -1; 080 } else { 081 dataAtTop = matrix[rowIndex - 1][columnIndex]; 082 } 083 if (rowIndex > 0 && columnIndex > 0) { 084 dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1]; 085 } else { 086 dataAtDiagonal = -1; 087 } 088 if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) { 089 break; 090 } 091 data = matrix[rowIndex][columnIndex]; 092 093 // case in which the character at left and right are the same, 094 // in this case none of the counters will be incremented. 095 if (columnIndex > 0 && rowIndex > 0 && left.at(columnIndex - 1).equals(right.at(rowIndex - 1))) { 096 columnIndex--; 097 rowIndex--; 098 continue; 099 } 100 101 // handling insert and delete cases. 102 deleted = false; 103 added = false; 104 if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop 105 || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD 106 columnIndex--; 107 if (swapped) { 108 addCount++; 109 added = true; 110 } else { 111 delCount++; 112 deleted = true; 113 } 114 } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft 115 || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD 116 rowIndex--; 117 if (swapped) { 118 delCount++; 119 deleted = true; 120 } else { 121 addCount++; 122 added = true; 123 } 124 } 125 126 // substituted case 127 if (!added && !deleted) { 128 subCount++; 129 columnIndex--; 130 rowIndex--; 131 } 132 } 133 return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount); 134 } 135 136 /** 137 * Gets the default instance. 138 * 139 * @return The default instace 140 */ 141 public static LevenshteinDetailedDistance getDefaultInstance() { 142 return INSTANCE; 143 } 144 145 /** 146 * Finds the Levenshtein distance between two CharSequences if it's less than or 147 * equal to a given threshold. 148 * 149 * <p> 150 * This implementation follows from Algorithms on Strings, Trees and 151 * Sequences by Dan Gusfield and Chas Emerick's implementation of the 152 * Levenshtein distance algorithm from <a 153 * href="http://www.merriampark.com/ld.htm" 154 * >http://www.merriampark.com/ld.htm</a> 155 * </p> 156 * 157 * <pre> 158 * limitedCompare(null, *, *) = IllegalArgumentException 159 * limitedCompare(*, null, *) = IllegalArgumentException 160 * limitedCompare(*, *, -1) = IllegalArgumentException 161 * limitedCompare("","", 0) = 0 162 * limitedCompare("aaapppp", "", 8) = 7 163 * limitedCompare("aaapppp", "", 7) = 7 164 * limitedCompare("aaapppp", "", 6)) = -1 165 * limitedCompare("elephant", "hippo", 7) = 7 166 * limitedCompare("elephant", "hippo", 6) = -1 167 * limitedCompare("hippo", "elephant", 7) = 7 168 * limitedCompare("hippo", "elephant", 6) = -1 169 * </pre> 170 * 171 * @param <E> The type of similarity score unit. 172 * @param left the first CharSequence, must not be null 173 * @param right the second CharSequence, must not be null 174 * @param threshold the target threshold, must not be negative 175 * @return result distance, or -1 176 */ 177 private static <E> LevenshteinResults limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) { //NOPMD 178 if (left == null || right == null) { 179 throw new IllegalArgumentException("CharSequences must not be null"); 180 } 181 if (threshold < 0) { 182 throw new IllegalArgumentException("Threshold must not be negative"); 183 } 184 185 /* 186 * This implementation only computes the distance if it's less than or 187 * equal to the threshold value, returning -1 if it's greater. The 188 * advantage is performance: unbounded distance is O(nm), but a bound of 189 * k allows us to reduce it to O(km) time by only computing a diagonal 190 * stripe of width 2k + 1 of the cost table. It is also possible to use 191 * this to compute the unbounded Levenshtein distance by starting the 192 * threshold at 1 and doubling each time until the distance is found; 193 * this is O(dm), where d is the distance. 194 * 195 * One subtlety comes from needing to ignore entries on the border of 196 * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry 197 * to the left of the leftmost member We must ignore the entry above the 198 * rightmost member 199 * 200 * Another subtlety comes from our stripe running off the matrix if the 201 * strings aren't of the same size. Since string s is always swapped to 202 * be the shorter of the two, the stripe will always run off to the 203 * upper right instead of the lower left of the matrix. 204 * 205 * As a concrete example, suppose s is of length 5, t is of length 7, 206 * and our threshold is 1. In this case we're going to walk a stripe of 207 * length 3. The matrix would look like so: 208 * 209 * <pre> 210 * 1 2 3 4 5 211 * 1 |#|#| | | | 212 * 2 |#|#|#| | | 213 * 3 | |#|#|#| | 214 * 4 | | |#|#|#| 215 * 5 | | | |#|#| 216 * 6 | | | | |#| 217 * 7 | | | | | | 218 * </pre> 219 * 220 * Note how the stripe leads off the table as there is no possible way 221 * to turn a string of length 5 into one of length 7 in edit distance of 222 * 1. 223 * 224 * Additionally, this implementation decreases memory usage by using two 225 * single-dimensional arrays and swapping them back and forth instead of 226 * allocating an entire n by m matrix. This requires a few minor 227 * changes, such as immediately returning when it's detected that the 228 * stripe has run off the matrix and initially filling the arrays with 229 * large values so that entries we don't compute are ignored. 230 * 231 * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for 232 * some discussion. 233 */ 234 235 int n = left.length(); // length of left 236 int m = right.length(); // length of right 237 238 // if one string is empty, the edit distance is necessarily the length of the other 239 if (n == 0) { 240 return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); 241 } 242 if (m == 0) { 243 return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); 244 } 245 246 boolean swapped = false; 247 if (n > m) { 248 // swap the two strings to consume less memory 249 final SimilarityInput<E> tmp = left; 250 left = right; 251 right = tmp; 252 n = m; 253 m = right.length(); 254 swapped = true; 255 } 256 257 int[] p = new int[n + 1]; // 'previous' cost array, horizontally 258 int[] d = new int[n + 1]; // cost array, horizontally 259 int[] tempD; // placeholder to assist in swapping p and d 260 final int[][] matrix = new int[m + 1][n + 1]; 261 262 //filling the first row and first column values in the matrix 263 for (int index = 0; index <= n; index++) { 264 matrix[0][index] = index; 265 } 266 for (int index = 0; index <= m; index++) { 267 matrix[index][0] = index; 268 } 269 270 // fill in starting table values 271 final int boundary = Math.min(n, threshold) + 1; 272 for (int i = 0; i < boundary; i++) { 273 p[i] = i; 274 } 275 // these fills ensure that the value above the rightmost entry of our 276 // stripe will be ignored in following loop iterations 277 Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); 278 Arrays.fill(d, Integer.MAX_VALUE); 279 280 // iterates through t 281 for (int j = 1; j <= m; j++) { 282 final E rightJ = right.at(j - 1); // jth character of right 283 d[0] = j; 284 285 // compute stripe indices, constrain to array size 286 final int min = Math.max(1, j - threshold); 287 final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( 288 n, j + threshold); 289 290 // the stripe may lead off of the table if s and t are of different sizes 291 if (min > max) { 292 return new LevenshteinResults(-1, 0, 0, 0); 293 } 294 295 // ignore entry left of leftmost 296 if (min > 1) { 297 d[min - 1] = Integer.MAX_VALUE; 298 } 299 300 // iterates through [min, max] in s 301 for (int i = min; i <= max; i++) { 302 if (left.at(i - 1).equals(rightJ)) { 303 // diagonally left and up 304 d[i] = p[i - 1]; 305 } else { 306 // 1 + minimum of cell to the left, to the top, diagonally left and up 307 d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); 308 } 309 matrix[j][i] = d[i]; 310 } 311 312 // copy current distance counts to 'previous row' distance counts 313 tempD = p; 314 p = d; 315 d = tempD; 316 } 317 318 // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance 319 if (p[n] <= threshold) { 320 return findDetailedResults(left, right, matrix, swapped); 321 } 322 return new LevenshteinResults(-1, 0, 0, 0); 323 } 324 325 /** 326 * Finds the Levenshtein distance between two Strings. 327 * 328 * <p>A higher score indicates a greater distance.</p> 329 * 330 * <p>The previous implementation of the Levenshtein distance algorithm 331 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 332 * 333 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 334 * which can occur when my Java implementation is used with very large strings.<br> 335 * This implementation of the Levenshtein distance algorithm 336 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 337 * 338 * <pre> 339 * unlimitedCompare(null, *) = IllegalArgumentException 340 * unlimitedCompare(*, null) = IllegalArgumentException 341 * unlimitedCompare("","") = 0 342 * unlimitedCompare("","a") = 1 343 * unlimitedCompare("aaapppp", "") = 7 344 * unlimitedCompare("frog", "fog") = 1 345 * unlimitedCompare("fly", "ant") = 3 346 * unlimitedCompare("elephant", "hippo") = 7 347 * unlimitedCompare("hippo", "elephant") = 7 348 * unlimitedCompare("hippo", "zzzzzzzz") = 8 349 * unlimitedCompare("hello", "hallo") = 1 350 * </pre> 351 * 352 * @param <E> The type of similarity score unit. 353 * @param left the first CharSequence, must not be null 354 * @param right the second CharSequence, must not be null 355 * @return result distance, or -1 356 * @throws IllegalArgumentException if either CharSequence input is {@code null} 357 */ 358 private static <E> LevenshteinResults unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) { 359 if (left == null || right == null) { 360 throw new IllegalArgumentException("CharSequences must not be null"); 361 } 362 363 /* 364 The difference between this impl. and the previous is that, rather 365 than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, 366 we maintain two single-dimensional arrays of length s.length() + 1. The first, d, 367 is the 'current working' distance array that maintains the newest distance cost 368 counts as we iterate through the characters of String s. Each time we increment 369 the index of String t we are comparing, d is copied to p, the second int[]. Doing so 370 allows us to retain the previous cost counts as required by the algorithm (taking 371 the minimum of the cost count to the left, up one, and diagonally up and to the left 372 of the current cost count being calculated). (Note that the arrays aren't really 373 copied anymore, just switched...this is clearly much better than cloning an array 374 or doing a System.arraycopy() each time through the outer loop.) 375 376 Effectively, the difference between the two implementations is this one does not 377 cause an out of memory condition when calculating the LD over two very large strings. 378 */ 379 380 int n = left.length(); // length of left 381 int m = right.length(); // length of right 382 383 if (n == 0) { 384 return new LevenshteinResults(m, m, 0, 0); 385 } 386 if (m == 0) { 387 return new LevenshteinResults(n, 0, n, 0); 388 } 389 boolean swapped = false; 390 if (n > m) { 391 // swap the input strings to consume less memory 392 final SimilarityInput<E> tmp = left; 393 left = right; 394 right = tmp; 395 n = m; 396 m = right.length(); 397 swapped = true; 398 } 399 400 int[] p = new int[n + 1]; // 'previous' cost array, horizontally 401 int[] d = new int[n + 1]; // cost array, horizontally 402 int[] tempD; //placeholder to assist in swapping p and d 403 final int[][] matrix = new int[m + 1][n + 1]; 404 405 // filling the first row and first column values in the matrix 406 for (int index = 0; index <= n; index++) { 407 matrix[0][index] = index; 408 } 409 for (int index = 0; index <= m; index++) { 410 matrix[index][0] = index; 411 } 412 413 // indexes into strings left and right 414 int i; // iterates through left 415 int j; // iterates through right 416 417 E rightJ; // jth character of right 418 419 int cost; // cost 420 for (i = 0; i <= n; i++) { 421 p[i] = i; 422 } 423 424 for (j = 1; j <= m; j++) { 425 rightJ = right.at(j - 1); 426 d[0] = j; 427 428 for (i = 1; i <= n; i++) { 429 cost = left.at(i - 1).equals(rightJ) ? 0 : 1; 430 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost 431 d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); 432 //filling the matrix 433 matrix[j][i] = d[i]; 434 } 435 436 // copy current distance counts to 'previous row' distance counts 437 tempD = p; 438 p = d; 439 d = tempD; 440 } 441 return findDetailedResults(left, right, matrix, swapped); 442 } 443 444 /** 445 * Threshold. 446 */ 447 private final Integer threshold; 448 449 /** 450 * <p> 451 * This returns the default instance that uses a version 452 * of the algorithm that does not use a threshold parameter. 453 * </p> 454 * 455 * @see LevenshteinDetailedDistance#getDefaultInstance() 456 * @deprecated Use {@link #getDefaultInstance()}. 457 */ 458 @Deprecated 459 public LevenshteinDetailedDistance() { 460 this(null); 461 } 462 463 /** 464 * If the threshold is not null, distance calculations will be limited to a maximum length. 465 * 466 * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p> 467 * 468 * @param threshold If this is null then distances calculations will not be limited. This may not be negative. 469 */ 470 public LevenshteinDetailedDistance(final Integer threshold) { 471 if (threshold != null && threshold < 0) { 472 throw new IllegalArgumentException("Threshold must not be negative"); 473 } 474 this.threshold = threshold; 475 } 476 477 /** 478 * Computes the Levenshtein distance between two Strings. 479 * 480 * <p>A higher score indicates a greater distance.</p> 481 * 482 * <p>The previous implementation of the Levenshtein distance algorithm 483 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 484 * 485 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 486 * which can occur when my Java implementation is used with very large strings.<br> 487 * This implementation of the Levenshtein distance algorithm 488 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 489 * 490 * <pre> 491 * distance.apply(null, *) = IllegalArgumentException 492 * distance.apply(*, null) = IllegalArgumentException 493 * distance.apply("","") = 0 494 * distance.apply("","a") = 1 495 * distance.apply("aaapppp", "") = 7 496 * distance.apply("frog", "fog") = 1 497 * distance.apply("fly", "ant") = 3 498 * distance.apply("elephant", "hippo") = 7 499 * distance.apply("hippo", "elephant") = 7 500 * distance.apply("hippo", "zzzzzzzz") = 8 501 * distance.apply("hello", "hallo") = 1 502 * </pre> 503 * 504 * @param left the first input, must not be null 505 * @param right the second input, must not be null 506 * @return result distance, or -1 507 * @throws IllegalArgumentException if either String input {@code null} 508 */ 509 @Override 510 public LevenshteinResults apply(final CharSequence left, final CharSequence right) { 511 return apply(SimilarityInput.input(left), SimilarityInput.input(right)); 512 } 513 514 /** 515 * Computes the Levenshtein distance between two Strings. 516 * 517 * <p>A higher score indicates a greater distance.</p> 518 * 519 * <p>The previous implementation of the Levenshtein distance algorithm 520 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 521 * 522 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 523 * which can occur when my Java implementation is used with very large strings.<br> 524 * This implementation of the Levenshtein distance algorithm 525 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 526 * 527 * <pre> 528 * distance.apply(null, *) = IllegalArgumentException 529 * distance.apply(*, null) = IllegalArgumentException 530 * distance.apply("","") = 0 531 * distance.apply("","a") = 1 532 * distance.apply("aaapppp", "") = 7 533 * distance.apply("frog", "fog") = 1 534 * distance.apply("fly", "ant") = 3 535 * distance.apply("elephant", "hippo") = 7 536 * distance.apply("hippo", "elephant") = 7 537 * distance.apply("hippo", "zzzzzzzz") = 8 538 * distance.apply("hello", "hallo") = 1 539 * </pre> 540 * 541 * @param <E> The type of similarity score unit. 542 * @param left the first input, must not be null 543 * @param right the second input, must not be null 544 * @return result distance, or -1 545 * @throws IllegalArgumentException if either String input {@code null} 546 * @since 1.13.0 547 */ 548 public <E> LevenshteinResults apply(final SimilarityInput<E> left, final SimilarityInput<E> right) { 549 if (threshold != null) { 550 return limitedCompare(left, right, threshold); 551 } 552 return unlimitedCompare(left, right); 553 } 554 555 /** 556 * Gets the distance threshold. 557 * 558 * @return The distance threshold 559 */ 560 public Integer getThreshold() { 561 return threshold; 562 } 563}