1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.text.similarity; 18 19 /** 20 * A similarity algorithm indicating the length of the longest common subsequence between two strings. 21 * 22 * <p> 23 * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in 24 * common. Two strings that are entirely different, return a value of 0, and two strings that return a value 25 * of the commonly shared length implies that the strings are completely the same in value and position. 26 * <i>Note.</i> Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input 27 * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the 28 * algorithm is <i>O(m*n)</i>. 29 * </p> 30 * 31 * <p> 32 * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space 33 * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings. 34 * </p> 35 * 36 * <p> 37 * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below). 38 * </p> 39 * 40 * <p>For further reading see:</p> 41 * <ul> 42 * <li> 43 * Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b> 44 * </li> 45 * <li> 46 * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343. 47 * </li> 48 * </ul> 49 * 50 * @since 1.0 51 */ 52 public class LongestCommonSubsequence implements SimilarityScore<Integer> { 53 54 /** 55 * Singleton instance. 56 */ 57 static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence(); 58 59 /** 60 * An implementation of "ALG B" from Hirschberg's CACM '71 paper. 61 * Assuming the first input sequence is of size <code>m</code> and the second input sequence is of size 62 * <code>n</code>, this method returns the last row of the dynamic programming (DP) table when calculating 63 * the LCS of the two sequences in <i>O(m*n)</i> time and <i>O(n)</i> space. 64 * The last element of the returned array, is the size of the LCS of the two input sequences. 65 * 66 * @param left first input sequence. 67 * @param right second input sequence. 68 * @return last row of the dynamic-programming (DP) table for calculating the LCS of <code>left</code> and <code>right</code> 69 * @since 1.10.0 70 */ 71 private static int[] algorithmB(final CharSequence left, final CharSequence right) { 72 final int m = left.length(); 73 final int n = right.length(); 74 75 // Creating an array for storing two rows of DP table 76 final int[][] dpRows = new int[2][1 + n]; 77 78 for (int i = 1; i <= m; i++) { 79 // K(0, j) <- K(1, j) [j = 0...n], as per the paper: 80 // Since we have references in Java, we can swap references instead of literal copying. 81 // We could also use a "binary index" using modulus operator, but directly swapping the 82 // two rows helps readability and keeps the code consistent with the algorithm description 83 // in the paper. 84 final int[] temp = dpRows[0]; 85 dpRows[0] = dpRows[1]; 86 dpRows[1] = temp; 87 88 for (int j = 1; j <= n; j++) { 89 if (left.charAt(i - 1) == right.charAt(j - 1)) { 90 dpRows[1][j] = dpRows[0][j - 1] + 1; 91 } else { 92 dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]); 93 } 94 } 95 } 96 97 // LL(j) <- K(1, j) [j=0...n], as per the paper: 98 // We don't need literal copying of the array, we can just return the reference 99 return dpRows[1]; 100 } 101 102 /** 103 * An implementation of "ALG C" from Hirschberg's CACM '71 paper. 104 * Assuming the first input sequence is of size <code>m</code> and the second input sequence is of size 105 * <code>n</code>, this method returns the Longest Common Subsequence (LCS) of the two sequences in 106 * <i>O(m*n)</i> time and <i>O(m+n)</i> space. 107 * 108 * @param left first input sequence. 109 * @param right second input sequence. 110 * @return the LCS of <code>left</code> and <code>right</code> 111 * @since 1.10.0 112 */ 113 private static String algorithmC(final CharSequence left, final CharSequence right) { 114 final int m = left.length(); 115 final int n = right.length(); 116 117 final StringBuilder out = new StringBuilder(); 118 119 if (m == 1) { // Handle trivial cases, as per the paper 120 final char leftCh = left.charAt(0); 121 for (int j = 0; j < n; j++) { 122 if (leftCh == right.charAt(j)) { 123 out.append(leftCh); 124 break; 125 } 126 } 127 } else if (n > 0 && m > 1) { 128 final int mid = m / 2; // Find the middle point 129 130 final CharSequence leftFirstPart = left.subSequence(0, mid); 131 final CharSequence leftSecondPart = left.subSequence(mid, m); 132 133 // Step 3 of the algorithm: two calls to Algorithm B 134 final int[] l1 = algorithmB(leftFirstPart, right); 135 final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right)); 136 137 // Find k, as per the Step 4 of the algorithm 138 int k = 0; 139 int t = 0; 140 for (int j = 0; j <= n; j++) { 141 final int s = l1[j] + l2[n - j]; 142 if (t < s) { 143 t = s; 144 k = j; 145 } 146 } 147 148 // Step 5: solve simpler problems, recursively 149 out.append(algorithmC(leftFirstPart, right.subSequence(0, k))); 150 out.append(algorithmC(leftSecondPart, right.subSequence(k, n))); 151 } 152 153 return out.toString(); 154 } 155 156 // An auxiliary method for CharSequence reversal 157 private static String reverse(final CharSequence s) { 158 return new StringBuilder(s).reverse().toString(); 159 } 160 161 /** 162 * Calculates the longest common subsequence similarity score of two {@code CharSequence}'s passed as 163 * input. 164 * 165 * <p> 166 * This method implements a more efficient version of LCS algorithm which has quadratic time and 167 * linear space complexity. 168 * </p> 169 * 170 * <p> 171 * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}. 172 * An evaluation using JMH revealed that this method is almost two times faster than its previous version. 173 * </p> 174 * 175 * @param left first character sequence 176 * @param right second character sequence 177 * @return length of the longest common subsequence of <code>left</code> and <code>right</code> 178 * @throws IllegalArgumentException if either String input {@code null} 179 */ 180 @Override 181 public Integer apply(final CharSequence left, final CharSequence right) { 182 // Quick return for invalid inputs 183 if (left == null || right == null) { 184 throw new IllegalArgumentException("Inputs must not be null"); 185 } 186 // Find lengths of two strings 187 final int leftSz = left.length(); 188 final int rightSz = right.length(); 189 190 // Check if we can avoid calling algorithmB which involves heap space allocation 191 if (leftSz == 0 || rightSz == 0) { 192 return 0; 193 } 194 195 // Check if we can save even more space 196 if (leftSz < rightSz) { 197 return algorithmB(right, left)[leftSz]; 198 } 199 return algorithmB(left, right)[rightSz]; 200 } 201 202 /** 203 * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input. 204 * 205 * <p> 206 * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and 207 * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, 208 * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a 209 * subsequence need not have adjacent characters. 210 * </p> 211 * 212 * <p> 213 * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that 214 * can be derived from another sequence by deleting some elements without changing the order of the remaining 215 * elements. 216 * </p> 217 * 218 * @param left first character sequence 219 * @param right second character sequence 220 * @return the longest common subsequence found 221 * @throws IllegalArgumentException if either String input {@code null} 222 * @deprecated Deprecated as of 1.2 due to a typo in the method name. 223 * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead. 224 * This method will be removed in 2.0. 225 */ 226 @Deprecated 227 public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { 228 return longestCommonSubsequence(left, right); 229 } 230 231 /** 232 * Computes the longest common subsequence between the two {@code CharSequence}'s passed as 233 * input. 234 * 235 * <p> 236 * This method implements a more efficient version of LCS algorithm which although has quadratic time, it 237 * has linear space complexity. 238 * </p> 239 * 240 * 241 * <p> 242 * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and 243 * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, 244 * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a 245 * subsequence need not have adjacent characters. 246 * </p> 247 * 248 * <p> 249 * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that 250 * can be derived from another sequence by deleting some elements without changing the order of the remaining 251 * elements. 252 * </p> 253 * 254 * @param left first character sequence 255 * @param right second character sequence 256 * @return the longest common subsequence found 257 * @throws IllegalArgumentException if either String input {@code null} 258 * @since 1.2 259 */ 260 public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) { 261 // Quick return 262 if (left == null || right == null) { 263 throw new IllegalArgumentException("Inputs must not be null"); 264 } 265 // Find lengths of two strings 266 final int leftSz = left.length(); 267 final int rightSz = right.length(); 268 269 // Check if we can avoid calling algorithmC which involves heap space allocation 270 if (leftSz == 0 || rightSz == 0) { 271 return ""; 272 } 273 274 // Check if we can save even more space 275 if (leftSz < rightSz) { 276 return algorithmC(right, left); 277 } 278 return algorithmC(left, right); 279 } 280 281 /** 282 * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the 283 * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being 284 * O(m*n), where m=left.length() and n=right.length(). 285 * 286 * @param left first character sequence 287 * @param right second character sequence 288 * @return lcsLengthArray 289 * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available. 290 * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS. 291 * This method will be removed in 2.0. 292 */ 293 @Deprecated 294 public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { 295 final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; 296 for (int i = 0; i < left.length(); i++) { 297 for (int j = 0; j < right.length(); j++) { 298 if (i == 0) { 299 lcsLengthArray[i][j] = 0; 300 } 301 if (j == 0) { 302 lcsLengthArray[i][j] = 0; 303 } 304 if (left.charAt(i) == right.charAt(j)) { 305 lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; 306 } else { 307 lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); 308 } 309 } 310 } 311 return lcsLengthArray; 312 } 313 314 }