001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019/** 020 * A similarity algorithm indicating the length of the longest common subsequence between two strings. 021 * 022 * <p> 023 * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in 024 * common. Two strings that are entirely different, return a value of 0, and two strings that return a value 025 * of the commonly shared length implies that the strings are completely the same in value and position. 026 * <em>Note.</em> Generally this algorithm is fairly inefficient, as for length <em>m</em>, <em>n</em> of the input 027 * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the 028 * algorithm is <em>O(m*n)</em>. 029 * </p> 030 * 031 * <p> 032 * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space 033 * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings. 034 * </p> 035 * 036 * <p> 037 * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below). 038 * </p> 039 * 040 * <p>For further reading see:</p> 041 * <ul> 042 * <li> 043 * Lothaire, M. <em>Applied combinatorics on words</em>. New York: Cambridge U Press, 2005. <strong>12-13</strong> 044 * </li> 045 * <li> 046 * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343. 047 * </li> 048 * </ul> 049 * 050 * @since 1.0 051 */ 052public class LongestCommonSubsequence implements SimilarityScore<Integer> { 053 054 /** 055 * The singleton instance. 056 */ 057 static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence(); 058 059 /** 060 * An implementation of "ALG B" from Hirschberg's CACM '71 paper. 061 * Assuming the first input sequence is of size {@code m} and the second input sequence is of size 062 * {@code n}, this method returns the last row of the dynamic programming (DP) table when calculating 063 * the LCS of the two sequences in <em>O(m*n)</em> time and <em>O(n)</em> space. 064 * The last element of the returned array, is the size of the LCS of the two input sequences. 065 * 066 * @param left first input sequence. 067 * @param right second input sequence. 068 * @return last row of the dynamic-programming (DP) table for calculating the LCS of {@code left} and {@code right}. 069 * @since 1.10.0 070 */ 071 private static int[] algorithmB(final CharSequence left, final CharSequence right) { 072 final int m = left.length(); 073 final int n = right.length(); 074 // Creating an array for storing two rows of DP table 075 final int[][] dpRows = new int[2][1 + n]; 076 for (int i = 1; i <= m; i++) { 077 // K(0, j) <- K(1, j) [j = 0...n], as per the paper: 078 // Since we have references in Java, we can swap references instead of literal copying. 079 // We could also use a "binary index" using modulus operator, but directly swapping the 080 // two rows helps readability and keeps the code consistent with the algorithm description 081 // in the paper. 082 final int[] temp = dpRows[0]; 083 dpRows[0] = dpRows[1]; 084 dpRows[1] = temp; 085 086 for (int j = 1; j <= n; j++) { 087 if (left.charAt(i - 1) == right.charAt(j - 1)) { 088 dpRows[1][j] = dpRows[0][j - 1] + 1; 089 } else { 090 dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]); 091 } 092 } 093 } 094 // LL(j) <- K(1, j) [j=0...n], as per the paper: 095 // We don't need literal copying of the array, we can just return the reference 096 return dpRows[1]; 097 } 098 099 /** 100 * An implementation of "ALG C" from Hirschberg's CACM '71 paper. 101 * Assuming the first input sequence is of size {@code m} and the second input sequence is of size 102 * {@code n}, this method returns the Longest Common Subsequence (LCS) of the two sequences in 103 * <em>O(m*n)</em> time and <em>O(m+n)</em> space. 104 * 105 * @param left first input sequence. 106 * @param right second input sequence. 107 * @return the LCS of {@code left} and {@code right}. 108 * @since 1.10.0 109 */ 110 private static String algorithmC(final CharSequence left, final CharSequence right) { 111 final int m = left.length(); 112 final int n = right.length(); 113 final StringBuilder out = new StringBuilder(); 114 if (m == 1) { // Handle trivial cases, as per the paper 115 final char leftCh = left.charAt(0); 116 for (int j = 0; j < n; j++) { 117 if (leftCh == right.charAt(j)) { 118 out.append(leftCh); 119 break; 120 } 121 } 122 } else if (n > 0 && m > 1) { 123 final int mid = m / 2; // Find the middle point 124 final CharSequence leftFirstPart = left.subSequence(0, mid); 125 final CharSequence leftSecondPart = left.subSequence(mid, m); 126 // Step 3 of the algorithm: two calls to Algorithm B 127 final int[] l1 = algorithmB(leftFirstPart, right); 128 final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right)); 129 // Find k, as per the Step 4 of the algorithm 130 int k = 0; 131 int t = 0; 132 for (int j = 0; j <= n; j++) { 133 final int s = l1[j] + l2[n - j]; 134 if (t < s) { 135 t = s; 136 k = j; 137 } 138 } 139 // Step 5: solve simpler problems, recursively 140 out.append(algorithmC(leftFirstPart, right.subSequence(0, k))); 141 out.append(algorithmC(leftSecondPart, right.subSequence(k, n))); 142 } 143 144 return out.toString(); 145 } 146 147 /* 148 * A method for CharSequence reversal. 149 */ 150 private static String reverse(final CharSequence s) { 151 return new StringBuilder(s).reverse().toString(); 152 } 153 154 /** 155 * Creates a new instance. 156 */ 157 public LongestCommonSubsequence() { 158 // empty 159 } 160 161 /** 162 * Computes the longest common subsequence similarity score of two {@code CharSequence}'s passed as 163 * input. 164 * 165 * <p> 166 * This method implements a more efficient version of LCS algorithm which has quadratic time and 167 * linear space complexity. 168 * </p> 169 * 170 * <p> 171 * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}. 172 * An evaluation using JMH revealed that this method is almost two times faster than its previous version. 173 * </p> 174 * 175 * @param left first character sequence. 176 * @param right second character sequence. 177 * @return length of the longest common subsequence of {@code left} and {@code right}. 178 * @throws IllegalArgumentException if either String input {@code null}. 179 */ 180 @Override 181 public Integer apply(final CharSequence left, final CharSequence right) { 182 // Quick return for invalid inputs 183 if (left == null || right == null) { 184 throw new IllegalArgumentException("Inputs must not be null"); 185 } 186 // Find lengths of two strings 187 final int leftSz = left.length(); 188 final int rightSz = right.length(); 189 // Check if we can avoid calling algorithmB which involves heap space allocation 190 if (leftSz == 0 || rightSz == 0) { 191 return 0; 192 } 193 // Check if we can save even more space 194 if (leftSz < rightSz) { 195 return algorithmB(right, left)[leftSz]; 196 } 197 return algorithmB(left, right)[rightSz]; 198 } 199 200 /** 201 * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input. 202 * 203 * <p> 204 * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and 205 * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, 206 * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a 207 * subsequence need not have adjacent characters. 208 * </p> 209 * 210 * <p> 211 * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that 212 * can be derived from another sequence by deleting some elements without changing the order of the remaining 213 * elements. 214 * </p> 215 * 216 * @param left first character sequence. 217 * @param right second character sequence. 218 * @return the longest common subsequence found. 219 * @throws IllegalArgumentException if either String input {@code null}. 220 * @deprecated Deprecated as of 1.2 due to a typo in the method name. 221 * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead. 222 * This method will be removed in 2.0. 223 */ 224 @Deprecated 225 public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { 226 return longestCommonSubsequence(left, right); 227 } 228 229 /** 230 * Computes the longest common subsequence between the two {@code CharSequence}'s passed as 231 * input. 232 * 233 * <p> 234 * This method implements a more efficient version of LCS algorithm which although has quadratic time, it 235 * has linear space complexity. 236 * </p> 237 * 238 * 239 * <p> 240 * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and 241 * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, 242 * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a 243 * subsequence need not have adjacent characters. 244 * </p> 245 * 246 * <p> 247 * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that 248 * can be derived from another sequence by deleting some elements without changing the order of the remaining 249 * elements. 250 * </p> 251 * 252 * @param left first character sequence. 253 * @param right second character sequence. 254 * @return the longest common subsequence found. 255 * @throws IllegalArgumentException if either String input {@code null}. 256 * @since 1.2 257 */ 258 public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) { 259 // Quick return 260 if (left == null || right == null) { 261 throw new IllegalArgumentException("Inputs must not be null"); 262 } 263 // Find lengths of two strings 264 final int leftSz = left.length(); 265 final int rightSz = right.length(); 266 267 // Check if we can avoid calling algorithmC which involves heap space allocation 268 if (leftSz == 0 || rightSz == 0) { 269 return ""; 270 } 271 272 // Check if we can save even more space 273 if (leftSz < rightSz) { 274 return algorithmC(right, left); 275 } 276 return algorithmC(left, right); 277 } 278 279 /** 280 * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the 281 * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being 282 * O(m*n), where m=left.length() and n=right.length(). 283 * 284 * @param left first character sequence. 285 * @param right second character sequence. 286 * @return longest common substring length array. 287 * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available. 288 * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS. 289 * This method will be removed in 2.0. 290 */ 291 @Deprecated 292 public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { 293 final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; 294 for (int i = 0; i < left.length(); i++) { 295 for (int j = 0; j < right.length(); j++) { 296 if (i == 0) { 297 lcsLengthArray[i][j] = 0; 298 } 299 if (j == 0) { 300 lcsLengthArray[i][j] = 0; 301 } 302 if (left.charAt(i) == right.charAt(j)) { 303 lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; 304 } else { 305 lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); 306 } 307 } 308 } 309 return lcsLengthArray; 310 } 311 312}