1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text.similarity;
18
19 /**
20 * A similarity algorithm indicating the length of the longest common subsequence between two strings.
21 *
22 * <p>
23 * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in
24 * common. Two strings that are entirely different, return a value of 0, and two strings that return a value
25 * of the commonly shared length implies that the strings are completely the same in value and position.
26 * <em>Note.</em> Generally this algorithm is fairly inefficient, as for length <em>m</em>, <em>n</em> of the input
27 * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the
28 * algorithm is <em>O(m*n)</em>.
29 * </p>
30 *
31 * <p>
32 * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space
33 * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings.
34 * </p>
35 *
36 * <p>
37 * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below).
38 * </p>
39 *
40 * <p>For further reading see:</p>
41 * <ul>
42 * <li>
43 * Lothaire, M. <em>Applied combinatorics on words</em>. New York: Cambridge U Press, 2005. <strong>12-13</strong>
44 * </li>
45 * <li>
46 * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343.
47 * </li>
48 * </ul>
49 *
50 * @since 1.0
51 */
52 public class LongestCommonSubsequence implements SimilarityScore<Integer> {
53
54 /**
55 * The singleton instance.
56 */
57 static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence();
58
59 /**
60 * An implementation of "ALG B" from Hirschberg's CACM '71 paper.
61 * Assuming the first input sequence is of size {@code m} and the second input sequence is of size
62 * {@code n}, this method returns the last row of the dynamic programming (DP) table when calculating
63 * the LCS of the two sequences in <em>O(m*n)</em> time and <em>O(n)</em> space.
64 * The last element of the returned array, is the size of the LCS of the two input sequences.
65 *
66 * @param left first input sequence.
67 * @param right second input sequence.
68 * @return last row of the dynamic-programming (DP) table for calculating the LCS of {@code left} and {@code right}.
69 * @since 1.10.0
70 */
71 private static int[] algorithmB(final CharSequence left, final CharSequence right) {
72 final int m = left.length();
73 final int n = right.length();
74 // Creating an array for storing two rows of DP table
75 final int[][] dpRows = new int[2][1 + n];
76 for (int i = 1; i <= m; i++) {
77 // K(0, j) <- K(1, j) [j = 0...n], as per the paper:
78 // Since we have references in Java, we can swap references instead of literal copying.
79 // We could also use a "binary index" using modulus operator, but directly swapping the
80 // two rows helps readability and keeps the code consistent with the algorithm description
81 // in the paper.
82 final int[] temp = dpRows[0];
83 dpRows[0] = dpRows[1];
84 dpRows[1] = temp;
85
86 for (int j = 1; j <= n; j++) {
87 if (left.charAt(i - 1) == right.charAt(j - 1)) {
88 dpRows[1][j] = dpRows[0][j - 1] + 1;
89 } else {
90 dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]);
91 }
92 }
93 }
94 // LL(j) <- K(1, j) [j=0...n], as per the paper:
95 // We don't need literal copying of the array, we can just return the reference
96 return dpRows[1];
97 }
98
99 /**
100 * An implementation of "ALG C" from Hirschberg's CACM '71 paper.
101 * Assuming the first input sequence is of size {@code m} and the second input sequence is of size
102 * {@code n}, this method returns the Longest Common Subsequence (LCS) of the two sequences in
103 * <em>O(m*n)</em> time and <em>O(m+n)</em> space.
104 *
105 * @param left first input sequence.
106 * @param right second input sequence.
107 * @return the LCS of {@code left} and {@code right}.
108 * @since 1.10.0
109 */
110 private static String algorithmC(final CharSequence left, final CharSequence right) {
111 final int m = left.length();
112 final int n = right.length();
113 final StringBuilder out = new StringBuilder();
114 if (m == 1) { // Handle trivial cases, as per the paper
115 final char leftCh = left.charAt(0);
116 for (int j = 0; j < n; j++) {
117 if (leftCh == right.charAt(j)) {
118 out.append(leftCh);
119 break;
120 }
121 }
122 } else if (n > 0 && m > 1) {
123 final int mid = m / 2; // Find the middle point
124 final CharSequence leftFirstPart = left.subSequence(0, mid);
125 final CharSequence leftSecondPart = left.subSequence(mid, m);
126 // Step 3 of the algorithm: two calls to Algorithm B
127 final int[] l1 = algorithmB(leftFirstPart, right);
128 final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right));
129 // Find k, as per the Step 4 of the algorithm
130 int k = 0;
131 int t = 0;
132 for (int j = 0; j <= n; j++) {
133 final int s = l1[j] + l2[n - j];
134 if (t < s) {
135 t = s;
136 k = j;
137 }
138 }
139 // Step 5: solve simpler problems, recursively
140 out.append(algorithmC(leftFirstPart, right.subSequence(0, k)));
141 out.append(algorithmC(leftSecondPart, right.subSequence(k, n)));
142 }
143
144 return out.toString();
145 }
146
147 /*
148 * A method for CharSequence reversal.
149 */
150 private static String reverse(final CharSequence s) {
151 return new StringBuilder(s).reverse().toString();
152 }
153
154 /**
155 * Creates a new instance.
156 */
157 public LongestCommonSubsequence() {
158 // empty
159 }
160
161 /**
162 * Computes the longest common subsequence similarity score of two {@code CharSequence}'s passed as
163 * input.
164 *
165 * <p>
166 * This method implements a more efficient version of LCS algorithm which has quadratic time and
167 * linear space complexity.
168 * </p>
169 *
170 * <p>
171 * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}.
172 * An evaluation using JMH revealed that this method is almost two times faster than its previous version.
173 * </p>
174 *
175 * @param left first character sequence.
176 * @param right second character sequence.
177 * @return length of the longest common subsequence of {@code left} and {@code right}.
178 * @throws IllegalArgumentException if either String input {@code null}.
179 */
180 @Override
181 public Integer apply(final CharSequence left, final CharSequence right) {
182 // Quick return for invalid inputs
183 if (left == null || right == null) {
184 throw new IllegalArgumentException("Inputs must not be null");
185 }
186 // Find lengths of two strings
187 final int leftSz = left.length();
188 final int rightSz = right.length();
189 // Check if we can avoid calling algorithmB which involves heap space allocation
190 if (leftSz == 0 || rightSz == 0) {
191 return 0;
192 }
193 // Check if we can save even more space
194 if (leftSz < rightSz) {
195 return algorithmB(right, left)[leftSz];
196 }
197 return algorithmB(left, right)[rightSz];
198 }
199
200 /**
201 * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input.
202 *
203 * <p>
204 * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
205 * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
206 * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
207 * subsequence need not have adjacent characters.
208 * </p>
209 *
210 * <p>
211 * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that
212 * can be derived from another sequence by deleting some elements without changing the order of the remaining
213 * elements.
214 * </p>
215 *
216 * @param left first character sequence.
217 * @param right second character sequence.
218 * @return the longest common subsequence found.
219 * @throws IllegalArgumentException if either String input {@code null}.
220 * @deprecated Deprecated as of 1.2 due to a typo in the method name.
221 * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead.
222 * This method will be removed in 2.0.
223 */
224 @Deprecated
225 public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) {
226 return longestCommonSubsequence(left, right);
227 }
228
229 /**
230 * Computes the longest common subsequence between the two {@code CharSequence}'s passed as
231 * input.
232 *
233 * <p>
234 * This method implements a more efficient version of LCS algorithm which although has quadratic time, it
235 * has linear space complexity.
236 * </p>
237 *
238 *
239 * <p>
240 * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
241 * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
242 * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
243 * subsequence need not have adjacent characters.
244 * </p>
245 *
246 * <p>
247 * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that
248 * can be derived from another sequence by deleting some elements without changing the order of the remaining
249 * elements.
250 * </p>
251 *
252 * @param left first character sequence.
253 * @param right second character sequence.
254 * @return the longest common subsequence found.
255 * @throws IllegalArgumentException if either String input {@code null}.
256 * @since 1.2
257 */
258 public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) {
259 // Quick return
260 if (left == null || right == null) {
261 throw new IllegalArgumentException("Inputs must not be null");
262 }
263 // Find lengths of two strings
264 final int leftSz = left.length();
265 final int rightSz = right.length();
266
267 // Check if we can avoid calling algorithmC which involves heap space allocation
268 if (leftSz == 0 || rightSz == 0) {
269 return "";
270 }
271
272 // Check if we can save even more space
273 if (leftSz < rightSz) {
274 return algorithmC(right, left);
275 }
276 return algorithmC(left, right);
277 }
278
279 /**
280 * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the
281 * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being
282 * O(m*n), where m=left.length() and n=right.length().
283 *
284 * @param left first character sequence.
285 * @param right second character sequence.
286 * @return longest common substring length array.
287 * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available.
288 * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS.
289 * This method will be removed in 2.0.
290 */
291 @Deprecated
292 public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) {
293 final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1];
294 for (int i = 0; i < left.length(); i++) {
295 for (int j = 0; j < right.length(); j++) {
296 if (i == 0) {
297 lcsLengthArray[i][j] = 0;
298 }
299 if (j == 0) {
300 lcsLengthArray[i][j] = 0;
301 }
302 if (left.charAt(i) == right.charAt(j)) {
303 lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
304 } else {
305 lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]);
306 }
307 }
308 }
309 return lcsLengthArray;
310 }
311
312 }