View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  /**
20   * A similarity algorithm indicating the length of the longest common subsequence between two strings.
21   *
22   * <p>
23   * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in
24   * common. Two strings that are entirely different, return a value of 0, and two strings that return a value
25   * of the commonly shared length implies that the strings are completely the same in value and position.
26   * <i>Note.</i>  Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input
27   * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the
28   * algorithm is <i>O(m*n)</i>.
29   * </p>
30   *
31   * <p>
32   * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space
33   * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings.
34   * </p>
35   *
36   * <p>
37   * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below).
38   * </p>
39   *
40   * <p>For further reading see:</p>
41   * <ul>
42   * <li>
43   * Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b>
44   * </li>
45   * <li>
46   * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343.
47   * </li>
48   * </ul>
49   *
50   * @since 1.0
51   */
52  public class LongestCommonSubsequence implements SimilarityScore<Integer> {
53  
54      /**
55       * Singleton instance.
56       */
57      static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence();
58  
59      /**
60       * An implementation of "ALG B" from Hirschberg's CACM '71 paper.
61       * Assuming the first input sequence is of size <code>m</code> and the second input sequence is of size
62       * <code>n</code>, this method returns the last row of the dynamic programming (DP) table when calculating
63       * the LCS of the two sequences in <i>O(m*n)</i> time and <i>O(n)</i> space.
64       * The last element of the returned array, is the size of the LCS of the two input sequences.
65       *
66       * @param left first input sequence.
67       * @param right second input sequence.
68       * @return last row of the dynamic-programming (DP) table for calculating the LCS of <code>left</code> and <code>right</code>
69       * @since 1.10.0
70       */
71      private static int[] algorithmB(final CharSequence left, final CharSequence right) {
72          final int m = left.length();
73          final int n = right.length();
74  
75          // Creating an array for storing two rows of DP table
76          final int[][] dpRows = new int[2][1 + n];
77  
78          for (int i = 1; i <= m; i++) {
79              // K(0, j) <- K(1, j) [j = 0...n], as per the paper:
80              // Since we have references in Java, we can swap references instead of literal copying.
81              // We could also use a "binary index" using modulus operator, but directly swapping the
82              // two rows helps readability and keeps the code consistent with the algorithm description
83              // in the paper.
84              final int[] temp = dpRows[0];
85              dpRows[0] = dpRows[1];
86              dpRows[1] = temp;
87  
88              for (int j = 1; j <= n; j++) {
89                  if (left.charAt(i - 1) == right.charAt(j - 1)) {
90                      dpRows[1][j] = dpRows[0][j - 1] + 1;
91                  } else {
92                      dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]);
93                  }
94              }
95          }
96  
97          // LL(j) <- K(1, j) [j=0...n], as per the paper:
98          // We don't need literal copying of the array, we can just return the reference
99          return dpRows[1];
100     }
101 
102     /**
103      * An implementation of "ALG C" from Hirschberg's CACM '71 paper.
104      * Assuming the first input sequence is of size <code>m</code> and the second input sequence is of size
105      * <code>n</code>, this method returns the Longest Common Subsequence (LCS) of the two sequences in
106      * <i>O(m*n)</i> time and <i>O(m+n)</i> space.
107      *
108      * @param left first input sequence.
109      * @param right second input sequence.
110      * @return the LCS of <code>left</code> and <code>right</code>
111      * @since 1.10.0
112      */
113     private static String algorithmC(final CharSequence left, final CharSequence right) {
114         final int m = left.length();
115         final int n = right.length();
116 
117         final StringBuilder out = new StringBuilder();
118 
119         if (m == 1) { // Handle trivial cases, as per the paper
120             final char leftCh = left.charAt(0);
121             for (int j = 0; j < n; j++) {
122                 if (leftCh == right.charAt(j)) {
123                     out.append(leftCh);
124                     break;
125                 }
126             }
127         } else if (n > 0 && m > 1) {
128             final int mid = m / 2; // Find the middle point
129 
130             final CharSequence leftFirstPart = left.subSequence(0, mid);
131             final CharSequence leftSecondPart = left.subSequence(mid, m);
132 
133             // Step 3 of the algorithm: two calls to Algorithm B
134             final int[] l1 = algorithmB(leftFirstPart, right);
135             final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right));
136 
137             // Find k, as per the Step 4 of the algorithm
138             int k = 0;
139             int t = 0;
140             for (int j = 0; j <= n; j++) {
141                 final int s = l1[j] + l2[n - j];
142                 if (t < s) {
143                     t = s;
144                     k = j;
145                 }
146             }
147 
148             // Step 5: solve simpler problems, recursively
149             out.append(algorithmC(leftFirstPart, right.subSequence(0, k)));
150             out.append(algorithmC(leftSecondPart, right.subSequence(k, n)));
151         }
152 
153         return out.toString();
154     }
155 
156     // An auxiliary method for CharSequence reversal
157     private static String reverse(final CharSequence s) {
158         return new StringBuilder(s).reverse().toString();
159     }
160 
161     /**
162      * Calculates the longest common subsequence similarity score of two {@code CharSequence}'s passed as
163      * input.
164      *
165      * <p>
166      * This method implements a more efficient version of LCS algorithm which has quadratic time and
167      * linear space complexity.
168      * </p>
169      *
170      * <p>
171      * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}.
172      * An evaluation using JMH revealed that this method is almost two times faster than its previous version.
173      * </p>
174      *
175      * @param left first character sequence
176      * @param right second character sequence
177      * @return length of the longest common subsequence of <code>left</code> and <code>right</code>
178      * @throws IllegalArgumentException if either String input {@code null}
179      */
180     @Override
181     public Integer apply(final CharSequence left, final CharSequence right) {
182         // Quick return for invalid inputs
183         if (left == null || right == null) {
184             throw new IllegalArgumentException("Inputs must not be null");
185         }
186         // Find lengths of two strings
187         final int leftSz = left.length();
188         final int rightSz = right.length();
189 
190         // Check if we can avoid calling algorithmB which involves heap space allocation
191         if (leftSz == 0 || rightSz == 0) {
192             return 0;
193         }
194 
195         // Check if we can save even more space
196         if (leftSz < rightSz) {
197             return algorithmB(right, left)[leftSz];
198         }
199         return algorithmB(left, right)[rightSz];
200     }
201 
202     /**
203      * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input.
204      *
205      * <p>
206      * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
207      * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
208      * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
209      * subsequence need not have adjacent characters.
210      * </p>
211      *
212      * <p>
213      * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that
214      * can be derived from another sequence by deleting some elements without changing the order of the remaining
215      * elements.
216      * </p>
217      *
218      * @param left first character sequence
219      * @param right second character sequence
220      * @return the longest common subsequence found
221      * @throws IllegalArgumentException if either String input {@code null}
222      * @deprecated Deprecated as of 1.2 due to a typo in the method name.
223      * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead.
224      * This method will be removed in 2.0.
225      */
226     @Deprecated
227     public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) {
228         return longestCommonSubsequence(left, right);
229     }
230 
231     /**
232      * Computes the longest common subsequence between the two {@code CharSequence}'s passed as
233      * input.
234      *
235      * <p>
236      * This method implements a more efficient version of LCS algorithm which although has quadratic time, it
237      * has linear space complexity.
238      * </p>
239      *
240      *
241      * <p>
242      * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
243      * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
244      * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
245      * subsequence need not have adjacent characters.
246      * </p>
247      *
248      * <p>
249      * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that
250      * can be derived from another sequence by deleting some elements without changing the order of the remaining
251      * elements.
252      * </p>
253      *
254      * @param left first character sequence
255      * @param right second character sequence
256      * @return the longest common subsequence found
257      * @throws IllegalArgumentException if either String input {@code null}
258      * @since 1.2
259      */
260     public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) {
261         // Quick return
262         if (left == null || right == null) {
263             throw new IllegalArgumentException("Inputs must not be null");
264         }
265         // Find lengths of two strings
266         final int leftSz = left.length();
267         final int rightSz = right.length();
268 
269         // Check if we can avoid calling algorithmC which involves heap space allocation
270         if (leftSz == 0 || rightSz == 0) {
271             return "";
272         }
273 
274         // Check if we can save even more space
275         if (leftSz < rightSz) {
276             return algorithmC(right, left);
277         }
278         return algorithmC(left, right);
279     }
280 
281     /**
282      * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the
283      * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being
284      * O(m*n), where m=left.length() and n=right.length().
285      *
286      * @param left first character sequence
287      * @param right second character sequence
288      * @return lcsLengthArray
289      * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available.
290      * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS.
291      * This method will be removed in 2.0.
292      */
293     @Deprecated
294     public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) {
295         final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1];
296         for (int i = 0; i < left.length(); i++) {
297             for (int j = 0; j < right.length(); j++) {
298                 if (i == 0) {
299                     lcsLengthArray[i][j] = 0;
300                 }
301                 if (j == 0) {
302                     lcsLengthArray[i][j] = 0;
303                 }
304                 if (left.charAt(i) == right.charAt(j)) {
305                     lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
306                 } else {
307                     lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]);
308                 }
309             }
310         }
311         return lcsLengthArray;
312     }
313 
314 }