001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019/**
020 * A similarity algorithm indicating the length of the longest common subsequence between two strings.
021 *
022 * <p>
023 * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in
024 * common. Two strings that are entirely different, return a value of 0, and two strings that return a value
025 * of the commonly shared length implies that the strings are completely the same in value and position.
026 * <em>Note.</em>  Generally this algorithm is fairly inefficient, as for length <em>m</em>, <em>n</em> of the input
027 * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the
028 * algorithm is <em>O(m*n)</em>.
029 * </p>
030 *
031 * <p>
032 * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space
033 * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings.
034 * </p>
035 *
036 * <p>
037 * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below).
038 * </p>
039 *
040 * <p>For further reading see:</p>
041 * <ul>
042 * <li>
043 * Lothaire, M. <em>Applied combinatorics on words</em>. New York: Cambridge U Press, 2005. <strong>12-13</strong>
044 * </li>
045 * <li>
046 * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343.
047 * </li>
048 * </ul>
049 *
050 * @since 1.0
051 */
052public class LongestCommonSubsequence implements SimilarityScore<Integer> {
053
054    /**
055     * The singleton instance.
056     */
057    static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence();
058
059    /**
060     * An implementation of "ALG B" from Hirschberg's CACM '71 paper.
061     * Assuming the first input sequence is of size {@code m} and the second input sequence is of size
062     * {@code n}, this method returns the last row of the dynamic programming (DP) table when calculating
063     * the LCS of the two sequences in <em>O(m*n)</em> time and <em>O(n)</em> space.
064     * The last element of the returned array, is the size of the LCS of the two input sequences.
065     *
066     * @param left first input sequence.
067     * @param right second input sequence.
068     * @return last row of the dynamic-programming (DP) table for calculating the LCS of {@code left} and {@code right}.
069     * @since 1.10.0
070     */
071    private static int[] algorithmB(final CharSequence left, final CharSequence right) {
072        final int m = left.length();
073        final int n = right.length();
074        // Creating an array for storing two rows of DP table
075        final int[][] dpRows = new int[2][1 + n];
076        for (int i = 1; i <= m; i++) {
077            // K(0, j) <- K(1, j) [j = 0...n], as per the paper:
078            // Since we have references in Java, we can swap references instead of literal copying.
079            // We could also use a "binary index" using modulus operator, but directly swapping the
080            // two rows helps readability and keeps the code consistent with the algorithm description
081            // in the paper.
082            final int[] temp = dpRows[0];
083            dpRows[0] = dpRows[1];
084            dpRows[1] = temp;
085
086            for (int j = 1; j <= n; j++) {
087                if (left.charAt(i - 1) == right.charAt(j - 1)) {
088                    dpRows[1][j] = dpRows[0][j - 1] + 1;
089                } else {
090                    dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]);
091                }
092            }
093        }
094        // LL(j) <- K(1, j) [j=0...n], as per the paper:
095        // We don't need literal copying of the array, we can just return the reference
096        return dpRows[1];
097    }
098
099    /**
100     * An implementation of "ALG C" from Hirschberg's CACM '71 paper.
101     * Assuming the first input sequence is of size {@code m} and the second input sequence is of size
102     * {@code n}, this method returns the Longest Common Subsequence (LCS) of the two sequences in
103     * <em>O(m*n)</em> time and <em>O(m+n)</em> space.
104     *
105     * @param left first input sequence.
106     * @param right second input sequence.
107     * @return the LCS of {@code left} and {@code right}.
108     * @since 1.10.0
109     */
110    private static String algorithmC(final CharSequence left, final CharSequence right) {
111        final int m = left.length();
112        final int n = right.length();
113        final StringBuilder out = new StringBuilder();
114        if (m == 1) { // Handle trivial cases, as per the paper
115            final char leftCh = left.charAt(0);
116            for (int j = 0; j < n; j++) {
117                if (leftCh == right.charAt(j)) {
118                    out.append(leftCh);
119                    break;
120                }
121            }
122        } else if (n > 0 && m > 1) {
123            final int mid = m / 2; // Find the middle point
124            final CharSequence leftFirstPart = left.subSequence(0, mid);
125            final CharSequence leftSecondPart = left.subSequence(mid, m);
126            // Step 3 of the algorithm: two calls to Algorithm B
127            final int[] l1 = algorithmB(leftFirstPart, right);
128            final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right));
129            // Find k, as per the Step 4 of the algorithm
130            int k = 0;
131            int t = 0;
132            for (int j = 0; j <= n; j++) {
133                final int s = l1[j] + l2[n - j];
134                if (t < s) {
135                    t = s;
136                    k = j;
137                }
138            }
139            // Step 5: solve simpler problems, recursively
140            out.append(algorithmC(leftFirstPart, right.subSequence(0, k)));
141            out.append(algorithmC(leftSecondPart, right.subSequence(k, n)));
142        }
143
144        return out.toString();
145    }
146
147    /*
148     *  A method for CharSequence reversal.
149     */
150    private static String reverse(final CharSequence s) {
151        return new StringBuilder(s).reverse().toString();
152    }
153
154    /**
155     * Creates a new instance.
156     */
157    public LongestCommonSubsequence() {
158        // empty
159    }
160
161    /**
162     * Computes the longest common subsequence similarity score of two {@code CharSequence}'s passed as
163     * input.
164     *
165     * <p>
166     * This method implements a more efficient version of LCS algorithm which has quadratic time and
167     * linear space complexity.
168     * </p>
169     *
170     * <p>
171     * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}.
172     * An evaluation using JMH revealed that this method is almost two times faster than its previous version.
173     * </p>
174     *
175     * @param left first character sequence.
176     * @param right second character sequence.
177     * @return length of the longest common subsequence of {@code left} and {@code right}.
178     * @throws IllegalArgumentException if either String input {@code null}.
179     */
180    @Override
181    public Integer apply(final CharSequence left, final CharSequence right) {
182        // Quick return for invalid inputs
183        if (left == null || right == null) {
184            throw new IllegalArgumentException("Inputs must not be null");
185        }
186        // Find lengths of two strings
187        final int leftSz = left.length();
188        final int rightSz = right.length();
189        // Check if we can avoid calling algorithmB which involves heap space allocation
190        if (leftSz == 0 || rightSz == 0) {
191            return 0;
192        }
193        // Check if we can save even more space
194        if (leftSz < rightSz) {
195            return algorithmB(right, left)[leftSz];
196        }
197        return algorithmB(left, right)[rightSz];
198    }
199
200    /**
201     * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input.
202     *
203     * <p>
204     * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
205     * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
206     * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
207     * subsequence need not have adjacent characters.
208     * </p>
209     *
210     * <p>
211     * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that
212     * can be derived from another sequence by deleting some elements without changing the order of the remaining
213     * elements.
214     * </p>
215     *
216     * @param left first character sequence.
217     * @param right second character sequence.
218     * @return the longest common subsequence found.
219     * @throws IllegalArgumentException if either String input {@code null}.
220     * @deprecated Deprecated as of 1.2 due to a typo in the method name.
221     * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead.
222     * This method will be removed in 2.0.
223     */
224    @Deprecated
225    public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) {
226        return longestCommonSubsequence(left, right);
227    }
228
229    /**
230     * Computes the longest common subsequence between the two {@code CharSequence}'s passed as
231     * input.
232     *
233     * <p>
234     * This method implements a more efficient version of LCS algorithm which although has quadratic time, it
235     * has linear space complexity.
236     * </p>
237     *
238     *
239     * <p>
240     * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
241     * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
242     * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
243     * subsequence need not have adjacent characters.
244     * </p>
245     *
246     * <p>
247     * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that
248     * can be derived from another sequence by deleting some elements without changing the order of the remaining
249     * elements.
250     * </p>
251     *
252     * @param left first character sequence.
253     * @param right second character sequence.
254     * @return the longest common subsequence found.
255     * @throws IllegalArgumentException if either String input {@code null}.
256     * @since 1.2
257     */
258    public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) {
259        // Quick return
260        if (left == null || right == null) {
261            throw new IllegalArgumentException("Inputs must not be null");
262        }
263        // Find lengths of two strings
264        final int leftSz = left.length();
265        final int rightSz = right.length();
266
267        // Check if we can avoid calling algorithmC which involves heap space allocation
268        if (leftSz == 0 || rightSz == 0) {
269            return "";
270        }
271
272        // Check if we can save even more space
273        if (leftSz < rightSz) {
274            return algorithmC(right, left);
275        }
276        return algorithmC(left, right);
277    }
278
279    /**
280     * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the
281     * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being
282     * O(m*n), where m=left.length() and n=right.length().
283     *
284     * @param left first character sequence.
285     * @param right second character sequence.
286     * @return longest common substring length array.
287     * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available.
288     * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS.
289     * This method will be removed in 2.0.
290     */
291    @Deprecated
292    public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) {
293        final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1];
294        for (int i = 0; i < left.length(); i++) {
295            for (int j = 0; j < right.length(); j++) {
296                if (i == 0) {
297                    lcsLengthArray[i][j] = 0;
298                }
299                if (j == 0) {
300                    lcsLengthArray[i][j] = 0;
301                }
302                if (left.charAt(i) == right.charAt(j)) {
303                    lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
304                } else {
305                    lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]);
306                }
307            }
308        }
309        return lcsLengthArray;
310    }
311
312}