LongestCommonSubsequence.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. /**
  19.  * A similarity algorithm indicating the length of the longest common subsequence between two strings.
  20.  *
  21.  * <p>
  22.  * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in
  23.  * common. Two strings that are entirely different, return a value of 0, and two strings that return a value
  24.  * of the commonly shared length implies that the strings are completely the same in value and position.
  25.  * <em>Note.</em>  Generally this algorithm is fairly inefficient, as for length <em>m</em>, <em>n</em> of the input
  26.  * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the
  27.  * algorithm is <em>O(m*n)</em>.
  28.  * </p>
  29.  *
  30.  * <p>
  31.  * As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space
  32.  * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings.
  33.  * </p>
  34.  *
  35.  * <p>
  36.  * The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below).
  37.  * </p>
  38.  *
  39.  * <p>For further reading see:</p>
  40.  * <ul>
  41.  * <li>
  42.  * Lothaire, M. <em>Applied combinatorics on words</em>. New York: Cambridge U Press, 2005. <strong>12-13</strong>
  43.  * </li>
  44.  * <li>
  45.  * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343.
  46.  * </li>
  47.  * </ul>
  48.  *
  49.  * @since 1.0
  50.  */
  51. public class LongestCommonSubsequence implements SimilarityScore<Integer> {

  52.     /**
  53.      * Singleton instance.
  54.      */
  55.     static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence();

  56.     /**
  57.      * An implementation of "ALG B" from Hirschberg's CACM '71 paper.
  58.      * Assuming the first input sequence is of size {@code m} and the second input sequence is of size
  59.      * {@code n}, this method returns the last row of the dynamic programming (DP) table when calculating
  60.      * the LCS of the two sequences in <em>O(m*n)</em> time and <em>O(n)</em> space.
  61.      * The last element of the returned array, is the size of the LCS of the two input sequences.
  62.      *
  63.      * @param left first input sequence.
  64.      * @param right second input sequence.
  65.      * @return last row of the dynamic-programming (DP) table for calculating the LCS of {@code left} and {@code right}
  66.      * @since 1.10.0
  67.      */
  68.     private static int[] algorithmB(final CharSequence left, final CharSequence right) {
  69.         final int m = left.length();
  70.         final int n = right.length();
  71.         // Creating an array for storing two rows of DP table
  72.         final int[][] dpRows = new int[2][1 + n];
  73.         for (int i = 1; i <= m; i++) {
  74.             // K(0, j) <- K(1, j) [j = 0...n], as per the paper:
  75.             // Since we have references in Java, we can swap references instead of literal copying.
  76.             // We could also use a "binary index" using modulus operator, but directly swapping the
  77.             // two rows helps readability and keeps the code consistent with the algorithm description
  78.             // in the paper.
  79.             final int[] temp = dpRows[0];
  80.             dpRows[0] = dpRows[1];
  81.             dpRows[1] = temp;

  82.             for (int j = 1; j <= n; j++) {
  83.                 if (left.charAt(i - 1) == right.charAt(j - 1)) {
  84.                     dpRows[1][j] = dpRows[0][j - 1] + 1;
  85.                 } else {
  86.                     dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]);
  87.                 }
  88.             }
  89.         }
  90.         // LL(j) <- K(1, j) [j=0...n], as per the paper:
  91.         // We don't need literal copying of the array, we can just return the reference
  92.         return dpRows[1];
  93.     }

  94.     /**
  95.      * An implementation of "ALG C" from Hirschberg's CACM '71 paper.
  96.      * Assuming the first input sequence is of size {@code m} and the second input sequence is of size
  97.      * {@code n}, this method returns the Longest Common Subsequence (LCS) of the two sequences in
  98.      * <em>O(m*n)</em> time and <em>O(m+n)</em> space.
  99.      *
  100.      * @param left first input sequence.
  101.      * @param right second input sequence.
  102.      * @return the LCS of {@code left} and {@code right}
  103.      * @since 1.10.0
  104.      */
  105.     private static String algorithmC(final CharSequence left, final CharSequence right) {
  106.         final int m = left.length();
  107.         final int n = right.length();
  108.         final StringBuilder out = new StringBuilder();
  109.         if (m == 1) { // Handle trivial cases, as per the paper
  110.             final char leftCh = left.charAt(0);
  111.             for (int j = 0; j < n; j++) {
  112.                 if (leftCh == right.charAt(j)) {
  113.                     out.append(leftCh);
  114.                     break;
  115.                 }
  116.             }
  117.         } else if (n > 0 && m > 1) {
  118.             final int mid = m / 2; // Find the middle point
  119.             final CharSequence leftFirstPart = left.subSequence(0, mid);
  120.             final CharSequence leftSecondPart = left.subSequence(mid, m);
  121.             // Step 3 of the algorithm: two calls to Algorithm B
  122.             final int[] l1 = algorithmB(leftFirstPart, right);
  123.             final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right));
  124.             // Find k, as per the Step 4 of the algorithm
  125.             int k = 0;
  126.             int t = 0;
  127.             for (int j = 0; j <= n; j++) {
  128.                 final int s = l1[j] + l2[n - j];
  129.                 if (t < s) {
  130.                     t = s;
  131.                     k = j;
  132.                 }
  133.             }
  134.             // Step 5: solve simpler problems, recursively
  135.             out.append(algorithmC(leftFirstPart, right.subSequence(0, k)));
  136.             out.append(algorithmC(leftSecondPart, right.subSequence(k, n)));
  137.         }

  138.         return out.toString();
  139.     }

  140.     // An auxiliary method for CharSequence reversal
  141.     private static String reverse(final CharSequence s) {
  142.         return new StringBuilder(s).reverse().toString();
  143.     }

  144.     /**
  145.      * Creates a new instance.
  146.      */
  147.     public LongestCommonSubsequence() {
  148.         // empty
  149.     }

  150.     /**
  151.      * Computes the longest common subsequence similarity score of two {@code CharSequence}'s passed as
  152.      * input.
  153.      *
  154.      * <p>
  155.      * This method implements a more efficient version of LCS algorithm which has quadratic time and
  156.      * linear space complexity.
  157.      * </p>
  158.      *
  159.      * <p>
  160.      * This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}.
  161.      * An evaluation using JMH revealed that this method is almost two times faster than its previous version.
  162.      * </p>
  163.      *
  164.      * @param left first character sequence
  165.      * @param right second character sequence
  166.      * @return length of the longest common subsequence of {@code left} and {@code right}
  167.      * @throws IllegalArgumentException if either String input {@code null}
  168.      */
  169.     @Override
  170.     public Integer apply(final CharSequence left, final CharSequence right) {
  171.         // Quick return for invalid inputs
  172.         if (left == null || right == null) {
  173.             throw new IllegalArgumentException("Inputs must not be null");
  174.         }
  175.         // Find lengths of two strings
  176.         final int leftSz = left.length();
  177.         final int rightSz = right.length();
  178.         // Check if we can avoid calling algorithmB which involves heap space allocation
  179.         if (leftSz == 0 || rightSz == 0) {
  180.             return 0;
  181.         }
  182.         // Check if we can save even more space
  183.         if (leftSz < rightSz) {
  184.             return algorithmB(right, left)[leftSz];
  185.         }
  186.         return algorithmB(left, right)[rightSz];
  187.     }

  188.     /**
  189.      * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input.
  190.      *
  191.      * <p>
  192.      * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
  193.      * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
  194.      * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
  195.      * subsequence need not have adjacent characters.
  196.      * </p>
  197.      *
  198.      * <p>
  199.      * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that
  200.      * can be derived from another sequence by deleting some elements without changing the order of the remaining
  201.      * elements.
  202.      * </p>
  203.      *
  204.      * @param left first character sequence
  205.      * @param right second character sequence
  206.      * @return the longest common subsequence found
  207.      * @throws IllegalArgumentException if either String input {@code null}
  208.      * @deprecated Deprecated as of 1.2 due to a typo in the method name.
  209.      * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead.
  210.      * This method will be removed in 2.0.
  211.      */
  212.     @Deprecated
  213.     public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) {
  214.         return longestCommonSubsequence(left, right);
  215.     }

  216.     /**
  217.      * Computes the longest common subsequence between the two {@code CharSequence}'s passed as
  218.      * input.
  219.      *
  220.      * <p>
  221.      * This method implements a more efficient version of LCS algorithm which although has quadratic time, it
  222.      * has linear space complexity.
  223.      * </p>
  224.      *
  225.      *
  226.      * <p>
  227.      * Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and
  228.      * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However,
  229.      * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a
  230.      * subsequence need not have adjacent characters.
  231.      * </p>
  232.      *
  233.      * <p>
  234.      * For reference, we give the definition of a subsequence for the reader: a <em>subsequence</em> is a sequence that
  235.      * can be derived from another sequence by deleting some elements without changing the order of the remaining
  236.      * elements.
  237.      * </p>
  238.      *
  239.      * @param left first character sequence
  240.      * @param right second character sequence
  241.      * @return the longest common subsequence found
  242.      * @throws IllegalArgumentException if either String input {@code null}
  243.      * @since 1.2
  244.      */
  245.     public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) {
  246.         // Quick return
  247.         if (left == null || right == null) {
  248.             throw new IllegalArgumentException("Inputs must not be null");
  249.         }
  250.         // Find lengths of two strings
  251.         final int leftSz = left.length();
  252.         final int rightSz = right.length();

  253.         // Check if we can avoid calling algorithmC which involves heap space allocation
  254.         if (leftSz == 0 || rightSz == 0) {
  255.             return "";
  256.         }

  257.         // Check if we can save even more space
  258.         if (leftSz < rightSz) {
  259.             return algorithmC(right, left);
  260.         }
  261.         return algorithmC(left, right);
  262.     }

  263.     /**
  264.      * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the
  265.      * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being
  266.      * O(m*n), where m=left.length() and n=right.length().
  267.      *
  268.      * @param left first character sequence
  269.      * @param right second character sequence
  270.      * @return lcsLengthArray
  271.      * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available.
  272.      * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS.
  273.      * This method will be removed in 2.0.
  274.      */
  275.     @Deprecated
  276.     public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) {
  277.         final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1];
  278.         for (int i = 0; i < left.length(); i++) {
  279.             for (int j = 0; j < right.length(); j++) {
  280.                 if (i == 0) {
  281.                     lcsLengthArray[i][j] = 0;
  282.                 }
  283.                 if (j == 0) {
  284.                     lcsLengthArray[i][j] = 0;
  285.                 }
  286.                 if (left.charAt(i) == right.charAt(j)) {
  287.                     lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
  288.                 } else {
  289.                     lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]);
  290.                 }
  291.             }
  292.         }
  293.         return lcsLengthArray;
  294.     }

  295. }