LongestCommonSubsequence.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. /**
  19.  * A similarity algorithm indicating the length of the longest common subsequence between two strings.
  20.  *
  21.  * <p>
  22.  * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in
  23.  * common. Two strings that are entirely different, return a value of 0, and two strings that return a value
  24.  * of the commonly shared length implies that the strings are completely the same in value and position.
  25.  * <i>Note.</i>  Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input
  26.  * <code>CharSequence</code>'s <code>left</code> and <code>right</code> respectively, the runtime of the
  27.  * algorithm is <i>O(m*n)</i>.
  28.  * </p>
  29.  *
  30.  * <p>
  31.  * This implementation is based on the Longest Commons Substring algorithm
  32.  * from <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem">
  33.  * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>.
  34.  * </p>
  35.  *
  36.  * <p>For further reading see:</p>
  37.  *
  38.  * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p>
  39.  *
  40.  * @since 1.0
  41.  */
  42. public class LongestCommonSubsequence implements SimilarityScore<Integer> {

  43.     /**
  44.      * Calculates longestCommonSubsequence similarity score of two <code>CharSequence</code>'s passed as
  45.      * input.
  46.      *
  47.      * @param left first character sequence
  48.      * @param right second character sequence
  49.      * @return longestCommonSubsequenceLength
  50.      * @throws IllegalArgumentException
  51.      *             if either String input {@code null}
  52.      */
  53.     @Override
  54.     public Integer apply(final CharSequence left, final CharSequence right) {
  55.         // Quick return for invalid inputs
  56.         if (left == null || right == null) {
  57.             throw new IllegalArgumentException("Inputs must not be null");
  58.         }
  59.         return logestCommonSubsequence(left, right).length();
  60.     }

  61.     /**
  62.      *
  63.      * Computes the longestCommonSubsequence between the two <code>CharSequence</code>'s passed as
  64.      * input.
  65.      *
  66.      * <p>
  67.      * Note, a substring and
  68.      * subsequence are not necessarily the same thing. Indeed, <code>abcxyzqrs</code> and
  69.      * <code>xyzghfm</code> have both the same common substring and subsequence, namely <code>xyz</code>. However,
  70.      * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest common subsequence <code>xyzq</code> because a
  71.      * subsequence need not have adjacent characters.
  72.      * </p>
  73.      *
  74.      * <p>
  75.      * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that can be
  76.      * derived from another sequence by deleting some elements without changing the order of the remaining elements.
  77.      * </p>
  78.      *
  79.      * @param left first character sequence
  80.      * @param right second character sequence
  81.      * @return lcsLengthArray
  82.      * @throws IllegalArgumentException
  83.      *             if either String input {@code null}
  84.      */
  85.     public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) {
  86.         // Quick return
  87.         if (left == null || right == null) {
  88.             throw new IllegalArgumentException("Inputs must not be null");
  89.         }
  90.         StringBuilder longestCommonSubstringArray = new StringBuilder(Math.max(left.length(), right.length()));
  91.         int[][] lcsLengthArray = longestCommonSubstringLengthArray(left, right);
  92.         int i = left.length() - 1;
  93.         int j = right.length() - 1;
  94.         int k = lcsLengthArray[left.length()][right.length()] - 1;
  95.         while (k >= 0) {
  96.             if (left.charAt(i) == right.charAt(j)) {
  97.                 longestCommonSubstringArray.append(left.charAt(i));
  98.                 i = i - 1;
  99.                 j = j - 1;
  100.                 k = k - 1;
  101.             } else if (lcsLengthArray[i + 1][j] < lcsLengthArray[i][j + 1]) {
  102.                 i = i - 1;
  103.             } else {
  104.                 j = j - 1;
  105.             }
  106.         }
  107.         return longestCommonSubstringArray.reverse().toString();
  108.     }

  109.     /**
  110.      *
  111.      * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the
  112.      * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being
  113.      * O(m*n), where m=left.length() and n=right.length().
  114.      *
  115.      * @param left first character sequence
  116.      * @param right second character sequence
  117.      * @return lcsLengthArray
  118.      */
  119.     public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) {
  120.         int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1];
  121.         for (int i=0; i < left.length(); i++) {
  122.             for (int j=0; j < right.length(); j++) {
  123.                 if (i == 0) {
  124.                     lcsLengthArray[i][j] = 0;
  125.                 }
  126.                 if (j == 0) {
  127.                     lcsLengthArray[i][j] = 0;
  128.                 }
  129.                 if (left.charAt(i) == right.charAt(j)) {
  130.                     lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
  131.                 } else {
  132.                     lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]);
  133.                 }
  134.             }
  135.         }
  136.         return lcsLengthArray;
  137.     }

  138. }