View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  /**
20   * A similarity algorithm indicating the length of the longest common subsequence between two strings.
21   *
22   * <p>
23   * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in
24   * common. Two strings that are entirely different, return a value of 0, and two strings that return a value
25   * of the commonly shared length implies that the strings are completely the same in value and position.
26   * <i>Note.</i>  Generally this algorithm is fairly inefficient, as for length <i>m</i>, <i>n</i> of the input
27   * <code>CharSequence</code>'s <code>left</code> and <code>right</code> respectively, the runtime of the
28   * algorithm is <i>O(m*n)</i>.
29   * </p>
30   *
31   * <p>
32   * This implementation is based on the Longest Commons Substring algorithm
33   * from <a href="https://en.wikipedia.org/wiki/Longest_common_subsequence_problem">
34   * https://en.wikipedia.org/wiki/Longest_common_subsequence_problem</a>.
35   * </p>
36   *
37   * <p>For further reading see:</p>
38   *
39   * <p>Lothaire, M. <i>Applied combinatorics on words</i>. New York: Cambridge U Press, 2005. <b>12-13</b></p>
40   *
41   * @since 1.0
42   */
43  public class LongestCommonSubsequence implements SimilarityScore<Integer> {
44  
45      /**
46       * Calculates longestCommonSubsequence similarity score of two <code>CharSequence</code>'s passed as
47       * input.
48       *
49       * @param left first character sequence
50       * @param right second character sequence
51       * @return longestCommonSubsequenceLength
52       * @throws IllegalArgumentException
53       *             if either String input {@code null}
54       */
55      @Override
56      public Integer apply(final CharSequence left, final CharSequence right) {
57          // Quick return for invalid inputs
58          if (left == null || right == null) {
59              throw new IllegalArgumentException("Inputs must not be null");
60          }
61          return logestCommonSubsequence(left, right).length();
62      }
63  
64      /**
65       *
66       * Computes the longestCommonSubsequence between the two <code>CharSequence</code>'s passed as
67       * input.
68       *
69       * <p>
70       * Note, a substring and
71       * subsequence are not necessarily the same thing. Indeed, <code>abcxyzqrs</code> and
72       * <code>xyzghfm</code> have both the same common substring and subsequence, namely <code>xyz</code>. However,
73       * <code>axbyczqrs</code> and <code>abcxyzqtv</code> have the longest common subsequence <code>xyzq</code> because a
74       * subsequence need not have adjacent characters.
75       * </p>
76       *
77       * <p>
78       * For reference, we give the definition of a subsequence for the reader: a <i>subsequence</i> is a sequence that can be
79       * derived from another sequence by deleting some elements without changing the order of the remaining elements.
80       * </p>
81       *
82       * @param left first character sequence
83       * @param right second character sequence
84       * @return lcsLengthArray
85       * @throws IllegalArgumentException
86       *             if either String input {@code null}
87       */
88      public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) {
89          // Quick return
90          if (left == null || right == null) {
91              throw new IllegalArgumentException("Inputs must not be null");
92          }
93          StringBuilder longestCommonSubstringArray = new StringBuilder(Math.max(left.length(), right.length()));
94          int[][] lcsLengthArray = longestCommonSubstringLengthArray(left, right);
95          int i = left.length() - 1;
96          int j = right.length() - 1;
97          int k = lcsLengthArray[left.length()][right.length()] - 1;
98          while (k >= 0) {
99              if (left.charAt(i) == right.charAt(j)) {
100                 longestCommonSubstringArray.append(left.charAt(i));
101                 i = i - 1;
102                 j = j - 1;
103                 k = k - 1;
104             } else if (lcsLengthArray[i + 1][j] < lcsLengthArray[i][j + 1]) {
105                 i = i - 1;
106             } else {
107                 j = j - 1;
108             }
109         }
110         return longestCommonSubstringArray.reverse().toString();
111     }
112 
113     /**
114      *
115      * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the
116      * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being
117      * O(m*n), where m=left.length() and n=right.length().
118      *
119      * @param left first character sequence
120      * @param right second character sequence
121      * @return lcsLengthArray
122      */
123     public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) {
124         int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1];
125         for (int i=0; i < left.length(); i++) {
126             for (int j=0; j < right.length(); j++) {
127                 if (i == 0) {
128                     lcsLengthArray[i][j] = 0;
129                 }
130                 if (j == 0) {
131                     lcsLengthArray[i][j] = 0;
132                 }
133                 if (left.charAt(i) == right.charAt(j)) {
134                     lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1;
135                 } else {
136                     lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]);
137                 }
138             }
139         }
140         return lcsLengthArray;
141     }
142 
143 }