View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.diff;
18  
19  /**
20   * <p>
21   * It is guaranteed that the comparisons will always be done as
22   * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
23   * sequence and <code>o2</code> belongs to the second sequence. This can
24   * be important if subclassing is used for some elements in the first
25   * sequence and the <code>equals</code> method is specialized.
26   * </p>
27   * <p>
28   * Comparison can be seen from two points of view: either as giving the smallest
29   * modification allowing to transform the first sequence into the second one, or
30   * as giving the longest sequence which is a subsequence of both initial
31   * sequences. The <code>equals</code> method is used to compare objects, so any
32   * object can be put into sequences. Modifications include deleting, inserting
33   * or keeping one object, starting from the beginning of the first sequence.
34   * </p>
35   * <p>
36   * This class implements the comparison algorithm, which is the very efficient
37   * algorithm from Eugene W. Myers
38   * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
39   * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
40   * the shortest possible {@link EditScript edit script} containing all the
41   * {@link EditCommand commands} needed to transform the first sequence into
42   * the second one.
43   *
44   * <p>
45   * This code has been adapted from Apache Commons Collections 4.0.
46   * </p>
47   *
48   * @see EditScript
49   * @see EditCommand
50   * @see CommandVisitor
51   * @since 1.0
52   */
53  public class StringsComparator {
54  
55      /**
56       * First character sequence.
57       */
58      private final String left;
59      /**
60       * Second character sequence.
61       */
62      private final String right;
63      /**
64       * Temporary array.
65       */
66      private final int[] vDown;
67      /**
68       * Temporary array.
69       */
70      private final int[] vUp;
71  
72      /**
73       * Simple constructor.
74       * <p>
75       * Creates a new instance of StringsComparator.
76       * </p>
77       * <p>
78       * It is <em>guaranteed</em> that the comparisons will always be done as
79       * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
80       * sequence and <code>o2</code> belongs to the second sequence. This can be
81       * important if subclassing is used for some elements in the first sequence
82       * and the <code>equals</code> method is specialized.
83       * </p>
84       *
85       * @param left first character sequence to be compared
86       * @param right second character sequence to be compared
87       */
88      public StringsComparator(final String left, final String right) {
89          this.left = left;
90          this.right = right;
91  
92          final int size = left.length() + right.length() + 2;
93          vDown = new int[size];
94          vUp   = new int[size];
95      }
96  
97      /**
98       * Get the {@link EditScript} object.
99       * <p>
100      * It is guaranteed that the objects embedded in the {@link InsertCommand
101      * insert commands} come from the second sequence and that the objects
102      * embedded in either the {@link DeleteCommand delete commands} or
103      * {@link KeepCommand keep commands} come from the first sequence. This can
104      * be important if subclassing is used for some elements in the first
105      * sequence and the <code>equals</code> method is specialized.
106      * </p>
107      *
108      * @return the edit script resulting from the comparison of the two
109      *         sequences
110      */
111     public EditScript<Character> getScript() {
112         final EditScript<Character> script = new EditScript<>();
113         buildScript(0, left.length(), 0, right.length(), script);
114         return script;
115     }
116 
117     /**
118      * Build an edit script.
119      *
120      * @param start1  the begin of the first sequence to be compared
121      * @param end1  the end of the first sequence to be compared
122      * @param start2  the begin of the second sequence to be compared
123      * @param end2  the end of the second sequence to be compared
124      * @param script the edited script
125      */
126     private void buildScript(final int start1, final int end1, final int start2, final int end2,
127             final EditScript<Character> script) {
128         final Snake middle = getMiddleSnake(start1, end1, start2, end2);
129 
130         if (middle == null
131                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
132                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
133 
134             int i = start1;
135             int j = start2;
136             while (i < end1 || j < end2) {
137                 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
138                     script.append(new KeepCommand<>(left.charAt(i)));
139                     ++i;
140                     ++j;
141                 } else {
142                     if (end1 - start1 > end2 - start2) {
143                         script.append(new DeleteCommand<>(left.charAt(i)));
144                         ++i;
145                     } else {
146                         script.append(new InsertCommand<>(right.charAt(j)));
147                         ++j;
148                     }
149                 }
150             }
151 
152         } else {
153 
154             buildScript(start1, middle.getStart(),
155                         start2, middle.getStart() - middle.getDiag(),
156                         script);
157             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
158                 script.append(new KeepCommand<>(left.charAt(i)));
159             }
160             buildScript(middle.getEnd(), end1,
161                         middle.getEnd() - middle.getDiag(), end2,
162                         script);
163         }
164     }
165 
166     /**
167      * Get the middle snake corresponding to two subsequences of the
168      * main sequences.
169      * <p>
170      * The snake is found using the MYERS Algorithm (this algorithms has
171      * also been implemented in the GNU diff program). This algorithm is
172      * explained in Eugene Myers article:
173      * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
174      * An O(ND) Difference Algorithm and Its Variations</a>.
175      * </p>
176      *
177      * @param start1  the begin of the first sequence to be compared
178      * @param end1  the end of the first sequence to be compared
179      * @param start2  the begin of the second sequence to be compared
180      * @param end2  the end of the second sequence to be compared
181      * @return the middle snake
182      */
183     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
184         // Myers Algorithm
185         // Initialisations
186         final int m = end1 - start1;
187         final int n = end2 - start2;
188         if (m == 0 || n == 0) {
189             return null;
190         }
191 
192         final int delta  = m - n;
193         final int sum    = n + m;
194         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
195         vDown[1 + offset] = start1;
196         vUp[1 + offset]   = end1 + 1;
197 
198         for (int d = 0; d <= offset; ++d) {
199             // Down
200             for (int k = -d; k <= d; k += 2) {
201                 // First step
202 
203                 final int i = k + offset;
204                 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
205                     vDown[i] = vDown[i + 1];
206                 } else {
207                     vDown[i] = vDown[i - 1] + 1;
208                 }
209 
210                 int x = vDown[i];
211                 int y = x - start1 + start2 - k;
212 
213                 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
214                     vDown[i] = ++x;
215                     ++y;
216                 }
217                 // Second step
218                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
219                     if (vUp[i - delta] <= vDown[i]) { // NOPMD
220                         return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
221                     }
222                 }
223             }
224 
225             // Up
226             for (int k = delta - d; k <= delta + d; k += 2) {
227                 // First step
228                 final int i = k + offset - delta;
229                 if (k == delta - d
230                         || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
231                     vUp[i] = vUp[i + 1] - 1;
232                 } else {
233                     vUp[i] = vUp[i - 1];
234                 }
235 
236                 int x = vUp[i] - 1;
237                 int y = x - start1 + start2 - k;
238                 while (x >= start1 && y >= start2
239                         && left.charAt(x) == right.charAt(y)) {
240                     vUp[i] = x--;
241                     y--;
242                 }
243                 // Second step
244                 if (delta % 2 == 0 && -d <= k && k <= d) {
245                     if (vUp[i] <= vDown[i + delta]) { // NOPMD
246                         return buildSnake(vUp[i], k + start1 - start2, end1, end2);
247                     }
248                 }
249             }
250         }
251 
252         // this should not happen
253         throw new RuntimeException("Internal Error");
254     }
255 
256     /**
257      * Build a snake.
258      *
259      * @param start  the value of the start of the snake
260      * @param diag  the value of the diagonal of the snake
261      * @param end1  the value of the end of the first sequence to be compared
262      * @param end2  the value of the end of the second sequence to be compared
263      * @return the snake built
264      */
265     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
266         int end = start;
267         while (end - diag < end2
268                 && end < end1
269                 && left.charAt(end) == right.charAt(end - diag)) {
270             ++end;
271         }
272         return new Snake(start, end, diag);
273     }
274 
275     /**
276      * This class is a simple placeholder to hold the end part of a path
277      * under construction in a {@link StringsComparator StringsComparator}.
278      */
279     private static class Snake {
280 
281         /** Start index. */
282         private final int start;
283 
284         /** End index. */
285         private final int end;
286 
287         /** Diagonal number. */
288         private final int diag;
289 
290         /**
291          * Simple constructor. Creates a new instance of Snake with specified indices.
292          *
293          * @param start  start index of the snake
294          * @param end  end index of the snake
295          * @param diag  diagonal number
296          */
297         Snake(final int start, final int end, final int diag) {
298             this.start = start;
299             this.end   = end;
300             this.diag  = diag;
301         }
302 
303         /**
304          * Get the start index of the snake.
305          *
306          * @return start index of the snake
307          */
308         public int getStart() {
309             return start;
310         }
311 
312         /**
313          * Get the end index of the snake.
314          *
315          * @return end index of the snake
316          */
317         public int getEnd() {
318             return end;
319         }
320 
321         /**
322          * Get the diagonal number of the snake.
323          *
324          * @return diagonal number of the snake
325          */
326         public int getDiag() {
327             return diag;
328         }
329     }
330 
331 }