View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.sequence;
18  
19  import java.util.List;
20  
21  import org.apache.commons.collections4.Equator;
22  import org.apache.commons.collections4.functors.DefaultEquator;
23  
24  /**
25   * This class allows to compare two objects sequences.
26   * <p>
27   * The two sequences can hold any object type, as only the {@code equals}
28   * method is used to compare the elements of the sequences. It is guaranteed
29   * that the comparisons will always be done as {@code o1.equals(o2)} where
30   * {@code o1} belongs to the first sequence and {@code o2} belongs to
31   * the second sequence. This can be important if subclassing is used for some
32   * elements in the first sequence and the {@code equals} method is
33   * specialized.
34   * </p>
35   * <p>
36   * Comparison can be seen from two points of view: either as giving the smallest
37   * modification allowing to transform the first sequence into the second one, or
38   * as giving the longest sequence which is a subsequence of both initial
39   * sequences. The {@code equals} method is used to compare objects, so any
40   * object can be put into sequences. Modifications include deleting, inserting
41   * or keeping one object, starting from the beginning of the first sequence.
42   * </p>
43   * <p>
44   * This class implements the comparison algorithm, which is the very efficient
45   * algorithm from Eugene W. Myers
46   * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
47   * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
48   * the shortest possible
49   * {@link EditScript edit script}
50   * containing all the
51   * {@link EditCommand commands}
52   * needed to transform the first sequence into the second one.
53   * </p>
54   *
55   * @see EditScript
56   * @see EditCommand
57   * @see CommandVisitor
58   *
59   * @since 4.0
60   */
61  public class SequencesComparator<T> {
62  
63      /**
64       * This class is a simple placeholder to hold the end part of a path
65       * under construction in a {@link SequencesComparator SequencesComparator}.
66       */
67      private static final class Snake {
68  
69          /** Start index. */
70          private final int start;
71  
72          /** End index. */
73          private final int end;
74  
75          /** Diagonal number. */
76          private final int diag;
77  
78          /**
79           * Simple constructor. Creates a new instance of Snake with specified indices.
80           *
81           * @param start  start index of the snake
82           * @param end  end index of the snake
83           * @param diag  diagonal number
84           */
85          Snake(final int start, final int end, final int diag) {
86              this.start = start;
87              this.end   = end;
88              this.diag  = diag;
89          }
90  
91          /**
92           * Gets the diagonal number of the snake.
93           *
94           * @return diagonal number of the snake
95           */
96          public int getDiag() {
97              return diag;
98          }
99  
100         /**
101          * Gets the end index of the snake.
102          *
103          * @return end index of the snake
104          */
105         public int getEnd() {
106             return end;
107         }
108 
109         /**
110          * Gets the start index of the snake.
111          *
112          * @return start index of the snake
113          */
114         public int getStart() {
115             return start;
116         }
117     }
118 
119     /** First sequence. */
120     private final List<T> sequence1;
121 
122     /** Second sequence. */
123     private final List<T> sequence2;
124 
125     /** The equator used for testing object equality. */
126     private final Equator<? super T> equator;
127     /** Temporary variables. */
128     private final int[] vDown;
129 
130     private final int[] vUp;
131 
132     /**
133      * Simple constructor.
134      * <p>
135      * Creates a new instance of SequencesComparator using a {@link DefaultEquator}.
136      * <p>
137      * It is <em>guaranteed</em> that the comparisons will always be done as
138      * {@code o1.equals(o2)} where {@code o1} belongs to the first
139      * sequence and {@code o2} belongs to the second sequence. This can be
140      * important if subclassing is used for some elements in the first sequence
141      * and the {@code equals} method is specialized.
142      *
143      * @param sequence1  first sequence to be compared
144      * @param sequence2  second sequence to be compared
145      */
146     public SequencesComparator(final List<T> sequence1, final List<T> sequence2) {
147         this(sequence1, sequence2, DefaultEquator.defaultEquator());
148     }
149 
150     /**
151      * Simple constructor.
152      * <p>
153      * Creates a new instance of SequencesComparator with a custom {@link Equator}.
154      * <p>
155      * It is <em>guaranteed</em> that the comparisons will always be done as
156      * {@code Equator.equate(o1, o2)} where {@code o1} belongs to the first
157      * sequence and {@code o2} belongs to the second sequence.
158      *
159      * @param sequence1  first sequence to be compared
160      * @param sequence2  second sequence to be compared
161      * @param equator  the equator to use for testing object equality
162      */
163     public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) {
164         this.sequence1 = sequence1;
165         this.sequence2 = sequence2;
166         this.equator = equator;
167 
168         final int size = sequence1.size() + sequence2.size() + 2;
169         vDown = new int[size];
170         vUp   = new int[size];
171     }
172 
173     /**
174      * Build an edit script.
175      *
176      * @param start1  the start of the first sequence to be compared
177      * @param end1  the end of the first sequence to be compared
178      * @param start2  the start of the second sequence to be compared
179      * @param end2  the end of the second sequence to be compared
180      * @param script the edited script
181      */
182     private void buildScript(final int start1, final int end1, final int start2, final int end2,
183                              final EditScript<T> script) {
184 
185         final Snake middle = getMiddleSnake(start1, end1, start2, end2);
186 
187         if (middle == null
188                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
189                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
190 
191             int i = start1;
192             int j = start2;
193             while (i < end1 || j < end2) {
194                 if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) {
195                     script.append(new KeepCommand<>(sequence1.get(i)));
196                     ++i;
197                     ++j;
198                 } else {
199                     if (end1 - start1 > end2 - start2) {
200                         script.append(new DeleteCommand<>(sequence1.get(i)));
201                         ++i;
202                     } else {
203                         script.append(new InsertCommand<>(sequence2.get(j)));
204                         ++j;
205                     }
206                 }
207             }
208 
209         } else {
210 
211             buildScript(start1, middle.getStart(),
212                         start2, middle.getStart() - middle.getDiag(),
213                         script);
214             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
215                 script.append(new KeepCommand<>(sequence1.get(i)));
216             }
217             buildScript(middle.getEnd(), end1,
218                         middle.getEnd() - middle.getDiag(), end2,
219                         script);
220         }
221     }
222 
223     /**
224      * Build a snake.
225      *
226      * @param start  the value of the start of the snake
227      * @param diag  the value of the diagonal of the snake
228      * @param end1  the value of the end of the first sequence to be compared
229      * @param end2  the value of the end of the second sequence to be compared
230      * @return the snake built
231      */
232     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
233         int end = start;
234         while (end - diag < end2
235                 && end < end1
236                 && equator.equate(sequence1.get(end), sequence2.get(end - diag))) {
237             ++end;
238         }
239         return new Snake(start, end, diag);
240     }
241 
242     /**
243      * Gets the middle snake corresponding to two subsequences of the
244      * main sequences.
245      * <p>
246      * The snake is found using the MYERS Algorithm (this algorithm has
247      * also been implemented in the GNU diff program). This algorithm is
248      * explained in Eugene Myers article:
249      * <a href="https://web.archive.org/web/20040719035900/http%3A//www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
250      * An O(ND) Difference Algorithm and Its Variations</a>.
251      *
252      * @param start1  the start of the first sequence to be compared
253      * @param end1  the end of the first sequence to be compared
254      * @param start2  the start of the second sequence to be compared
255      * @param end2  the end of the second sequence to be compared
256      * @return the middle snake
257      */
258     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
259         // Myers Algorithm
260         // Initialisations
261         final int m = end1 - start1;
262         final int n = end2 - start2;
263         if (m == 0 || n == 0) {
264             return null;
265         }
266 
267         final int delta  = m - n;
268         final int sum    = n + m;
269         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
270         vDown[1+offset] = start1;
271         vUp[1+offset]   = end1 + 1;
272 
273         for (int d = 0; d <= offset; ++d) {
274             // Down
275             for (int k = -d; k <= d; k += 2) {
276                 // First step
277 
278                 final int i = k + offset;
279                 if (k == -d || k != d && vDown[i-1] < vDown[i+1]) {
280                     vDown[i] = vDown[i+1];
281                 } else {
282                     vDown[i] = vDown[i-1] + 1;
283                 }
284 
285                 int x = vDown[i];
286                 int y = x - start1 + start2 - k;
287 
288                 while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
289                     vDown[i] = ++x;
290                     ++y;
291                 }
292                 // Second step
293                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i-delta] <= vDown[i]) { // NOPMD
294                     return buildSnake(vUp[i-delta], k + start1 - start2, end1, end2);
295                 }
296             }
297 
298             // Up
299             for (int k = delta - d; k <= delta + d; k += 2) {
300                 // First step
301                 final int i = k + offset - delta;
302                 if (k == delta - d
303                         || k != delta + d && vUp[i+1] <= vUp[i-1]) {
304                     vUp[i] = vUp[i+1] - 1;
305                 } else {
306                     vUp[i] = vUp[i-1];
307                 }
308 
309                 int x = vUp[i] - 1;
310                 int y = x - start1 + start2 - k;
311                 while (x >= start1 && y >= start2
312                         && equator.equate(sequence1.get(x), sequence2.get(y))) {
313                     vUp[i] = x--;
314                     y--;
315                 }
316                 // Second step
317                 if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
318                     return buildSnake(vUp[i], k + start1 - start2, end1, end2);
319                 }
320             }
321         }
322 
323         // this should not happen
324         throw new IllegalStateException("Internal Error");
325     }
326 
327     /**
328      * Gets the {@link EditScript} object.
329      * <p>
330      * It is guaranteed that the objects embedded in the {@link InsertCommand
331      * insert commands} come from the second sequence and that the objects
332      * embedded in either the {@link DeleteCommand delete commands} or
333      * {@link KeepCommand keep commands} come from the first sequence. This can
334      * be important if subclassing is used for some elements in the first
335      * sequence and the {@code equals} method is specialized.
336      *
337      * @return the edit script resulting from the comparison of the two
338      *         sequences
339      */
340     public EditScript<T> getScript() {
341         final EditScript<T> script = new EditScript<>();
342         buildScript(0, sequence1.size(), 0, sequence2.size(), script);
343         return script;
344     }
345 }