View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.sequence;
18  
19  import java.util.List;
20  
21  import org.apache.commons.collections4.Equator;
22  import org.apache.commons.collections4.functors.DefaultEquator;
23  
24  /**
25   * This class allows to compare two objects sequences.
26   * <p>
27   * The two sequences can hold any object type, as only the <code>equals</code>
28   * method is used to compare the elements of the sequences. It is guaranteed
29   * that the comparisons will always be done as <code>o1.equals(o2)</code> where
30   * <code>o1</code> belongs to the first sequence and <code>o2</code> belongs to
31   * the second sequence. This can be important if subclassing is used for some
32   * elements in the first sequence and the <code>equals</code> method is
33   * specialized.
34   * </p>
35   * <p>
36   * Comparison can be seen from two points of view: either as giving the smallest
37   * modification allowing to transform the first sequence into the second one, or
38   * as giving the longest sequence which is a subsequence of both initial
39   * sequences. The <code>equals</code> method is used to compare objects, so any
40   * object can be put into sequences. Modifications include deleting, inserting
41   * or keeping one object, starting from the beginning of the first sequence.
42   * </p>
43   * <p>
44   * This class implements the comparison algorithm, which is the very efficient
45   * algorithm from Eugene W. Myers
46   * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
47   * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
48   * the shortest possible
49   * {@link EditScript edit script}
50   * containing all the
51   * {@link EditCommand commands}
52   * needed to transform the first sequence into the second one.
53   * </p>
54   *
55   * @see EditScript
56   * @see EditCommand
57   * @see CommandVisitor
58   *
59   * @since 4.0
60   */
61  public class SequencesComparator<T> {
62  
63      /** First sequence. */
64      private final List<T> sequence1;
65  
66      /** Second sequence. */
67      private final List<T> sequence2;
68  
69      /** The equator used for testing object equality. */
70      private final Equator<? super T> equator;
71  
72      /** Temporary variables. */
73      private final int[] vDown;
74      private final int[] vUp;
75  
76      /**
77       * Simple constructor.
78       * <p>
79       * Creates a new instance of SequencesComparator using a {@link DefaultEquator}.
80       * <p>
81       * It is <em>guaranteed</em> that the comparisons will always be done as
82       * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
83       * sequence and <code>o2</code> belongs to the second sequence. This can be
84       * important if subclassing is used for some elements in the first sequence
85       * and the <code>equals</code> method is specialized.
86       *
87       * @param sequence1  first sequence to be compared
88       * @param sequence2  second sequence to be compared
89       */
90      public SequencesComparator(final List<T> sequence1, final List<T> sequence2) {
91          this(sequence1, sequence2, DefaultEquator.defaultEquator());
92      }
93  
94      /**
95       * Simple constructor.
96       * <p>
97       * Creates a new instance of SequencesComparator with a custom {@link Equator}.
98       * <p>
99       * It is <em>guaranteed</em> that the comparisons will always be done as
100      * <code>Equator.equate(o1, o2)</code> where <code>o1</code> belongs to the first
101      * sequence and <code>o2</code> belongs to the second sequence.
102      *
103      * @param sequence1  first sequence to be compared
104      * @param sequence2  second sequence to be compared
105      * @param equator  the equator to use for testing object equality
106      */
107     public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) {
108         this.sequence1 = sequence1;
109         this.sequence2 = sequence2;
110         this.equator = equator;
111 
112         final int size = sequence1.size() + sequence2.size() + 2;
113         vDown = new int[size];
114         vUp   = new int[size];
115     }
116 
117     /**
118      * Get the {@link EditScript} object.
119      * <p>
120      * It is guaranteed that the objects embedded in the {@link InsertCommand
121      * insert commands} come from the second sequence and that the objects
122      * embedded in either the {@link DeleteCommand delete commands} or
123      * {@link KeepCommand keep commands} come from the first sequence. This can
124      * be important if subclassing is used for some elements in the first
125      * sequence and the <code>equals</code> method is specialized.
126      *
127      * @return the edit script resulting from the comparison of the two
128      *         sequences
129      */
130     public EditScript<T> getScript() {
131         final EditScript<T> script = new EditScript<>();
132         buildScript(0, sequence1.size(), 0, sequence2.size(), script);
133         return script;
134     }
135 
136     /**
137      * Build a snake.
138      *
139      * @param start  the value of the start of the snake
140      * @param diag  the value of the diagonal of the snake
141      * @param end1  the value of the end of the first sequence to be compared
142      * @param end2  the value of the end of the second sequence to be compared
143      * @return the snake built
144      */
145     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
146         int end = start;
147         while (end - diag < end2
148                 && end < end1
149                 && equator.equate(sequence1.get(end), sequence2.get(end - diag))) {
150             ++end;
151         }
152         return new Snake(start, end, diag);
153     }
154 
155     /**
156      * Get the middle snake corresponding to two subsequences of the
157      * main sequences.
158      * <p>
159      * The snake is found using the MYERS Algorithm (this algorithms has
160      * also been implemented in the GNU diff program). This algorithm is
161      * explained in Eugene Myers article:
162      * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
163      * An O(ND) Difference Algorithm and Its Variations</a>.
164      *
165      * @param start1  the begin of the first sequence to be compared
166      * @param end1  the end of the first sequence to be compared
167      * @param start2  the begin of the second sequence to be compared
168      * @param end2  the end of the second sequence to be compared
169      * @return the middle snake
170      */
171     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
172         // Myers Algorithm
173         // Initialisations
174         final int m = end1 - start1;
175         final int n = end2 - start2;
176         if (m == 0 || n == 0) {
177             return null;
178         }
179 
180         final int delta  = m - n;
181         final int sum    = n + m;
182         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
183         vDown[1+offset] = start1;
184         vUp[1+offset]   = end1 + 1;
185 
186         for (int d = 0; d <= offset ; ++d) {
187             // Down
188             for (int k = -d; k <= d; k += 2) {
189                 // First step
190 
191                 final int i = k + offset;
192                 if (k == -d || k != d && vDown[i-1] < vDown[i+1]) {
193                     vDown[i] = vDown[i+1];
194                 } else {
195                     vDown[i] = vDown[i-1] + 1;
196                 }
197 
198                 int x = vDown[i];
199                 int y = x - start1 + start2 - k;
200 
201                 while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
202                     vDown[i] = ++x;
203                     ++y;
204                 }
205                 // Second step
206                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
207                     if (vUp[i-delta] <= vDown[i]) { // NOPMD
208                         return buildSnake(vUp[i-delta], k + start1 - start2, end1, end2);
209                     }
210                 }
211             }
212 
213             // Up
214             for (int k = delta - d; k <= delta + d; k += 2) {
215                 // First step
216                 final int i = k + offset - delta;
217                 if (k == delta - d
218                         || k != delta + d && vUp[i+1] <= vUp[i-1]) {
219                     vUp[i] = vUp[i+1] - 1;
220                 } else {
221                     vUp[i] = vUp[i-1];
222                 }
223 
224                 int x = vUp[i] - 1;
225                 int y = x - start1 + start2 - k;
226                 while (x >= start1 && y >= start2
227                         && equator.equate(sequence1.get(x), sequence2.get(y))) {
228                     vUp[i] = x--;
229                     y--;
230                 }
231                 // Second step
232                 if (delta % 2 == 0 && -d <= k && k <= d ) {
233                     if (vUp[i] <= vDown[i + delta]) { // NOPMD
234                         return buildSnake(vUp[i], k + start1 - start2, end1, end2);
235                     }
236                 }
237             }
238         }
239 
240         // this should not happen
241         throw new RuntimeException("Internal Error");
242     }
243 
244 
245     /**
246      * Build an edit script.
247      *
248      * @param start1  the begin of the first sequence to be compared
249      * @param end1  the end of the first sequence to be compared
250      * @param start2  the begin of the second sequence to be compared
251      * @param end2  the end of the second sequence to be compared
252      * @param script the edited script
253      */
254     private void buildScript(final int start1, final int end1, final int start2, final int end2,
255                              final EditScript<T> script) {
256 
257         final Snake middle = getMiddleSnake(start1, end1, start2, end2);
258 
259         if (middle == null
260                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
261                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
262 
263             int i = start1;
264             int j = start2;
265             while (i < end1 || j < end2) {
266                 if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) {
267                     script.append(new KeepCommand<>(sequence1.get(i)));
268                     ++i;
269                     ++j;
270                 } else {
271                     if (end1 - start1 > end2 - start2) {
272                         script.append(new DeleteCommand<>(sequence1.get(i)));
273                         ++i;
274                     } else {
275                         script.append(new InsertCommand<>(sequence2.get(j)));
276                         ++j;
277                     }
278                 }
279             }
280 
281         } else {
282 
283             buildScript(start1, middle.getStart(),
284                         start2, middle.getStart() - middle.getDiag(),
285                         script);
286             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
287                 script.append(new KeepCommand<>(sequence1.get(i)));
288             }
289             buildScript(middle.getEnd(), end1,
290                         middle.getEnd() - middle.getDiag(), end2,
291                         script);
292         }
293     }
294 
295     /**
296      * This class is a simple placeholder to hold the end part of a path
297      * under construction in a {@link SequencesComparator SequencesComparator}.
298      */
299     private static class Snake {
300 
301         /** Start index. */
302         private final int start;
303 
304         /** End index. */
305         private final int end;
306 
307         /** Diagonal number. */
308         private final int diag;
309 
310         /**
311          * Simple constructor. Creates a new instance of Snake with specified indices.
312          *
313          * @param start  start index of the snake
314          * @param end  end index of the snake
315          * @param diag  diagonal number
316          */
317         public Snake(final int start, final int end, final int diag) {
318             this.start = start;
319             this.end   = end;
320             this.diag  = diag;
321         }
322 
323         /**
324          * Get the start index of the snake.
325          *
326          * @return start index of the snake
327          */
328         public int getStart() {
329             return start;
330         }
331 
332         /**
333          * Get the end index of the snake.
334          *
335          * @return end index of the snake
336          */
337         public int getEnd() {
338             return end;
339         }
340 
341         /**
342          * Get the diagonal number of the snake.
343          *
344          * @return diagonal number of the snake
345          */
346         public int getDiag() {
347             return diag;
348         }
349     }
350 }