001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.collections4.sequence;
018
019import java.util.List;
020
021import org.apache.commons.collections4.Equator;
022import org.apache.commons.collections4.functors.DefaultEquator;
023
024/**
025 * This class allows to compare two objects sequences.
026 * <p>
027 * The two sequences can hold any object type, as only the {@code equals}
028 * method is used to compare the elements of the sequences. It is guaranteed
029 * that the comparisons will always be done as {@code o1.equals(o2)} where
030 * {@code o1} belongs to the first sequence and {@code o2} belongs to
031 * the second sequence. This can be important if subclassing is used for some
032 * elements in the first sequence and the {@code equals} method is
033 * specialized.
034 * </p>
035 * <p>
036 * Comparison can be seen from two points of view: either as giving the smallest
037 * modification allowing to transform the first sequence into the second one, or
038 * as giving the longest sequence which is a subsequence of both initial
039 * sequences. The {@code equals} method is used to compare objects, so any
040 * object can be put into sequences. Modifications include deleting, inserting
041 * or keeping one object, starting from the beginning of the first sequence.
042 * </p>
043 * <p>
044 * This class implements the comparison algorithm, which is the very efficient
045 * algorithm from Eugene W. Myers
046 * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
047 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
048 * the shortest possible
049 * {@link EditScript edit script}
050 * containing all the
051 * {@link EditCommand commands}
052 * needed to transform the first sequence into the second one.
053 * </p>
054 *
055 * @param <T> the type of elements in the lists.
056 * @see EditScript
057 * @see EditCommand
058 * @see CommandVisitor
059 * @since 4.0
060 */
061public class SequencesComparator<T> {
062
063    /**
064     * This class is a simple placeholder to hold the end part of a path
065     * under construction in a {@link SequencesComparator SequencesComparator}.
066     */
067    private static final class Snake {
068
069        /** Start index. */
070        private final int start;
071
072        /** End index. */
073        private final int end;
074
075        /** Diagonal number. */
076        private final int diag;
077
078        /**
079         * Simple constructor. Creates a new instance of Snake with specified indices.
080         *
081         * @param start  start index of the snake
082         * @param end  end index of the snake
083         * @param diag  diagonal number
084         */
085        Snake(final int start, final int end, final int diag) {
086            this.start = start;
087            this.end   = end;
088            this.diag  = diag;
089        }
090
091        /**
092         * Gets the diagonal number of the snake.
093         *
094         * @return diagonal number of the snake
095         */
096        public int getDiag() {
097            return diag;
098        }
099
100        /**
101         * Gets the end index of the snake.
102         *
103         * @return end index of the snake
104         */
105        public int getEnd() {
106            return end;
107        }
108
109        /**
110         * Gets the start index of the snake.
111         *
112         * @return start index of the snake
113         */
114        public int getStart() {
115            return start;
116        }
117    }
118
119    /** First sequence. */
120    private final List<T> sequence1;
121
122    /** Second sequence. */
123    private final List<T> sequence2;
124
125    /** The equator used for testing object equality. */
126    private final Equator<? super T> equator;
127    /** Temporary variables. */
128    private final int[] vDown;
129
130    private final int[] vUp;
131
132    /**
133     * Simple constructor.
134     * <p>
135     * Creates a new instance of SequencesComparator using a {@link DefaultEquator}.
136     * <p>
137     * It is <em>guaranteed</em> that the comparisons will always be done as
138     * {@code o1.equals(o2)} where {@code o1} belongs to the first
139     * sequence and {@code o2} belongs to the second sequence. This can be
140     * important if subclassing is used for some elements in the first sequence
141     * and the {@code equals} method is specialized.
142     *
143     * @param sequence1  first sequence to be compared
144     * @param sequence2  second sequence to be compared
145     */
146    public SequencesComparator(final List<T> sequence1, final List<T> sequence2) {
147        this(sequence1, sequence2, DefaultEquator.defaultEquator());
148    }
149
150    /**
151     * Simple constructor.
152     * <p>
153     * Creates a new instance of SequencesComparator with a custom {@link Equator}.
154     * <p>
155     * It is <em>guaranteed</em> that the comparisons will always be done as
156     * {@code Equator.equate(o1, o2)} where {@code o1} belongs to the first
157     * sequence and {@code o2} belongs to the second sequence.
158     *
159     * @param sequence1  first sequence to be compared
160     * @param sequence2  second sequence to be compared
161     * @param equator  the equator to use for testing object equality
162     */
163    public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) {
164        this.sequence1 = sequence1;
165        this.sequence2 = sequence2;
166        this.equator = equator;
167
168        final int size = sequence1.size() + sequence2.size() + 2;
169        vDown = new int[size];
170        vUp   = new int[size];
171    }
172
173    /**
174     * Build an edit script.
175     *
176     * @param start1  the start of the first sequence to be compared
177     * @param end1  the end of the first sequence to be compared
178     * @param start2  the start of the second sequence to be compared
179     * @param end2  the end of the second sequence to be compared
180     * @param script the edited script
181     */
182    private void buildScript(final int start1, final int end1, final int start2, final int end2,
183                             final EditScript<T> script) {
184
185        final Snake middle = getMiddleSnake(start1, end1, start2, end2);
186
187        if (middle == null
188                || middle.getStart() == end1 && middle.getDiag() == end1 - end2
189                || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
190
191            int i = start1;
192            int j = start2;
193            while (i < end1 || j < end2) {
194                if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) {
195                    script.append(new KeepCommand<>(sequence1.get(i)));
196                    ++i;
197                    ++j;
198                } else if (end1 - start1 > end2 - start2) {
199                    script.append(new DeleteCommand<>(sequence1.get(i)));
200                    ++i;
201                } else {
202                    script.append(new InsertCommand<>(sequence2.get(j)));
203                    ++j;
204                }
205            }
206
207        } else {
208
209            buildScript(start1, middle.getStart(),
210                        start2, middle.getStart() - middle.getDiag(),
211                        script);
212            for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
213                script.append(new KeepCommand<>(sequence1.get(i)));
214            }
215            buildScript(middle.getEnd(), end1,
216                        middle.getEnd() - middle.getDiag(), end2,
217                        script);
218        }
219    }
220
221    /**
222     * Build a snake.
223     *
224     * @param start  the value of the start of the snake
225     * @param diag  the value of the diagonal of the snake
226     * @param end1  the value of the end of the first sequence to be compared
227     * @param end2  the value of the end of the second sequence to be compared
228     * @return the snake built
229     */
230    private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
231        int end = start;
232        while (end - diag < end2
233                && end < end1
234                && equator.equate(sequence1.get(end), sequence2.get(end - diag))) {
235            ++end;
236        }
237        return new Snake(start, end, diag);
238    }
239
240    /**
241     * Gets the middle snake corresponding to two subsequences of the
242     * main sequences.
243     * <p>
244     * The snake is found using the MYERS Algorithm (this algorithm has
245     * also been implemented in the GNU diff program). This algorithm is
246     * explained in Eugene Myers article:
247     * <a href="https://web.archive.org/web/20040719035900/http%3A//www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
248     * An O(ND) Difference Algorithm and Its Variations</a>.
249     *
250     * @param start1  the start of the first sequence to be compared
251     * @param end1  the end of the first sequence to be compared
252     * @param start2  the start of the second sequence to be compared
253     * @param end2  the end of the second sequence to be compared
254     * @return the middle snake
255     */
256    private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
257        // Myers Algorithm
258        // Initialisations
259        final int m = end1 - start1;
260        final int n = end2 - start2;
261        if (m == 0 || n == 0) {
262            return null;
263        }
264
265        final int delta = m - n;
266        final int sum = n + m;
267        final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
268        vDown[1 + offset] = start1;
269        vUp[1 + offset] = end1 + 1;
270
271        for (int d = 0; d <= offset; ++d) {
272            // Down
273            for (int k = -d; k <= d; k += 2) {
274                // First step
275
276                final int i = k + offset;
277                if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
278                    vDown[i] = vDown[i + 1];
279                } else {
280                    vDown[i] = vDown[i - 1] + 1;
281                }
282
283                int x = vDown[i];
284                int y = x - start1 + start2 - k;
285
286                while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
287                    vDown[i] = ++x;
288                    ++y;
289                }
290                // Second step
291                if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD
292                    return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
293                }
294            }
295
296            // Up
297            for (int k = delta - d; k <= delta + d; k += 2) {
298                // First step
299                final int i = k + offset - delta;
300                if (k == delta - d || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
301                    vUp[i] = vUp[i + 1] - 1;
302                } else {
303                    vUp[i] = vUp[i - 1];
304                }
305
306                int x = vUp[i] - 1;
307                int y = x - start1 + start2 - k;
308                while (x >= start1 && y >= start2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
309                    vUp[i] = x--;
310                    y--;
311                }
312                // Second step
313                if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
314                    return buildSnake(vUp[i], k + start1 - start2, end1, end2);
315                }
316            }
317        }
318
319        // this should not happen
320        throw new IllegalStateException("Internal Error");
321    }
322
323    /**
324     * Gets the {@link EditScript} object.
325     * <p>
326     * It is guaranteed that the objects embedded in the {@link InsertCommand
327     * insert commands} come from the second sequence and that the objects
328     * embedded in either the {@link DeleteCommand delete commands} or
329     * {@link KeepCommand keep commands} come from the first sequence. This can
330     * be important if subclassing is used for some elements in the first
331     * sequence and the {@code equals} method is specialized.
332     *
333     * @return the edit script resulting from the comparison of the two
334     *         sequences
335     */
336    public EditScript<T> getScript() {
337        final EditScript<T> script = new EditScript<>();
338        buildScript(0, sequence1.size(), 0, sequence2.size(), script);
339        return script;
340    }
341}