001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.diff;
018
019/**
020 * <p>
021 * It is guaranteed that the comparisons will always be done as
022 * {@code o1.equals(o2)} where {@code o1} belongs to the first
023 * sequence and {@code o2} belongs to the second sequence. This can
024 * be important if subclassing is used for some elements in the first
025 * sequence and the {@code equals} method is specialized.
026 * </p>
027 * <p>
028 * Comparison can be seen from two points of view: either as giving the smallest
029 * modification allowing to transform the first sequence into the second one, or
030 * as giving the longest sequence which is a subsequence of both initial
031 * sequences. The {@code equals} method is used to compare objects, so any
032 * object can be put into sequences. Modifications include deleting, inserting
033 * or keeping one object, starting from the beginning of the first sequence.
034 * </p>
035 * <p>
036 * This class implements the comparison algorithm, which is the very efficient
037 * algorithm from Eugene W. Myers
038 * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
039 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
040 * the shortest possible {@link EditScript edit script} containing all the
041 * {@link EditCommand commands} needed to transform the first sequence into
042 * the second one.
043 *
044 * <p>
045 * This code has been adapted from Apache Commons Collections 4.0.
046 * </p>
047 *
048 * @see EditScript
049 * @see EditCommand
050 * @see CommandVisitor
051 * @since 1.0
052 */
053public class StringsComparator {
054
055    /**
056     * Holds the end part of a path under construction in a {@link StringsComparator StringsComparator}.
057     */
058    private static final class Snake {
059
060        /** Start index. */
061        private final int start;
062
063        /** End index. */
064        private final int end;
065
066        /** Diagonal number. */
067        private final int diag;
068
069        /**
070         * Constructs a new instance of Snake with specified indices.
071         *
072         * @param start  start index of the snake.
073         * @param end  end index of the snake.
074         * @param diag  diagonal number.
075         */
076        Snake(final int start, final int end, final int diag) {
077            this.start = start;
078            this.end   = end;
079            this.diag  = diag;
080        }
081
082        /**
083         * Gets the diagonal number of the snake.
084         *
085         * @return diagonal number of the snake.
086         */
087        public int getDiag() {
088            return diag;
089        }
090
091        /**
092         * Gets the end index of the snake.
093         *
094         * @return end index of the snake.
095         */
096        public int getEnd() {
097            return end;
098        }
099
100        /**
101         * Gets the start index of the snake.
102         *
103         * @return start index of the snake.
104         */
105        public int getStart() {
106            return start;
107        }
108    }
109    /**
110     * First character sequence.
111     */
112    private final String left;
113    /**
114     * Second character sequence.
115     */
116    private final String right;
117    /**
118     * Temporary array.
119     */
120    private final int[] vDown;
121
122    /**
123     * Temporary array.
124     */
125    private final int[] vUp;
126
127    /**
128     * Constructs a new instance of StringsComparator.
129     * <p>
130     * It is <em>guaranteed</em> that the comparisons will always be done as {@code o1.equals(o2)} where {@code o1} belongs to the first sequence and {@code o2}
131     * belongs to the second sequence. This can be important if subclassing is used for some elements in the first sequence and the {@code equals} method is
132     * specialized.
133     * </p>
134     *
135     * @param left  first character sequence to be compared.
136     * @param right second character sequence to be compared.
137     */
138    public StringsComparator(final String left, final String right) {
139        this.left = left;
140        this.right = right;
141
142        final int size = left.length() + right.length() + 2;
143        vDown = new int[size];
144        vUp   = new int[size];
145    }
146
147    /**
148     * Builds an edit script.
149     *
150     * @param start1 the begin of the first sequence to be compared.
151     * @param end1   the end of the first sequence to be compared.
152     * @param start2 the begin of the second sequence to be compared.
153     * @param end2   the end of the second sequence to be compared.
154     * @param script the edited script.
155     */
156    private void buildScript(final int start1, final int end1, final int start2, final int end2,
157            final EditScript<Character> script) {
158        final Snake middle = getMiddleSnake(start1, end1, start2, end2);
159
160        if (middle == null
161                || middle.getStart() == end1 && middle.getDiag() == end1 - end2
162                || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
163
164            int i = start1;
165            int j = start2;
166            while (i < end1 || j < end2) {
167                if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
168                    script.append(new KeepCommand<>(left.charAt(i)));
169                    ++i;
170                    ++j;
171                } else if (end1 - start1 > end2 - start2) {
172                    script.append(new DeleteCommand<>(left.charAt(i)));
173                    ++i;
174                } else {
175                    script.append(new InsertCommand<>(right.charAt(j)));
176                    ++j;
177                }
178            }
179
180        } else {
181
182            buildScript(start1, middle.getStart(),
183                        start2, middle.getStart() - middle.getDiag(),
184                        script);
185            for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
186                script.append(new KeepCommand<>(left.charAt(i)));
187            }
188            buildScript(middle.getEnd(), end1,
189                        middle.getEnd() - middle.getDiag(), end2,
190                        script);
191        }
192    }
193
194    /**
195     * Builds a snake.
196     *
197     * @param start the value of the start of the snake.
198     * @param diag  the value of the diagonal of the snake.
199     * @param end1  the value of the end of the first sequence to be compared.
200     * @param end2  the value of the end of the second sequence to be compared.
201     * @return The snake built.
202     */
203    private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
204        int end = start;
205        while (end - diag < end2
206                && end < end1
207                && left.charAt(end) == right.charAt(end - diag)) {
208            ++end;
209        }
210        return new Snake(start, end, diag);
211    }
212
213    /**
214     * Gets the middle snake corresponding to two subsequences of the main sequences.
215     * <p>
216     * The snake is found using the MYERS Algorithm (this algorithms has also been implemented in the GNU diff program). This algorithm is explained in Eugene
217     * Myers article: <a href="https://www.cs.arizona.edu/people/gene/PAPERS/diff.ps"> An O(ND) Difference Algorithm and Its Variations</a>.
218     * </p>
219     *
220     * @param start1 the begin of the first sequence to be compared.
221     * @param end1   the end of the first sequence to be compared.
222     * @param start2 the begin of the second sequence to be compared.
223     * @param end2   the end of the second sequence to be compared.
224     * @return The middle snake.
225     */
226    private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
227        // Myers Algorithm
228        // Initializations
229        final int m = end1 - start1;
230        final int n = end2 - start2;
231        if (m == 0 || n == 0) {
232            return null;
233        }
234
235        final int delta  = m - n;
236        final int sum    = n + m;
237        final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
238        vDown[1 + offset] = start1;
239        vUp[1 + offset]   = end1 + 1;
240
241        for (int d = 0; d <= offset; ++d) {
242            // Down
243            for (int k = -d; k <= d; k += 2) {
244                // First step
245
246                final int i = k + offset;
247                if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
248                    vDown[i] = vDown[i + 1];
249                } else {
250                    vDown[i] = vDown[i - 1] + 1;
251                }
252
253                int x = vDown[i];
254                int y = x - start1 + start2 - k;
255
256                while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
257                    vDown[i] = ++x;
258                    ++y;
259                }
260                // Second step
261                if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD
262                    return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
263                }
264            }
265
266            // Up
267            for (int k = delta - d; k <= delta + d; k += 2) {
268                // First step
269                final int i = k + offset - delta;
270                if (k == delta - d
271                        || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
272                    vUp[i] = vUp[i + 1] - 1;
273                } else {
274                    vUp[i] = vUp[i - 1];
275                }
276
277                int x = vUp[i] - 1;
278                int y = x - start1 + start2 - k;
279                while (x >= start1 && y >= start2
280                        && left.charAt(x) == right.charAt(y)) {
281                    vUp[i] = x--;
282                    y--;
283                }
284                // Second step
285                if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
286                    return buildSnake(vUp[i], k + start1 - start2, end1, end2);
287                }
288            }
289        }
290
291        // this should not happen
292        throw new IllegalStateException("Internal Error");
293    }
294
295    /**
296     * Gets the {@link EditScript} object.
297     * <p>
298     * It is guaranteed that the objects embedded in the {@link InsertCommand insert commands} come from the second sequence and that the objects embedded in
299     * either the {@link DeleteCommand delete commands} or {@link KeepCommand keep commands} come from the first sequence. This can be important if subclassing
300     * is used for some elements in the first sequence and the {@code equals} method is specialized.
301     * </p>
302     *
303     * @return The edit script resulting from the comparison of the two sequences.
304     */
305    public EditScript<Character> getScript() {
306        final EditScript<Character> script = new EditScript<>();
307        buildScript(0, left.length(), 0, right.length(), script);
308        return script;
309    }
310
311}