001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.diff;
018
019/**
020 * <p>
021 * It is guaranteed that the comparisons will always be done as
022 * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
023 * sequence and <code>o2</code> belongs to the second sequence. This can
024 * be important if subclassing is used for some elements in the first
025 * sequence and the <code>equals</code> method is specialized.
026 * </p>
027 * <p>
028 * Comparison can be seen from two points of view: either as giving the smallest
029 * modification allowing to transform the first sequence into the second one, or
030 * as giving the longest sequence which is a subsequence of both initial
031 * sequences. The <code>equals</code> method is used to compare objects, so any
032 * object can be put into sequences. Modifications include deleting, inserting
033 * or keeping one object, starting from the beginning of the first sequence.
034 * </p>
035 * <p>
036 * This class implements the comparison algorithm, which is the very efficient
037 * algorithm from Eugene W. Myers
038 * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
039 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
040 * the shortest possible {@link EditScript edit script} containing all the
041 * {@link EditCommand commands} needed to transform the first sequence into
042 * the second one.
043 *
044 * <p>
045 * This code has been adapted from Apache Commons Collections 4.0.
046 * </p>
047 *
048 * @see EditScript
049 * @see EditCommand
050 * @see CommandVisitor
051 * @since 1.0
052 */
053public class StringsComparator {
054
055    /**
056     * First character sequence.
057     */
058    private final String left;
059    /**
060     * Second character sequence.
061     */
062    private final String right;
063    /**
064     * Temporary array.
065     */
066    private final int[] vDown;
067    /**
068     * Temporary array.
069     */
070    private final int[] vUp;
071
072    /**
073     * Simple constructor.
074     * <p>
075     * Creates a new instance of StringsComparator.
076     * </p>
077     * <p>
078     * It is <em>guaranteed</em> that the comparisons will always be done as
079     * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
080     * sequence and <code>o2</code> belongs to the second sequence. This can be
081     * important if subclassing is used for some elements in the first sequence
082     * and the <code>equals</code> method is specialized.
083     * </p>
084     *
085     * @param left first character sequence to be compared
086     * @param right second character sequence to be compared
087     */
088    public StringsComparator(final String left, final String right) {
089        this.left = left;
090        this.right = right;
091
092        final int size = left.length() + right.length() + 2;
093        vDown = new int[size];
094        vUp   = new int[size];
095    }
096
097    /**
098     * Get the {@link EditScript} object.
099     * <p>
100     * It is guaranteed that the objects embedded in the {@link InsertCommand
101     * insert commands} come from the second sequence and that the objects
102     * embedded in either the {@link DeleteCommand delete commands} or
103     * {@link KeepCommand keep commands} come from the first sequence. This can
104     * be important if subclassing is used for some elements in the first
105     * sequence and the <code>equals</code> method is specialized.
106     * </p>
107     *
108     * @return the edit script resulting from the comparison of the two
109     *         sequences
110     */
111    public EditScript<Character> getScript() {
112        final EditScript<Character> script = new EditScript<>();
113        buildScript(0, left.length(), 0, right.length(), script);
114        return script;
115    }
116
117    /**
118     * Build an edit script.
119     *
120     * @param start1  the begin of the first sequence to be compared
121     * @param end1  the end of the first sequence to be compared
122     * @param start2  the begin of the second sequence to be compared
123     * @param end2  the end of the second sequence to be compared
124     * @param script the edited script
125     */
126    private void buildScript(final int start1, final int end1, final int start2, final int end2,
127            final EditScript<Character> script) {
128        final Snake middle = getMiddleSnake(start1, end1, start2, end2);
129
130        if (middle == null
131                || middle.getStart() == end1 && middle.getDiag() == end1 - end2
132                || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
133
134            int i = start1;
135            int j = start2;
136            while (i < end1 || j < end2) {
137                if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
138                    script.append(new KeepCommand<>(left.charAt(i)));
139                    ++i;
140                    ++j;
141                } else {
142                    if (end1 - start1 > end2 - start2) {
143                        script.append(new DeleteCommand<>(left.charAt(i)));
144                        ++i;
145                    } else {
146                        script.append(new InsertCommand<>(right.charAt(j)));
147                        ++j;
148                    }
149                }
150            }
151
152        } else {
153
154            buildScript(start1, middle.getStart(),
155                        start2, middle.getStart() - middle.getDiag(),
156                        script);
157            for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
158                script.append(new KeepCommand<>(left.charAt(i)));
159            }
160            buildScript(middle.getEnd(), end1,
161                        middle.getEnd() - middle.getDiag(), end2,
162                        script);
163        }
164    }
165
166    /**
167     * Get the middle snake corresponding to two subsequences of the
168     * main sequences.
169     * <p>
170     * The snake is found using the MYERS Algorithm (this algorithms has
171     * also been implemented in the GNU diff program). This algorithm is
172     * explained in Eugene Myers article:
173     * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
174     * An O(ND) Difference Algorithm and Its Variations</a>.
175     * </p>
176     *
177     * @param start1  the begin of the first sequence to be compared
178     * @param end1  the end of the first sequence to be compared
179     * @param start2  the begin of the second sequence to be compared
180     * @param end2  the end of the second sequence to be compared
181     * @return the middle snake
182     */
183    private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
184        // Myers Algorithm
185        // Initialisations
186        final int m = end1 - start1;
187        final int n = end2 - start2;
188        if (m == 0 || n == 0) {
189            return null;
190        }
191
192        final int delta  = m - n;
193        final int sum    = n + m;
194        final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
195        vDown[1 + offset] = start1;
196        vUp[1 + offset]   = end1 + 1;
197
198        for (int d = 0; d <= offset; ++d) {
199            // Down
200            for (int k = -d; k <= d; k += 2) {
201                // First step
202
203                final int i = k + offset;
204                if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
205                    vDown[i] = vDown[i + 1];
206                } else {
207                    vDown[i] = vDown[i - 1] + 1;
208                }
209
210                int x = vDown[i];
211                int y = x - start1 + start2 - k;
212
213                while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
214                    vDown[i] = ++x;
215                    ++y;
216                }
217                // Second step
218                if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
219                    if (vUp[i - delta] <= vDown[i]) { // NOPMD
220                        return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
221                    }
222                }
223            }
224
225            // Up
226            for (int k = delta - d; k <= delta + d; k += 2) {
227                // First step
228                final int i = k + offset - delta;
229                if (k == delta - d
230                        || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
231                    vUp[i] = vUp[i + 1] - 1;
232                } else {
233                    vUp[i] = vUp[i - 1];
234                }
235
236                int x = vUp[i] - 1;
237                int y = x - start1 + start2 - k;
238                while (x >= start1 && y >= start2
239                        && left.charAt(x) == right.charAt(y)) {
240                    vUp[i] = x--;
241                    y--;
242                }
243                // Second step
244                if (delta % 2 == 0 && -d <= k && k <= d) {
245                    if (vUp[i] <= vDown[i + delta]) { // NOPMD
246                        return buildSnake(vUp[i], k + start1 - start2, end1, end2);
247                    }
248                }
249            }
250        }
251
252        // this should not happen
253        throw new RuntimeException("Internal Error");
254    }
255
256    /**
257     * Build a snake.
258     *
259     * @param start  the value of the start of the snake
260     * @param diag  the value of the diagonal of the snake
261     * @param end1  the value of the end of the first sequence to be compared
262     * @param end2  the value of the end of the second sequence to be compared
263     * @return the snake built
264     */
265    private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
266        int end = start;
267        while (end - diag < end2
268                && end < end1
269                && left.charAt(end) == right.charAt(end - diag)) {
270            ++end;
271        }
272        return new Snake(start, end, diag);
273    }
274
275    /**
276     * This class is a simple placeholder to hold the end part of a path
277     * under construction in a {@link StringsComparator StringsComparator}.
278     */
279    private static class Snake {
280
281        /** Start index. */
282        private final int start;
283
284        /** End index. */
285        private final int end;
286
287        /** Diagonal number. */
288        private final int diag;
289
290        /**
291         * Simple constructor. Creates a new instance of Snake with specified indices.
292         *
293         * @param start  start index of the snake
294         * @param end  end index of the snake
295         * @param diag  diagonal number
296         */
297        public Snake(final int start, final int end, final int diag) {
298            this.start = start;
299            this.end   = end;
300            this.diag  = diag;
301        }
302
303        /**
304         * Get the start index of the snake.
305         *
306         * @return start index of the snake
307         */
308        public int getStart() {
309            return start;
310        }
311
312        /**
313         * Get the end index of the snake.
314         *
315         * @return end index of the snake
316         */
317        public int getEnd() {
318            return end;
319        }
320
321        /**
322         * Get the diagonal number of the snake.
323         *
324         * @return diagonal number of the snake
325         */
326        public int getDiag() {
327            return diag;
328        }
329    }
330
331}