View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.diff;
18  
19  /**
20   * <p>
21   * It is guaranteed that the comparisons will always be done as
22   * {@code o1.equals(o2)} where {@code o1} belongs to the first
23   * sequence and {@code o2} belongs to the second sequence. This can
24   * be important if subclassing is used for some elements in the first
25   * sequence and the {@code equals} method is specialized.
26   * </p>
27   * <p>
28   * Comparison can be seen from two points of view: either as giving the smallest
29   * modification allowing to transform the first sequence into the second one, or
30   * as giving the longest sequence which is a subsequence of both initial
31   * sequences. The {@code equals} method is used to compare objects, so any
32   * object can be put into sequences. Modifications include deleting, inserting
33   * or keeping one object, starting from the beginning of the first sequence.
34   * </p>
35   * <p>
36   * This class implements the comparison algorithm, which is the very efficient
37   * algorithm from Eugene W. Myers
38   * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
39   * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
40   * the shortest possible {@link EditScript edit script} containing all the
41   * {@link EditCommand commands} needed to transform the first sequence into
42   * the second one.
43   *
44   * <p>
45   * This code has been adapted from Apache Commons Collections 4.0.
46   * </p>
47   *
48   * @see EditScript
49   * @see EditCommand
50   * @see CommandVisitor
51   * @since 1.0
52   */
53  public class StringsComparator {
54  
55      /**
56       * This class is a simple placeholder to hold the end part of a path
57       * under construction in a {@link StringsComparator StringsComparator}.
58       */
59      private static final class Snake {
60  
61          /** Start index. */
62          private final int start;
63  
64          /** End index. */
65          private final int end;
66  
67          /** Diagonal number. */
68          private final int diag;
69  
70          /**
71           * Constructs a new instance of Snake with specified indices.
72           *
73           * @param start  start index of the snake
74           * @param end  end index of the snake
75           * @param diag  diagonal number
76           */
77          Snake(final int start, final int end, final int diag) {
78              this.start = start;
79              this.end   = end;
80              this.diag  = diag;
81          }
82  
83          /**
84           * Gets the diagonal number of the snake.
85           *
86           * @return diagonal number of the snake
87           */
88          public int getDiag() {
89              return diag;
90          }
91  
92          /**
93           * Gets the end index of the snake.
94           *
95           * @return end index of the snake
96           */
97          public int getEnd() {
98              return end;
99          }
100 
101         /**
102          * Gets the start index of the snake.
103          *
104          * @return start index of the snake
105          */
106         public int getStart() {
107             return start;
108         }
109     }
110     /**
111      * First character sequence.
112      */
113     private final String left;
114     /**
115      * Second character sequence.
116      */
117     private final String right;
118     /**
119      * Temporary array.
120      */
121     private final int[] vDown;
122 
123     /**
124      * Temporary array.
125      */
126     private final int[] vUp;
127 
128     /**
129      * Constructs a new instance of StringsComparator.
130      * <p>
131      * It is <em>guaranteed</em> that the comparisons will always be done as
132      * {@code o1.equals(o2)} where {@code o1} belongs to the first
133      * sequence and {@code o2} belongs to the second sequence. This can be
134      * important if subclassing is used for some elements in the first sequence
135      * and the {@code equals} method is specialized.
136      * </p>
137      *
138      * @param left first character sequence to be compared
139      * @param right second character sequence to be compared
140      */
141     public StringsComparator(final String left, final String right) {
142         this.left = left;
143         this.right = right;
144 
145         final int size = left.length() + right.length() + 2;
146         vDown = new int[size];
147         vUp   = new int[size];
148     }
149 
150     /**
151      * Builds an edit script.
152      *
153      * @param start1  the begin of the first sequence to be compared
154      * @param end1  the end of the first sequence to be compared
155      * @param start2  the begin of the second sequence to be compared
156      * @param end2  the end of the second sequence to be compared
157      * @param script the edited script
158      */
159     private void buildScript(final int start1, final int end1, final int start2, final int end2,
160             final EditScript<Character> script) {
161         final Snake middle = getMiddleSnake(start1, end1, start2, end2);
162 
163         if (middle == null
164                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
165                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
166 
167             int i = start1;
168             int j = start2;
169             while (i < end1 || j < end2) {
170                 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
171                     script.append(new KeepCommand<>(left.charAt(i)));
172                     ++i;
173                     ++j;
174                 } else if (end1 - start1 > end2 - start2) {
175                     script.append(new DeleteCommand<>(left.charAt(i)));
176                     ++i;
177                 } else {
178                     script.append(new InsertCommand<>(right.charAt(j)));
179                     ++j;
180                 }
181             }
182 
183         } else {
184 
185             buildScript(start1, middle.getStart(),
186                         start2, middle.getStart() - middle.getDiag(),
187                         script);
188             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
189                 script.append(new KeepCommand<>(left.charAt(i)));
190             }
191             buildScript(middle.getEnd(), end1,
192                         middle.getEnd() - middle.getDiag(), end2,
193                         script);
194         }
195     }
196 
197     /**
198      * Builds a snake.
199      *
200      * @param start  the value of the start of the snake
201      * @param diag  the value of the diagonal of the snake
202      * @param end1  the value of the end of the first sequence to be compared
203      * @param end2  the value of the end of the second sequence to be compared
204      * @return The snake built
205      */
206     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
207         int end = start;
208         while (end - diag < end2
209                 && end < end1
210                 && left.charAt(end) == right.charAt(end - diag)) {
211             ++end;
212         }
213         return new Snake(start, end, diag);
214     }
215 
216     /**
217      * Gets the middle snake corresponding to two subsequences of the
218      * main sequences.
219      * <p>
220      * The snake is found using the MYERS Algorithm (this algorithms has
221      * also been implemented in the GNU diff program). This algorithm is
222      * explained in Eugene Myers article:
223      * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
224      * An O(ND) Difference Algorithm and Its Variations</a>.
225      * </p>
226      *
227      * @param start1  the begin of the first sequence to be compared
228      * @param end1  the end of the first sequence to be compared
229      * @param start2  the begin of the second sequence to be compared
230      * @param end2  the end of the second sequence to be compared
231      * @return The middle snake
232      */
233     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
234         // Myers Algorithm
235         // Initialisations
236         final int m = end1 - start1;
237         final int n = end2 - start2;
238         if (m == 0 || n == 0) {
239             return null;
240         }
241 
242         final int delta  = m - n;
243         final int sum    = n + m;
244         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
245         vDown[1 + offset] = start1;
246         vUp[1 + offset]   = end1 + 1;
247 
248         for (int d = 0; d <= offset; ++d) {
249             // Down
250             for (int k = -d; k <= d; k += 2) {
251                 // First step
252 
253                 final int i = k + offset;
254                 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
255                     vDown[i] = vDown[i + 1];
256                 } else {
257                     vDown[i] = vDown[i - 1] + 1;
258                 }
259 
260                 int x = vDown[i];
261                 int y = x - start1 + start2 - k;
262 
263                 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
264                     vDown[i] = ++x;
265                     ++y;
266                 }
267                 // Second step
268                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
269                     if (vUp[i - delta] <= vDown[i]) { // NOPMD
270                         return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
271                     }
272                 }
273             }
274 
275             // Up
276             for (int k = delta - d; k <= delta + d; k += 2) {
277                 // First step
278                 final int i = k + offset - delta;
279                 if (k == delta - d
280                         || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
281                     vUp[i] = vUp[i + 1] - 1;
282                 } else {
283                     vUp[i] = vUp[i - 1];
284                 }
285 
286                 int x = vUp[i] - 1;
287                 int y = x - start1 + start2 - k;
288                 while (x >= start1 && y >= start2
289                         && left.charAt(x) == right.charAt(y)) {
290                     vUp[i] = x--;
291                     y--;
292                 }
293                 // Second step
294                 if (delta % 2 == 0 && -d <= k && k <= d) {
295                     if (vUp[i] <= vDown[i + delta]) { // NOPMD
296                         return buildSnake(vUp[i], k + start1 - start2, end1, end2);
297                     }
298                 }
299             }
300         }
301 
302         // this should not happen
303         throw new IllegalStateException("Internal Error");
304     }
305 
306     /**
307      * Gets the {@link EditScript} object.
308      * <p>
309      * It is guaranteed that the objects embedded in the {@link InsertCommand
310      * insert commands} come from the second sequence and that the objects
311      * embedded in either the {@link DeleteCommand delete commands} or
312      * {@link KeepCommand keep commands} come from the first sequence. This can
313      * be important if subclassing is used for some elements in the first
314      * sequence and the {@code equals} method is specialized.
315      * </p>
316      *
317      * @return The edit script resulting from the comparison of the two
318      *         sequences
319      */
320     public EditScript<Character> getScript() {
321         final EditScript<Character> script = new EditScript<>();
322         buildScript(0, left.length(), 0, right.length(), script);
323         return script;
324     }
325 
326 }