1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text.diff;
18
19 /**
20 * <p>
21 * It is guaranteed that the comparisons will always be done as
22 * {@code o1.equals(o2)} where {@code o1} belongs to the first
23 * sequence and {@code o2} belongs to the second sequence. This can
24 * be important if subclassing is used for some elements in the first
25 * sequence and the {@code equals} method is specialized.
26 * </p>
27 * <p>
28 * Comparison can be seen from two points of view: either as giving the smallest
29 * modification allowing to transform the first sequence into the second one, or
30 * as giving the longest sequence which is a subsequence of both initial
31 * sequences. The {@code equals} method is used to compare objects, so any
32 * object can be put into sequences. Modifications include deleting, inserting
33 * or keeping one object, starting from the beginning of the first sequence.
34 * </p>
35 * <p>
36 * This class implements the comparison algorithm, which is the very efficient
37 * algorithm from Eugene W. Myers
38 * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
39 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
40 * the shortest possible {@link EditScript edit script} containing all the
41 * {@link EditCommand commands} needed to transform the first sequence into
42 * the second one.
43 *
44 * <p>
45 * This code has been adapted from Apache Commons Collections 4.0.
46 * </p>
47 *
48 * @see EditScript
49 * @see EditCommand
50 * @see CommandVisitor
51 * @since 1.0
52 */
53 public class StringsComparator {
54
55 /**
56 * This class is a simple placeholder to hold the end part of a path
57 * under construction in a {@link StringsComparator StringsComparator}.
58 */
59 private static final class Snake {
60
61 /** Start index. */
62 private final int start;
63
64 /** End index. */
65 private final int end;
66
67 /** Diagonal number. */
68 private final int diag;
69
70 /**
71 * Constructs a new instance of Snake with specified indices.
72 *
73 * @param start start index of the snake
74 * @param end end index of the snake
75 * @param diag diagonal number
76 */
77 Snake(final int start, final int end, final int diag) {
78 this.start = start;
79 this.end = end;
80 this.diag = diag;
81 }
82
83 /**
84 * Gets the diagonal number of the snake.
85 *
86 * @return diagonal number of the snake
87 */
88 public int getDiag() {
89 return diag;
90 }
91
92 /**
93 * Gets the end index of the snake.
94 *
95 * @return end index of the snake
96 */
97 public int getEnd() {
98 return end;
99 }
100
101 /**
102 * Gets the start index of the snake.
103 *
104 * @return start index of the snake
105 */
106 public int getStart() {
107 return start;
108 }
109 }
110 /**
111 * First character sequence.
112 */
113 private final String left;
114 /**
115 * Second character sequence.
116 */
117 private final String right;
118 /**
119 * Temporary array.
120 */
121 private final int[] vDown;
122
123 /**
124 * Temporary array.
125 */
126 private final int[] vUp;
127
128 /**
129 * Constructs a new instance of StringsComparator.
130 * <p>
131 * It is <em>guaranteed</em> that the comparisons will always be done as
132 * {@code o1.equals(o2)} where {@code o1} belongs to the first
133 * sequence and {@code o2} belongs to the second sequence. This can be
134 * important if subclassing is used for some elements in the first sequence
135 * and the {@code equals} method is specialized.
136 * </p>
137 *
138 * @param left first character sequence to be compared
139 * @param right second character sequence to be compared
140 */
141 public StringsComparator(final String left, final String right) {
142 this.left = left;
143 this.right = right;
144
145 final int size = left.length() + right.length() + 2;
146 vDown = new int[size];
147 vUp = new int[size];
148 }
149
150 /**
151 * Builds an edit script.
152 *
153 * @param start1 the begin of the first sequence to be compared
154 * @param end1 the end of the first sequence to be compared
155 * @param start2 the begin of the second sequence to be compared
156 * @param end2 the end of the second sequence to be compared
157 * @param script the edited script
158 */
159 private void buildScript(final int start1, final int end1, final int start2, final int end2,
160 final EditScript<Character> script) {
161 final Snake middle = getMiddleSnake(start1, end1, start2, end2);
162
163 if (middle == null
164 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
165 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
166
167 int i = start1;
168 int j = start2;
169 while (i < end1 || j < end2) {
170 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
171 script.append(new KeepCommand<>(left.charAt(i)));
172 ++i;
173 ++j;
174 } else if (end1 - start1 > end2 - start2) {
175 script.append(new DeleteCommand<>(left.charAt(i)));
176 ++i;
177 } else {
178 script.append(new InsertCommand<>(right.charAt(j)));
179 ++j;
180 }
181 }
182
183 } else {
184
185 buildScript(start1, middle.getStart(),
186 start2, middle.getStart() - middle.getDiag(),
187 script);
188 for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
189 script.append(new KeepCommand<>(left.charAt(i)));
190 }
191 buildScript(middle.getEnd(), end1,
192 middle.getEnd() - middle.getDiag(), end2,
193 script);
194 }
195 }
196
197 /**
198 * Builds a snake.
199 *
200 * @param start the value of the start of the snake
201 * @param diag the value of the diagonal of the snake
202 * @param end1 the value of the end of the first sequence to be compared
203 * @param end2 the value of the end of the second sequence to be compared
204 * @return The snake built
205 */
206 private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
207 int end = start;
208 while (end - diag < end2
209 && end < end1
210 && left.charAt(end) == right.charAt(end - diag)) {
211 ++end;
212 }
213 return new Snake(start, end, diag);
214 }
215
216 /**
217 * Gets the middle snake corresponding to two subsequences of the
218 * main sequences.
219 * <p>
220 * The snake is found using the MYERS Algorithm (this algorithms has
221 * also been implemented in the GNU diff program). This algorithm is
222 * explained in Eugene Myers article:
223 * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
224 * An O(ND) Difference Algorithm and Its Variations</a>.
225 * </p>
226 *
227 * @param start1 the begin of the first sequence to be compared
228 * @param end1 the end of the first sequence to be compared
229 * @param start2 the begin of the second sequence to be compared
230 * @param end2 the end of the second sequence to be compared
231 * @return The middle snake
232 */
233 private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
234 // Myers Algorithm
235 // Initialisations
236 final int m = end1 - start1;
237 final int n = end2 - start2;
238 if (m == 0 || n == 0) {
239 return null;
240 }
241
242 final int delta = m - n;
243 final int sum = n + m;
244 final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
245 vDown[1 + offset] = start1;
246 vUp[1 + offset] = end1 + 1;
247
248 for (int d = 0; d <= offset; ++d) {
249 // Down
250 for (int k = -d; k <= d; k += 2) {
251 // First step
252
253 final int i = k + offset;
254 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
255 vDown[i] = vDown[i + 1];
256 } else {
257 vDown[i] = vDown[i - 1] + 1;
258 }
259
260 int x = vDown[i];
261 int y = x - start1 + start2 - k;
262
263 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
264 vDown[i] = ++x;
265 ++y;
266 }
267 // Second step
268 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
269 if (vUp[i - delta] <= vDown[i]) { // NOPMD
270 return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
271 }
272 }
273 }
274
275 // Up
276 for (int k = delta - d; k <= delta + d; k += 2) {
277 // First step
278 final int i = k + offset - delta;
279 if (k == delta - d
280 || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
281 vUp[i] = vUp[i + 1] - 1;
282 } else {
283 vUp[i] = vUp[i - 1];
284 }
285
286 int x = vUp[i] - 1;
287 int y = x - start1 + start2 - k;
288 while (x >= start1 && y >= start2
289 && left.charAt(x) == right.charAt(y)) {
290 vUp[i] = x--;
291 y--;
292 }
293 // Second step
294 if (delta % 2 == 0 && -d <= k && k <= d) {
295 if (vUp[i] <= vDown[i + delta]) { // NOPMD
296 return buildSnake(vUp[i], k + start1 - start2, end1, end2);
297 }
298 }
299 }
300 }
301
302 // this should not happen
303 throw new IllegalStateException("Internal Error");
304 }
305
306 /**
307 * Gets the {@link EditScript} object.
308 * <p>
309 * It is guaranteed that the objects embedded in the {@link InsertCommand
310 * insert commands} come from the second sequence and that the objects
311 * embedded in either the {@link DeleteCommand delete commands} or
312 * {@link KeepCommand keep commands} come from the first sequence. This can
313 * be important if subclassing is used for some elements in the first
314 * sequence and the {@code equals} method is specialized.
315 * </p>
316 *
317 * @return The edit script resulting from the comparison of the two
318 * sequences
319 */
320 public EditScript<Character> getScript() {
321 final EditScript<Character> script = new EditScript<>();
322 buildScript(0, left.length(), 0, right.length(), script);
323 return script;
324 }
325
326 }