1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text.diff;
18
19 /**
20 * <p>
21 * It is guaranteed that the comparisons will always be done as
22 * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
23 * sequence and <code>o2</code> belongs to the second sequence. This can
24 * be important if subclassing is used for some elements in the first
25 * sequence and the <code>equals</code> method is specialized.
26 * </p>
27 * <p>
28 * Comparison can be seen from two points of view: either as giving the smallest
29 * modification allowing to transform the first sequence into the second one, or
30 * as giving the longest sequence which is a subsequence of both initial
31 * sequences. The <code>equals</code> method is used to compare objects, so any
32 * object can be put into sequences. Modifications include deleting, inserting
33 * or keeping one object, starting from the beginning of the first sequence.
34 * </p>
35 * <p>
36 * This class implements the comparison algorithm, which is the very efficient
37 * algorithm from Eugene W. Myers
38 * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
39 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
40 * the shortest possible {@link EditScript edit script} containing all the
41 * {@link EditCommand commands} needed to transform the first sequence into
42 * the second one.
43 *
44 * <p>
45 * This code has been adapted from Apache Commons Collections 4.0.
46 * </p>
47 *
48 * @see EditScript
49 * @see EditCommand
50 * @see CommandVisitor
51 * @since 1.0
52 */
53 public class StringsComparator {
54
55 /**
56 * First character sequence.
57 */
58 private final String left;
59 /**
60 * Second character sequence.
61 */
62 private final String right;
63 /**
64 * Temporary array.
65 */
66 private final int[] vDown;
67 /**
68 * Temporary array.
69 */
70 private final int[] vUp;
71
72 /**
73 * Simple constructor.
74 * <p>
75 * Creates a new instance of StringsComparator.
76 * </p>
77 * <p>
78 * It is <em>guaranteed</em> that the comparisons will always be done as
79 * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
80 * sequence and <code>o2</code> belongs to the second sequence. This can be
81 * important if subclassing is used for some elements in the first sequence
82 * and the <code>equals</code> method is specialized.
83 * </p>
84 *
85 * @param left first character sequence to be compared
86 * @param right second character sequence to be compared
87 */
88 public StringsComparator(final String left, final String right) {
89 this.left = left;
90 this.right = right;
91
92 final int size = left.length() + right.length() + 2;
93 vDown = new int[size];
94 vUp = new int[size];
95 }
96
97 /**
98 * Get the {@link EditScript} object.
99 * <p>
100 * It is guaranteed that the objects embedded in the {@link InsertCommand
101 * insert commands} come from the second sequence and that the objects
102 * embedded in either the {@link DeleteCommand delete commands} or
103 * {@link KeepCommand keep commands} come from the first sequence. This can
104 * be important if subclassing is used for some elements in the first
105 * sequence and the <code>equals</code> method is specialized.
106 * </p>
107 *
108 * @return the edit script resulting from the comparison of the two
109 * sequences
110 */
111 public EditScript<Character> getScript() {
112 final EditScript<Character> script = new EditScript<>();
113 buildScript(0, left.length(), 0, right.length(), script);
114 return script;
115 }
116
117 /**
118 * Build an edit script.
119 *
120 * @param start1 the begin of the first sequence to be compared
121 * @param end1 the end of the first sequence to be compared
122 * @param start2 the begin of the second sequence to be compared
123 * @param end2 the end of the second sequence to be compared
124 * @param script the edited script
125 */
126 private void buildScript(final int start1, final int end1, final int start2, final int end2,
127 final EditScript<Character> script) {
128 final Snake middle = getMiddleSnake(start1, end1, start2, end2);
129
130 if (middle == null
131 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
132 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {
133
134 int i = start1;
135 int j = start2;
136 while (i < end1 || j < end2) {
137 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
138 script.append(new KeepCommand<>(left.charAt(i)));
139 ++i;
140 ++j;
141 } else {
142 if (end1 - start1 > end2 - start2) {
143 script.append(new DeleteCommand<>(left.charAt(i)));
144 ++i;
145 } else {
146 script.append(new InsertCommand<>(right.charAt(j)));
147 ++j;
148 }
149 }
150 }
151
152 } else {
153
154 buildScript(start1, middle.getStart(),
155 start2, middle.getStart() - middle.getDiag(),
156 script);
157 for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
158 script.append(new KeepCommand<>(left.charAt(i)));
159 }
160 buildScript(middle.getEnd(), end1,
161 middle.getEnd() - middle.getDiag(), end2,
162 script);
163 }
164 }
165
166 /**
167 * Get the middle snake corresponding to two subsequences of the
168 * main sequences.
169 * <p>
170 * The snake is found using the MYERS Algorithm (this algorithms has
171 * also been implemented in the GNU diff program). This algorithm is
172 * explained in Eugene Myers article:
173 * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
174 * An O(ND) Difference Algorithm and Its Variations</a>.
175 * </p>
176 *
177 * @param start1 the begin of the first sequence to be compared
178 * @param end1 the end of the first sequence to be compared
179 * @param start2 the begin of the second sequence to be compared
180 * @param end2 the end of the second sequence to be compared
181 * @return the middle snake
182 */
183 private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
184 // Myers Algorithm
185 // Initialisations
186 final int m = end1 - start1;
187 final int n = end2 - start2;
188 if (m == 0 || n == 0) {
189 return null;
190 }
191
192 final int delta = m - n;
193 final int sum = n + m;
194 final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
195 vDown[1 + offset] = start1;
196 vUp[1 + offset] = end1 + 1;
197
198 for (int d = 0; d <= offset; ++d) {
199 // Down
200 for (int k = -d; k <= d; k += 2) {
201 // First step
202
203 final int i = k + offset;
204 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
205 vDown[i] = vDown[i + 1];
206 } else {
207 vDown[i] = vDown[i - 1] + 1;
208 }
209
210 int x = vDown[i];
211 int y = x - start1 + start2 - k;
212
213 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
214 vDown[i] = ++x;
215 ++y;
216 }
217 // Second step
218 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
219 if (vUp[i - delta] <= vDown[i]) { // NOPMD
220 return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
221 }
222 }
223 }
224
225 // Up
226 for (int k = delta - d; k <= delta + d; k += 2) {
227 // First step
228 final int i = k + offset - delta;
229 if (k == delta - d
230 || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
231 vUp[i] = vUp[i + 1] - 1;
232 } else {
233 vUp[i] = vUp[i - 1];
234 }
235
236 int x = vUp[i] - 1;
237 int y = x - start1 + start2 - k;
238 while (x >= start1 && y >= start2
239 && left.charAt(x) == right.charAt(y)) {
240 vUp[i] = x--;
241 y--;
242 }
243 // Second step
244 if (delta % 2 == 0 && -d <= k && k <= d) {
245 if (vUp[i] <= vDown[i + delta]) { // NOPMD
246 return buildSnake(vUp[i], k + start1 - start2, end1, end2);
247 }
248 }
249 }
250 }
251
252 // this should not happen
253 throw new RuntimeException("Internal Error");
254 }
255
256 /**
257 * Build a snake.
258 *
259 * @param start the value of the start of the snake
260 * @param diag the value of the diagonal of the snake
261 * @param end1 the value of the end of the first sequence to be compared
262 * @param end2 the value of the end of the second sequence to be compared
263 * @return the snake built
264 */
265 private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
266 int end = start;
267 while (end - diag < end2
268 && end < end1
269 && left.charAt(end) == right.charAt(end - diag)) {
270 ++end;
271 }
272 return new Snake(start, end, diag);
273 }
274
275 /**
276 * This class is a simple placeholder to hold the end part of a path
277 * under construction in a {@link StringsComparator StringsComparator}.
278 */
279 private static class Snake {
280
281 /** Start index. */
282 private final int start;
283
284 /** End index. */
285 private final int end;
286
287 /** Diagonal number. */
288 private final int diag;
289
290 /**
291 * Simple constructor. Creates a new instance of Snake with specified indices.
292 *
293 * @param start start index of the snake
294 * @param end end index of the snake
295 * @param diag diagonal number
296 */
297 public Snake(final int start, final int end, final int diag) {
298 this.start = start;
299 this.end = end;
300 this.diag = diag;
301 }
302
303 /**
304 * Get the start index of the snake.
305 *
306 * @return start index of the snake
307 */
308 public int getStart() {
309 return start;
310 }
311
312 /**
313 * Get the end index of the snake.
314 *
315 * @return end index of the snake
316 */
317 public int getEnd() {
318 return end;
319 }
320
321 /**
322 * Get the diagonal number of the snake.
323 *
324 * @return diagonal number of the snake
325 */
326 public int getDiag() {
327 return diag;
328 }
329 }
330
331 }