001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.sequence; 018 019import java.util.List; 020 021import org.apache.commons.collections4.Equator; 022import org.apache.commons.collections4.functors.DefaultEquator; 023 024/** 025 * This class allows to compare two objects sequences. 026 * <p> 027 * The two sequences can hold any object type, as only the {@code equals} 028 * method is used to compare the elements of the sequences. It is guaranteed 029 * that the comparisons will always be done as {@code o1.equals(o2)} where 030 * {@code o1} belongs to the first sequence and {@code o2} belongs to 031 * the second sequence. This can be important if subclassing is used for some 032 * elements in the first sequence and the {@code equals} method is 033 * specialized. 034 * </p> 035 * <p> 036 * Comparison can be seen from two points of view: either as giving the smallest 037 * modification allowing to transform the first sequence into the second one, or 038 * as giving the longest sequence which is a subsequence of both initial 039 * sequences. The {@code equals} method is used to compare objects, so any 040 * object can be put into sequences. Modifications include deleting, inserting 041 * or keeping one object, starting from the beginning of the first sequence. 042 * </p> 043 * <p> 044 * This class implements the comparison algorithm, which is the very efficient 045 * algorithm from Eugene W. Myers 046 * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps"> 047 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces 048 * the shortest possible 049 * {@link EditScript edit script} 050 * containing all the 051 * {@link EditCommand commands} 052 * needed to transform the first sequence into the second one. 053 * </p> 054 * 055 * @param <T> the type of elements in the lists. 056 * @see EditScript 057 * @see EditCommand 058 * @see CommandVisitor 059 * @since 4.0 060 */ 061public class SequencesComparator<T> { 062 063 /** 064 * This class is a simple placeholder to hold the end part of a path 065 * under construction in a {@link SequencesComparator SequencesComparator}. 066 */ 067 private static final class Snake { 068 069 /** Start index. */ 070 private final int start; 071 072 /** End index. */ 073 private final int end; 074 075 /** Diagonal number. */ 076 private final int diag; 077 078 /** 079 * Simple constructor. Creates a new instance of Snake with specified indices. 080 * 081 * @param start start index of the snake 082 * @param end end index of the snake 083 * @param diag diagonal number 084 */ 085 Snake(final int start, final int end, final int diag) { 086 this.start = start; 087 this.end = end; 088 this.diag = diag; 089 } 090 091 /** 092 * Gets the diagonal number of the snake. 093 * 094 * @return diagonal number of the snake 095 */ 096 public int getDiag() { 097 return diag; 098 } 099 100 /** 101 * Gets the end index of the snake. 102 * 103 * @return end index of the snake 104 */ 105 public int getEnd() { 106 return end; 107 } 108 109 /** 110 * Gets the start index of the snake. 111 * 112 * @return start index of the snake 113 */ 114 public int getStart() { 115 return start; 116 } 117 } 118 119 /** First sequence. */ 120 private final List<T> sequence1; 121 122 /** Second sequence. */ 123 private final List<T> sequence2; 124 125 /** The equator used for testing object equality. */ 126 private final Equator<? super T> equator; 127 /** Temporary variables. */ 128 private final int[] vDown; 129 130 private final int[] vUp; 131 132 /** 133 * Simple constructor. 134 * <p> 135 * Creates a new instance of SequencesComparator using a {@link DefaultEquator}. 136 * <p> 137 * It is <em>guaranteed</em> that the comparisons will always be done as 138 * {@code o1.equals(o2)} where {@code o1} belongs to the first 139 * sequence and {@code o2} belongs to the second sequence. This can be 140 * important if subclassing is used for some elements in the first sequence 141 * and the {@code equals} method is specialized. 142 * 143 * @param sequence1 first sequence to be compared 144 * @param sequence2 second sequence to be compared 145 */ 146 public SequencesComparator(final List<T> sequence1, final List<T> sequence2) { 147 this(sequence1, sequence2, DefaultEquator.defaultEquator()); 148 } 149 150 /** 151 * Simple constructor. 152 * <p> 153 * Creates a new instance of SequencesComparator with a custom {@link Equator}. 154 * <p> 155 * It is <em>guaranteed</em> that the comparisons will always be done as 156 * {@code Equator.equate(o1, o2)} where {@code o1} belongs to the first 157 * sequence and {@code o2} belongs to the second sequence. 158 * 159 * @param sequence1 first sequence to be compared 160 * @param sequence2 second sequence to be compared 161 * @param equator the equator to use for testing object equality 162 */ 163 public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) { 164 this.sequence1 = sequence1; 165 this.sequence2 = sequence2; 166 this.equator = equator; 167 168 final int size = sequence1.size() + sequence2.size() + 2; 169 vDown = new int[size]; 170 vUp = new int[size]; 171 } 172 173 /** 174 * Build an edit script. 175 * 176 * @param start1 the start of the first sequence to be compared 177 * @param end1 the end of the first sequence to be compared 178 * @param start2 the start of the second sequence to be compared 179 * @param end2 the end of the second sequence to be compared 180 * @param script the edited script 181 */ 182 private void buildScript(final int start1, final int end1, final int start2, final int end2, 183 final EditScript<T> script) { 184 185 final Snake middle = getMiddleSnake(start1, end1, start2, end2); 186 187 if (middle == null 188 || middle.getStart() == end1 && middle.getDiag() == end1 - end2 189 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) { 190 191 int i = start1; 192 int j = start2; 193 while (i < end1 || j < end2) { 194 if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) { 195 script.append(new KeepCommand<>(sequence1.get(i))); 196 ++i; 197 ++j; 198 } else if (end1 - start1 > end2 - start2) { 199 script.append(new DeleteCommand<>(sequence1.get(i))); 200 ++i; 201 } else { 202 script.append(new InsertCommand<>(sequence2.get(j))); 203 ++j; 204 } 205 } 206 207 } else { 208 209 buildScript(start1, middle.getStart(), 210 start2, middle.getStart() - middle.getDiag(), 211 script); 212 for (int i = middle.getStart(); i < middle.getEnd(); ++i) { 213 script.append(new KeepCommand<>(sequence1.get(i))); 214 } 215 buildScript(middle.getEnd(), end1, 216 middle.getEnd() - middle.getDiag(), end2, 217 script); 218 } 219 } 220 221 /** 222 * Build a snake. 223 * 224 * @param start the value of the start of the snake 225 * @param diag the value of the diagonal of the snake 226 * @param end1 the value of the end of the first sequence to be compared 227 * @param end2 the value of the end of the second sequence to be compared 228 * @return the snake built 229 */ 230 private Snake buildSnake(final int start, final int diag, final int end1, final int end2) { 231 int end = start; 232 while (end - diag < end2 233 && end < end1 234 && equator.equate(sequence1.get(end), sequence2.get(end - diag))) { 235 ++end; 236 } 237 return new Snake(start, end, diag); 238 } 239 240 /** 241 * Gets the middle snake corresponding to two subsequences of the 242 * main sequences. 243 * <p> 244 * The snake is found using the MYERS Algorithm (this algorithm has 245 * also been implemented in the GNU diff program). This algorithm is 246 * explained in Eugene Myers article: 247 * <a href="https://web.archive.org/web/20040719035900/http%3A//www.cs.arizona.edu/people/gene/PAPERS/diff.ps"> 248 * An O(ND) Difference Algorithm and Its Variations</a>. 249 * 250 * @param start1 the start of the first sequence to be compared 251 * @param end1 the end of the first sequence to be compared 252 * @param start2 the start of the second sequence to be compared 253 * @param end2 the end of the second sequence to be compared 254 * @return the middle snake 255 */ 256 private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) { 257 // Myers Algorithm 258 // Initialisations 259 final int m = end1 - start1; 260 final int n = end2 - start2; 261 if (m == 0 || n == 0) { 262 return null; 263 } 264 265 final int delta = m - n; 266 final int sum = n + m; 267 final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2; 268 vDown[1 + offset] = start1; 269 vUp[1 + offset] = end1 + 1; 270 271 for (int d = 0; d <= offset; ++d) { 272 // Down 273 for (int k = -d; k <= d; k += 2) { 274 // First step 275 276 final int i = k + offset; 277 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) { 278 vDown[i] = vDown[i + 1]; 279 } else { 280 vDown[i] = vDown[i - 1] + 1; 281 } 282 283 int x = vDown[i]; 284 int y = x - start1 + start2 - k; 285 286 while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) { 287 vDown[i] = ++x; 288 ++y; 289 } 290 // Second step 291 if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD 292 return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2); 293 } 294 } 295 296 // Up 297 for (int k = delta - d; k <= delta + d; k += 2) { 298 // First step 299 final int i = k + offset - delta; 300 if (k == delta - d || k != delta + d && vUp[i + 1] <= vUp[i - 1]) { 301 vUp[i] = vUp[i + 1] - 1; 302 } else { 303 vUp[i] = vUp[i - 1]; 304 } 305 306 int x = vUp[i] - 1; 307 int y = x - start1 + start2 - k; 308 while (x >= start1 && y >= start2 && equator.equate(sequence1.get(x), sequence2.get(y))) { 309 vUp[i] = x--; 310 y--; 311 } 312 // Second step 313 if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD 314 return buildSnake(vUp[i], k + start1 - start2, end1, end2); 315 } 316 } 317 } 318 319 // this should not happen 320 throw new IllegalStateException("Internal Error"); 321 } 322 323 /** 324 * Gets the {@link EditScript} object. 325 * <p> 326 * It is guaranteed that the objects embedded in the {@link InsertCommand 327 * insert commands} come from the second sequence and that the objects 328 * embedded in either the {@link DeleteCommand delete commands} or 329 * {@link KeepCommand keep commands} come from the first sequence. This can 330 * be important if subclassing is used for some elements in the first 331 * sequence and the {@code equals} method is specialized. 332 * 333 * @return the edit script resulting from the comparison of the two 334 * sequences 335 */ 336 public EditScript<T> getScript() { 337 final EditScript<T> script = new EditScript<>(); 338 buildScript(0, sequence1.size(), 0, sequence2.size(), script); 339 return script; 340 } 341}