001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.diff; 018 019/** 020 * <p> 021 * It is guaranteed that the comparisons will always be done as 022 * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first 023 * sequence and <code>o2</code> belongs to the second sequence. This can 024 * be important if subclassing is used for some elements in the first 025 * sequence and the <code>equals</code> method is specialized. 026 * </p> 027 * <p> 028 * Comparison can be seen from two points of view: either as giving the smallest 029 * modification allowing to transform the first sequence into the second one, or 030 * as giving the longest sequence which is a subsequence of both initial 031 * sequences. The <code>equals</code> method is used to compare objects, so any 032 * object can be put into sequences. Modifications include deleting, inserting 033 * or keeping one object, starting from the beginning of the first sequence. 034 * </p> 035 * <p> 036 * This class implements the comparison algorithm, which is the very efficient 037 * algorithm from Eugene W. Myers 038 * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps"> 039 * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces 040 * the shortest possible {@link EditScript edit script} containing all the 041 * {@link EditCommand commands} needed to transform the first sequence into 042 * the second one. 043 * 044 * <p> 045 * This code has been adapted from Apache Commons Collections 4.0. 046 * </p> 047 * 048 * @see EditScript 049 * @see EditCommand 050 * @see CommandVisitor 051 * @since 1.0 052 */ 053public class StringsComparator { 054 055 /** 056 * First character sequence. 057 */ 058 private final String left; 059 /** 060 * Second character sequence. 061 */ 062 private final String right; 063 /** 064 * Temporary array. 065 */ 066 private final int[] vDown; 067 /** 068 * Temporary array. 069 */ 070 private final int[] vUp; 071 072 /** 073 * Simple constructor. 074 * <p> 075 * Creates a new instance of StringsComparator. 076 * </p> 077 * <p> 078 * It is <em>guaranteed</em> that the comparisons will always be done as 079 * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first 080 * sequence and <code>o2</code> belongs to the second sequence. This can be 081 * important if subclassing is used for some elements in the first sequence 082 * and the <code>equals</code> method is specialized. 083 * </p> 084 * 085 * @param left first character sequence to be compared 086 * @param right second character sequence to be compared 087 */ 088 public StringsComparator(final String left, final String right) { 089 this.left = left; 090 this.right = right; 091 092 final int size = left.length() + right.length() + 2; 093 vDown = new int[size]; 094 vUp = new int[size]; 095 } 096 097 /** 098 * Get the {@link EditScript} object. 099 * <p> 100 * It is guaranteed that the objects embedded in the {@link InsertCommand 101 * insert commands} come from the second sequence and that the objects 102 * embedded in either the {@link DeleteCommand delete commands} or 103 * {@link KeepCommand keep commands} come from the first sequence. This can 104 * be important if subclassing is used for some elements in the first 105 * sequence and the <code>equals</code> method is specialized. 106 * </p> 107 * 108 * @return the edit script resulting from the comparison of the two 109 * sequences 110 */ 111 public EditScript<Character> getScript() { 112 final EditScript<Character> script = new EditScript<>(); 113 buildScript(0, left.length(), 0, right.length(), script); 114 return script; 115 } 116 117 /** 118 * Build an edit script. 119 * 120 * @param start1 the begin of the first sequence to be compared 121 * @param end1 the end of the first sequence to be compared 122 * @param start2 the begin of the second sequence to be compared 123 * @param end2 the end of the second sequence to be compared 124 * @param script the edited script 125 */ 126 private void buildScript(final int start1, final int end1, final int start2, final int end2, 127 final EditScript<Character> script) { 128 final Snake middle = getMiddleSnake(start1, end1, start2, end2); 129 130 if (middle == null 131 || middle.getStart() == end1 && middle.getDiag() == end1 - end2 132 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) { 133 134 int i = start1; 135 int j = start2; 136 while (i < end1 || j < end2) { 137 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) { 138 script.append(new KeepCommand<>(left.charAt(i))); 139 ++i; 140 ++j; 141 } else { 142 if (end1 - start1 > end2 - start2) { 143 script.append(new DeleteCommand<>(left.charAt(i))); 144 ++i; 145 } else { 146 script.append(new InsertCommand<>(right.charAt(j))); 147 ++j; 148 } 149 } 150 } 151 152 } else { 153 154 buildScript(start1, middle.getStart(), 155 start2, middle.getStart() - middle.getDiag(), 156 script); 157 for (int i = middle.getStart(); i < middle.getEnd(); ++i) { 158 script.append(new KeepCommand<>(left.charAt(i))); 159 } 160 buildScript(middle.getEnd(), end1, 161 middle.getEnd() - middle.getDiag(), end2, 162 script); 163 } 164 } 165 166 /** 167 * Get the middle snake corresponding to two subsequences of the 168 * main sequences. 169 * <p> 170 * The snake is found using the MYERS Algorithm (this algorithms has 171 * also been implemented in the GNU diff program). This algorithm is 172 * explained in Eugene Myers article: 173 * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps"> 174 * An O(ND) Difference Algorithm and Its Variations</a>. 175 * </p> 176 * 177 * @param start1 the begin of the first sequence to be compared 178 * @param end1 the end of the first sequence to be compared 179 * @param start2 the begin of the second sequence to be compared 180 * @param end2 the end of the second sequence to be compared 181 * @return the middle snake 182 */ 183 private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) { 184 // Myers Algorithm 185 // Initialisations 186 final int m = end1 - start1; 187 final int n = end2 - start2; 188 if (m == 0 || n == 0) { 189 return null; 190 } 191 192 final int delta = m - n; 193 final int sum = n + m; 194 final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2; 195 vDown[1 + offset] = start1; 196 vUp[1 + offset] = end1 + 1; 197 198 for (int d = 0; d <= offset; ++d) { 199 // Down 200 for (int k = -d; k <= d; k += 2) { 201 // First step 202 203 final int i = k + offset; 204 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) { 205 vDown[i] = vDown[i + 1]; 206 } else { 207 vDown[i] = vDown[i - 1] + 1; 208 } 209 210 int x = vDown[i]; 211 int y = x - start1 + start2 - k; 212 213 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) { 214 vDown[i] = ++x; 215 ++y; 216 } 217 // Second step 218 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) { 219 if (vUp[i - delta] <= vDown[i]) { // NOPMD 220 return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2); 221 } 222 } 223 } 224 225 // Up 226 for (int k = delta - d; k <= delta + d; k += 2) { 227 // First step 228 final int i = k + offset - delta; 229 if (k == delta - d 230 || k != delta + d && vUp[i + 1] <= vUp[i - 1]) { 231 vUp[i] = vUp[i + 1] - 1; 232 } else { 233 vUp[i] = vUp[i - 1]; 234 } 235 236 int x = vUp[i] - 1; 237 int y = x - start1 + start2 - k; 238 while (x >= start1 && y >= start2 239 && left.charAt(x) == right.charAt(y)) { 240 vUp[i] = x--; 241 y--; 242 } 243 // Second step 244 if (delta % 2 == 0 && -d <= k && k <= d) { 245 if (vUp[i] <= vDown[i + delta]) { // NOPMD 246 return buildSnake(vUp[i], k + start1 - start2, end1, end2); 247 } 248 } 249 } 250 } 251 252 // this should not happen 253 throw new RuntimeException("Internal Error"); 254 } 255 256 /** 257 * Build a snake. 258 * 259 * @param start the value of the start of the snake 260 * @param diag the value of the diagonal of the snake 261 * @param end1 the value of the end of the first sequence to be compared 262 * @param end2 the value of the end of the second sequence to be compared 263 * @return the snake built 264 */ 265 private Snake buildSnake(final int start, final int diag, final int end1, final int end2) { 266 int end = start; 267 while (end - diag < end2 268 && end < end1 269 && left.charAt(end) == right.charAt(end - diag)) { 270 ++end; 271 } 272 return new Snake(start, end, diag); 273 } 274 275 /** 276 * This class is a simple placeholder to hold the end part of a path 277 * under construction in a {@link StringsComparator StringsComparator}. 278 */ 279 private static class Snake { 280 281 /** Start index. */ 282 private final int start; 283 284 /** End index. */ 285 private final int end; 286 287 /** Diagonal number. */ 288 private final int diag; 289 290 /** 291 * Simple constructor. Creates a new instance of Snake with specified indices. 292 * 293 * @param start start index of the snake 294 * @param end end index of the snake 295 * @param diag diagonal number 296 */ 297 public Snake(final int start, final int end, final int diag) { 298 this.start = start; 299 this.end = end; 300 this.diag = diag; 301 } 302 303 /** 304 * Get the start index of the snake. 305 * 306 * @return start index of the snake 307 */ 308 public int getStart() { 309 return start; 310 } 311 312 /** 313 * Get the end index of the snake. 314 * 315 * @return end index of the snake 316 */ 317 public int getEnd() { 318 return end; 319 } 320 321 /** 322 * Get the diagonal number of the snake. 323 * 324 * @return diagonal number of the snake 325 */ 326 public int getDiag() { 327 return diag; 328 } 329 } 330 331}