StringsComparator.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.diff;

  18. /**
  19.  * <p>
  20.  * It is guaranteed that the comparisons will always be done as
  21.  * {@code o1.equals(o2)} where {@code o1} belongs to the first
  22.  * sequence and {@code o2} belongs to the second sequence. This can
  23.  * be important if subclassing is used for some elements in the first
  24.  * sequence and the {@code equals} method is specialized.
  25.  * </p>
  26.  * <p>
  27.  * Comparison can be seen from two points of view: either as giving the smallest
  28.  * modification allowing to transform the first sequence into the second one, or
  29.  * as giving the longest sequence which is a subsequence of both initial
  30.  * sequences. The {@code equals} method is used to compare objects, so any
  31.  * object can be put into sequences. Modifications include deleting, inserting
  32.  * or keeping one object, starting from the beginning of the first sequence.
  33.  * </p>
  34.  * <p>
  35.  * This class implements the comparison algorithm, which is the very efficient
  36.  * algorithm from Eugene W. Myers
  37.  * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
  38.  * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
  39.  * the shortest possible {@link EditScript edit script} containing all the
  40.  * {@link EditCommand commands} needed to transform the first sequence into
  41.  * the second one.
  42.  *
  43.  * <p>
  44.  * This code has been adapted from Apache Commons Collections 4.0.
  45.  * </p>
  46.  *
  47.  * @see EditScript
  48.  * @see EditCommand
  49.  * @see CommandVisitor
  50.  * @since 1.0
  51.  */
  52. public class StringsComparator {

  53.     /**
  54.      * This class is a simple placeholder to hold the end part of a path
  55.      * under construction in a {@link StringsComparator StringsComparator}.
  56.      */
  57.     private static final class Snake {

  58.         /** Start index. */
  59.         private final int start;

  60.         /** End index. */
  61.         private final int end;

  62.         /** Diagonal number. */
  63.         private final int diag;

  64.         /**
  65.          * Constructs a new instance of Snake with specified indices.
  66.          *
  67.          * @param start  start index of the snake
  68.          * @param end  end index of the snake
  69.          * @param diag  diagonal number
  70.          */
  71.         Snake(final int start, final int end, final int diag) {
  72.             this.start = start;
  73.             this.end   = end;
  74.             this.diag  = diag;
  75.         }

  76.         /**
  77.          * Gets the diagonal number of the snake.
  78.          *
  79.          * @return diagonal number of the snake
  80.          */
  81.         public int getDiag() {
  82.             return diag;
  83.         }

  84.         /**
  85.          * Gets the end index of the snake.
  86.          *
  87.          * @return end index of the snake
  88.          */
  89.         public int getEnd() {
  90.             return end;
  91.         }

  92.         /**
  93.          * Gets the start index of the snake.
  94.          *
  95.          * @return start index of the snake
  96.          */
  97.         public int getStart() {
  98.             return start;
  99.         }
  100.     }
  101.     /**
  102.      * First character sequence.
  103.      */
  104.     private final String left;
  105.     /**
  106.      * Second character sequence.
  107.      */
  108.     private final String right;
  109.     /**
  110.      * Temporary array.
  111.      */
  112.     private final int[] vDown;

  113.     /**
  114.      * Temporary array.
  115.      */
  116.     private final int[] vUp;

  117.     /**
  118.      * Constructs a new instance of StringsComparator.
  119.      * <p>
  120.      * It is <em>guaranteed</em> that the comparisons will always be done as
  121.      * {@code o1.equals(o2)} where {@code o1} belongs to the first
  122.      * sequence and {@code o2} belongs to the second sequence. This can be
  123.      * important if subclassing is used for some elements in the first sequence
  124.      * and the {@code equals} method is specialized.
  125.      * </p>
  126.      *
  127.      * @param left first character sequence to be compared
  128.      * @param right second character sequence to be compared
  129.      */
  130.     public StringsComparator(final String left, final String right) {
  131.         this.left = left;
  132.         this.right = right;

  133.         final int size = left.length() + right.length() + 2;
  134.         vDown = new int[size];
  135.         vUp   = new int[size];
  136.     }

  137.     /**
  138.      * Builds an edit script.
  139.      *
  140.      * @param start1  the begin of the first sequence to be compared
  141.      * @param end1  the end of the first sequence to be compared
  142.      * @param start2  the begin of the second sequence to be compared
  143.      * @param end2  the end of the second sequence to be compared
  144.      * @param script the edited script
  145.      */
  146.     private void buildScript(final int start1, final int end1, final int start2, final int end2,
  147.             final EditScript<Character> script) {
  148.         final Snake middle = getMiddleSnake(start1, end1, start2, end2);

  149.         if (middle == null
  150.                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
  151.                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {

  152.             int i = start1;
  153.             int j = start2;
  154.             while (i < end1 || j < end2) {
  155.                 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
  156.                     script.append(new KeepCommand<>(left.charAt(i)));
  157.                     ++i;
  158.                     ++j;
  159.                 } else if (end1 - start1 > end2 - start2) {
  160.                     script.append(new DeleteCommand<>(left.charAt(i)));
  161.                     ++i;
  162.                 } else {
  163.                     script.append(new InsertCommand<>(right.charAt(j)));
  164.                     ++j;
  165.                 }
  166.             }

  167.         } else {

  168.             buildScript(start1, middle.getStart(),
  169.                         start2, middle.getStart() - middle.getDiag(),
  170.                         script);
  171.             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
  172.                 script.append(new KeepCommand<>(left.charAt(i)));
  173.             }
  174.             buildScript(middle.getEnd(), end1,
  175.                         middle.getEnd() - middle.getDiag(), end2,
  176.                         script);
  177.         }
  178.     }

  179.     /**
  180.      * Builds a snake.
  181.      *
  182.      * @param start  the value of the start of the snake
  183.      * @param diag  the value of the diagonal of the snake
  184.      * @param end1  the value of the end of the first sequence to be compared
  185.      * @param end2  the value of the end of the second sequence to be compared
  186.      * @return The snake built
  187.      */
  188.     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
  189.         int end = start;
  190.         while (end - diag < end2
  191.                 && end < end1
  192.                 && left.charAt(end) == right.charAt(end - diag)) {
  193.             ++end;
  194.         }
  195.         return new Snake(start, end, diag);
  196.     }

  197.     /**
  198.      * Gets the middle snake corresponding to two subsequences of the
  199.      * main sequences.
  200.      * <p>
  201.      * The snake is found using the MYERS Algorithm (this algorithms has
  202.      * also been implemented in the GNU diff program). This algorithm is
  203.      * explained in Eugene Myers article:
  204.      * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
  205.      * An O(ND) Difference Algorithm and Its Variations</a>.
  206.      * </p>
  207.      *
  208.      * @param start1  the begin of the first sequence to be compared
  209.      * @param end1  the end of the first sequence to be compared
  210.      * @param start2  the begin of the second sequence to be compared
  211.      * @param end2  the end of the second sequence to be compared
  212.      * @return The middle snake
  213.      */
  214.     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
  215.         // Myers Algorithm
  216.         // Initialisations
  217.         final int m = end1 - start1;
  218.         final int n = end2 - start2;
  219.         if (m == 0 || n == 0) {
  220.             return null;
  221.         }

  222.         final int delta  = m - n;
  223.         final int sum    = n + m;
  224.         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
  225.         vDown[1 + offset] = start1;
  226.         vUp[1 + offset]   = end1 + 1;

  227.         for (int d = 0; d <= offset; ++d) {
  228.             // Down
  229.             for (int k = -d; k <= d; k += 2) {
  230.                 // First step

  231.                 final int i = k + offset;
  232.                 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
  233.                     vDown[i] = vDown[i + 1];
  234.                 } else {
  235.                     vDown[i] = vDown[i - 1] + 1;
  236.                 }

  237.                 int x = vDown[i];
  238.                 int y = x - start1 + start2 - k;

  239.                 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
  240.                     vDown[i] = ++x;
  241.                     ++y;
  242.                 }
  243.                 // Second step
  244.                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD
  245.                     return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
  246.                 }
  247.             }

  248.             // Up
  249.             for (int k = delta - d; k <= delta + d; k += 2) {
  250.                 // First step
  251.                 final int i = k + offset - delta;
  252.                 if (k == delta - d
  253.                         || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
  254.                     vUp[i] = vUp[i + 1] - 1;
  255.                 } else {
  256.                     vUp[i] = vUp[i - 1];
  257.                 }

  258.                 int x = vUp[i] - 1;
  259.                 int y = x - start1 + start2 - k;
  260.                 while (x >= start1 && y >= start2
  261.                         && left.charAt(x) == right.charAt(y)) {
  262.                     vUp[i] = x--;
  263.                     y--;
  264.                 }
  265.                 // Second step
  266.                 if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
  267.                     return buildSnake(vUp[i], k + start1 - start2, end1, end2);
  268.                 }
  269.             }
  270.         }

  271.         // this should not happen
  272.         throw new IllegalStateException("Internal Error");
  273.     }

  274.     /**
  275.      * Gets the {@link EditScript} object.
  276.      * <p>
  277.      * It is guaranteed that the objects embedded in the {@link InsertCommand
  278.      * insert commands} come from the second sequence and that the objects
  279.      * embedded in either the {@link DeleteCommand delete commands} or
  280.      * {@link KeepCommand keep commands} come from the first sequence. This can
  281.      * be important if subclassing is used for some elements in the first
  282.      * sequence and the {@code equals} method is specialized.
  283.      * </p>
  284.      *
  285.      * @return The edit script resulting from the comparison of the two
  286.      *         sequences
  287.      */
  288.     public EditScript<Character> getScript() {
  289.         final EditScript<Character> script = new EditScript<>();
  290.         buildScript(0, left.length(), 0, right.length(), script);
  291.         return script;
  292.     }

  293. }