StringsComparator.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.diff;

  18. /**
  19.  * <p>
  20.  * It is guaranteed that the comparisons will always be done as
  21.  * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
  22.  * sequence and <code>o2</code> belongs to the second sequence. This can
  23.  * be important if subclassing is used for some elements in the first
  24.  * sequence and the <code>equals</code> method is specialized.
  25.  * </p>
  26.  * <p>
  27.  * Comparison can be seen from two points of view: either as giving the smallest
  28.  * modification allowing to transform the first sequence into the second one, or
  29.  * as giving the longest sequence which is a subsequence of both initial
  30.  * sequences. The <code>equals</code> method is used to compare objects, so any
  31.  * object can be put into sequences. Modifications include deleting, inserting
  32.  * or keeping one object, starting from the beginning of the first sequence.
  33.  * </p>
  34.  * <p>
  35.  * This class implements the comparison algorithm, which is the very efficient
  36.  * algorithm from Eugene W. Myers
  37.  * <a href="http://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
  38.  * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
  39.  * the shortest possible {@link EditScript edit script} containing all the
  40.  * {@link EditCommand commands} needed to transform the first sequence into
  41.  * the second one.
  42.  *
  43.  * <p>
  44.  * This code has been adapted from Apache Commons Collections 4.0.
  45.  * </p>
  46.  *
  47.  * @see EditScript
  48.  * @see EditCommand
  49.  * @see CommandVisitor
  50.  * @since 1.0
  51.  */
  52. public class StringsComparator {

  53.     /**
  54.      * First character sequence.
  55.      */
  56.     private final String left;
  57.     /**
  58.      * Second character sequence.
  59.      */
  60.     private final String right;
  61.     /**
  62.      * Temporary array.
  63.      */
  64.     private final int[] vDown;
  65.     /**
  66.      * Temporary array.
  67.      */
  68.     private final int[] vUp;

  69.     /**
  70.      * Simple constructor.
  71.      * <p>
  72.      * Creates a new instance of StringsComparator.
  73.      * </p>
  74.      * <p>
  75.      * It is <em>guaranteed</em> that the comparisons will always be done as
  76.      * <code>o1.equals(o2)</code> where <code>o1</code> belongs to the first
  77.      * sequence and <code>o2</code> belongs to the second sequence. This can be
  78.      * important if subclassing is used for some elements in the first sequence
  79.      * and the <code>equals</code> method is specialized.
  80.      * </p>
  81.      *
  82.      * @param left first character sequence to be compared
  83.      * @param right second character sequence to be compared
  84.      */
  85.     public StringsComparator(final String left, final String right) {
  86.         this.left = left;
  87.         this.right = right;

  88.         final int size = left.length() + right.length() + 2;
  89.         vDown = new int[size];
  90.         vUp   = new int[size];
  91.     }

  92.     /**
  93.      * Get the {@link EditScript} object.
  94.      * <p>
  95.      * It is guaranteed that the objects embedded in the {@link InsertCommand
  96.      * insert commands} come from the second sequence and that the objects
  97.      * embedded in either the {@link DeleteCommand delete commands} or
  98.      * {@link KeepCommand keep commands} come from the first sequence. This can
  99.      * be important if subclassing is used for some elements in the first
  100.      * sequence and the <code>equals</code> method is specialized.
  101.      * </p>
  102.      *
  103.      * @return the edit script resulting from the comparison of the two
  104.      *         sequences
  105.      */
  106.     public EditScript<Character> getScript() {
  107.         final EditScript<Character> script = new EditScript<>();
  108.         buildScript(0, left.length(), 0, right.length(), script);
  109.         return script;
  110.     }

  111.     /**
  112.      * Build an edit script.
  113.      *
  114.      * @param start1  the begin of the first sequence to be compared
  115.      * @param end1  the end of the first sequence to be compared
  116.      * @param start2  the begin of the second sequence to be compared
  117.      * @param end2  the end of the second sequence to be compared
  118.      * @param script the edited script
  119.      */
  120.     private void buildScript(final int start1, final int end1, final int start2, final int end2,
  121.             final EditScript<Character> script) {
  122.         final Snake middle = getMiddleSnake(start1, end1, start2, end2);

  123.         if (middle == null
  124.                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
  125.                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {

  126.             int i = start1;
  127.             int j = start2;
  128.             while (i < end1 || j < end2) {
  129.                 if (i < end1 && j < end2 && left.charAt(i) == right.charAt(j)) {
  130.                     script.append(new KeepCommand<>(left.charAt(i)));
  131.                     ++i;
  132.                     ++j;
  133.                 } else {
  134.                     if (end1 - start1 > end2 - start2) {
  135.                         script.append(new DeleteCommand<>(left.charAt(i)));
  136.                         ++i;
  137.                     } else {
  138.                         script.append(new InsertCommand<>(right.charAt(j)));
  139.                         ++j;
  140.                     }
  141.                 }
  142.             }

  143.         } else {

  144.             buildScript(start1, middle.getStart(),
  145.                         start2, middle.getStart() - middle.getDiag(),
  146.                         script);
  147.             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
  148.                 script.append(new KeepCommand<>(left.charAt(i)));
  149.             }
  150.             buildScript(middle.getEnd(), end1,
  151.                         middle.getEnd() - middle.getDiag(), end2,
  152.                         script);
  153.         }
  154.     }

  155.     /**
  156.      * Get the middle snake corresponding to two subsequences of the
  157.      * main sequences.
  158.      * <p>
  159.      * The snake is found using the MYERS Algorithm (this algorithms has
  160.      * also been implemented in the GNU diff program). This algorithm is
  161.      * explained in Eugene Myers article:
  162.      * <a href="http://www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
  163.      * An O(ND) Difference Algorithm and Its Variations</a>.
  164.      * </p>
  165.      *
  166.      * @param start1  the begin of the first sequence to be compared
  167.      * @param end1  the end of the first sequence to be compared
  168.      * @param start2  the begin of the second sequence to be compared
  169.      * @param end2  the end of the second sequence to be compared
  170.      * @return the middle snake
  171.      */
  172.     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
  173.         // Myers Algorithm
  174.         // Initialisations
  175.         final int m = end1 - start1;
  176.         final int n = end2 - start2;
  177.         if (m == 0 || n == 0) {
  178.             return null;
  179.         }

  180.         final int delta  = m - n;
  181.         final int sum    = n + m;
  182.         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
  183.         vDown[1 + offset] = start1;
  184.         vUp[1 + offset]   = end1 + 1;

  185.         for (int d = 0; d <= offset; ++d) {
  186.             // Down
  187.             for (int k = -d; k <= d; k += 2) {
  188.                 // First step

  189.                 final int i = k + offset;
  190.                 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
  191.                     vDown[i] = vDown[i + 1];
  192.                 } else {
  193.                     vDown[i] = vDown[i - 1] + 1;
  194.                 }

  195.                 int x = vDown[i];
  196.                 int y = x - start1 + start2 - k;

  197.                 while (x < end1 && y < end2 && left.charAt(x) == right.charAt(y)) {
  198.                     vDown[i] = ++x;
  199.                     ++y;
  200.                 }
  201.                 // Second step
  202.                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d) {
  203.                     if (vUp[i - delta] <= vDown[i]) { // NOPMD
  204.                         return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
  205.                     }
  206.                 }
  207.             }

  208.             // Up
  209.             for (int k = delta - d; k <= delta + d; k += 2) {
  210.                 // First step
  211.                 final int i = k + offset - delta;
  212.                 if (k == delta - d
  213.                         || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
  214.                     vUp[i] = vUp[i + 1] - 1;
  215.                 } else {
  216.                     vUp[i] = vUp[i - 1];
  217.                 }

  218.                 int x = vUp[i] - 1;
  219.                 int y = x - start1 + start2 - k;
  220.                 while (x >= start1 && y >= start2
  221.                         && left.charAt(x) == right.charAt(y)) {
  222.                     vUp[i] = x--;
  223.                     y--;
  224.                 }
  225.                 // Second step
  226.                 if (delta % 2 == 0 && -d <= k && k <= d) {
  227.                     if (vUp[i] <= vDown[i + delta]) { // NOPMD
  228.                         return buildSnake(vUp[i], k + start1 - start2, end1, end2);
  229.                     }
  230.                 }
  231.             }
  232.         }

  233.         // this should not happen
  234.         throw new RuntimeException("Internal Error");
  235.     }

  236.     /**
  237.      * Build a snake.
  238.      *
  239.      * @param start  the value of the start of the snake
  240.      * @param diag  the value of the diagonal of the snake
  241.      * @param end1  the value of the end of the first sequence to be compared
  242.      * @param end2  the value of the end of the second sequence to be compared
  243.      * @return the snake built
  244.      */
  245.     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
  246.         int end = start;
  247.         while (end - diag < end2
  248.                 && end < end1
  249.                 && left.charAt(end) == right.charAt(end - diag)) {
  250.             ++end;
  251.         }
  252.         return new Snake(start, end, diag);
  253.     }

  254.     /**
  255.      * This class is a simple placeholder to hold the end part of a path
  256.      * under construction in a {@link StringsComparator StringsComparator}.
  257.      */
  258.     private static class Snake {

  259.         /** Start index. */
  260.         private final int start;

  261.         /** End index. */
  262.         private final int end;

  263.         /** Diagonal number. */
  264.         private final int diag;

  265.         /**
  266.          * Simple constructor. Creates a new instance of Snake with specified indices.
  267.          *
  268.          * @param start  start index of the snake
  269.          * @param end  end index of the snake
  270.          * @param diag  diagonal number
  271.          */
  272.         public Snake(final int start, final int end, final int diag) {
  273.             this.start = start;
  274.             this.end   = end;
  275.             this.diag  = diag;
  276.         }

  277.         /**
  278.          * Get the start index of the snake.
  279.          *
  280.          * @return start index of the snake
  281.          */
  282.         public int getStart() {
  283.             return start;
  284.         }

  285.         /**
  286.          * Get the end index of the snake.
  287.          *
  288.          * @return end index of the snake
  289.          */
  290.         public int getEnd() {
  291.             return end;
  292.         }

  293.         /**
  294.          * Get the diagonal number of the snake.
  295.          *
  296.          * @return diagonal number of the snake
  297.          */
  298.         public int getDiag() {
  299.             return diag;
  300.         }
  301.     }

  302. }