SequencesComparator.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.sequence;

  18. import java.util.List;

  19. import org.apache.commons.collections4.Equator;
  20. import org.apache.commons.collections4.functors.DefaultEquator;

  21. /**
  22.  * This class allows to compare two objects sequences.
  23.  * <p>
  24.  * The two sequences can hold any object type, as only the {@code equals}
  25.  * method is used to compare the elements of the sequences. It is guaranteed
  26.  * that the comparisons will always be done as {@code o1.equals(o2)} where
  27.  * {@code o1} belongs to the first sequence and {@code o2} belongs to
  28.  * the second sequence. This can be important if subclassing is used for some
  29.  * elements in the first sequence and the {@code equals} method is
  30.  * specialized.
  31.  * </p>
  32.  * <p>
  33.  * Comparison can be seen from two points of view: either as giving the smallest
  34.  * modification allowing to transform the first sequence into the second one, or
  35.  * as giving the longest sequence which is a subsequence of both initial
  36.  * sequences. The {@code equals} method is used to compare objects, so any
  37.  * object can be put into sequences. Modifications include deleting, inserting
  38.  * or keeping one object, starting from the beginning of the first sequence.
  39.  * </p>
  40.  * <p>
  41.  * This class implements the comparison algorithm, which is the very efficient
  42.  * algorithm from Eugene W. Myers
  43.  * <a href="https://www.cis.upenn.edu/~bcpierce/courses/dd/papers/diff.ps">
  44.  * An O(ND) Difference Algorithm and Its Variations</a>. This algorithm produces
  45.  * the shortest possible
  46.  * {@link EditScript edit script}
  47.  * containing all the
  48.  * {@link EditCommand commands}
  49.  * needed to transform the first sequence into the second one.
  50.  * </p>
  51.  *
  52.  * @param <T> the type of elements in the lists.
  53.  * @see EditScript
  54.  * @see EditCommand
  55.  * @see CommandVisitor
  56.  * @since 4.0
  57.  */
  58. public class SequencesComparator<T> {

  59.     /**
  60.      * This class is a simple placeholder to hold the end part of a path
  61.      * under construction in a {@link SequencesComparator SequencesComparator}.
  62.      */
  63.     private static final class Snake {

  64.         /** Start index. */
  65.         private final int start;

  66.         /** End index. */
  67.         private final int end;

  68.         /** Diagonal number. */
  69.         private final int diag;

  70.         /**
  71.          * Simple constructor. Creates a new instance of Snake with specified indices.
  72.          *
  73.          * @param start  start index of the snake
  74.          * @param end  end index of the snake
  75.          * @param diag  diagonal number
  76.          */
  77.         Snake(final int start, final int end, final int diag) {
  78.             this.start = start;
  79.             this.end   = end;
  80.             this.diag  = diag;
  81.         }

  82.         /**
  83.          * Gets the diagonal number of the snake.
  84.          *
  85.          * @return diagonal number of the snake
  86.          */
  87.         public int getDiag() {
  88.             return diag;
  89.         }

  90.         /**
  91.          * Gets the end index of the snake.
  92.          *
  93.          * @return end index of the snake
  94.          */
  95.         public int getEnd() {
  96.             return end;
  97.         }

  98.         /**
  99.          * Gets the start index of the snake.
  100.          *
  101.          * @return start index of the snake
  102.          */
  103.         public int getStart() {
  104.             return start;
  105.         }
  106.     }

  107.     /** First sequence. */
  108.     private final List<T> sequence1;

  109.     /** Second sequence. */
  110.     private final List<T> sequence2;

  111.     /** The equator used for testing object equality. */
  112.     private final Equator<? super T> equator;
  113.     /** Temporary variables. */
  114.     private final int[] vDown;

  115.     private final int[] vUp;

  116.     /**
  117.      * Simple constructor.
  118.      * <p>
  119.      * Creates a new instance of SequencesComparator using a {@link DefaultEquator}.
  120.      * <p>
  121.      * It is <em>guaranteed</em> that the comparisons will always be done as
  122.      * {@code o1.equals(o2)} where {@code o1} belongs to the first
  123.      * sequence and {@code o2} belongs to the second sequence. This can be
  124.      * important if subclassing is used for some elements in the first sequence
  125.      * and the {@code equals} method is specialized.
  126.      *
  127.      * @param sequence1  first sequence to be compared
  128.      * @param sequence2  second sequence to be compared
  129.      */
  130.     public SequencesComparator(final List<T> sequence1, final List<T> sequence2) {
  131.         this(sequence1, sequence2, DefaultEquator.defaultEquator());
  132.     }

  133.     /**
  134.      * Simple constructor.
  135.      * <p>
  136.      * Creates a new instance of SequencesComparator with a custom {@link Equator}.
  137.      * <p>
  138.      * It is <em>guaranteed</em> that the comparisons will always be done as
  139.      * {@code Equator.equate(o1, o2)} where {@code o1} belongs to the first
  140.      * sequence and {@code o2} belongs to the second sequence.
  141.      *
  142.      * @param sequence1  first sequence to be compared
  143.      * @param sequence2  second sequence to be compared
  144.      * @param equator  the equator to use for testing object equality
  145.      */
  146.     public SequencesComparator(final List<T> sequence1, final List<T> sequence2, final Equator<? super T> equator) {
  147.         this.sequence1 = sequence1;
  148.         this.sequence2 = sequence2;
  149.         this.equator = equator;

  150.         final int size = sequence1.size() + sequence2.size() + 2;
  151.         vDown = new int[size];
  152.         vUp   = new int[size];
  153.     }

  154.     /**
  155.      * Build an edit script.
  156.      *
  157.      * @param start1  the start of the first sequence to be compared
  158.      * @param end1  the end of the first sequence to be compared
  159.      * @param start2  the start of the second sequence to be compared
  160.      * @param end2  the end of the second sequence to be compared
  161.      * @param script the edited script
  162.      */
  163.     private void buildScript(final int start1, final int end1, final int start2, final int end2,
  164.                              final EditScript<T> script) {

  165.         final Snake middle = getMiddleSnake(start1, end1, start2, end2);

  166.         if (middle == null
  167.                 || middle.getStart() == end1 && middle.getDiag() == end1 - end2
  168.                 || middle.getEnd() == start1 && middle.getDiag() == start1 - start2) {

  169.             int i = start1;
  170.             int j = start2;
  171.             while (i < end1 || j < end2) {
  172.                 if (i < end1 && j < end2 && equator.equate(sequence1.get(i), sequence2.get(j))) {
  173.                     script.append(new KeepCommand<>(sequence1.get(i)));
  174.                     ++i;
  175.                     ++j;
  176.                 } else if (end1 - start1 > end2 - start2) {
  177.                     script.append(new DeleteCommand<>(sequence1.get(i)));
  178.                     ++i;
  179.                 } else {
  180.                     script.append(new InsertCommand<>(sequence2.get(j)));
  181.                     ++j;
  182.                 }
  183.             }

  184.         } else {

  185.             buildScript(start1, middle.getStart(),
  186.                         start2, middle.getStart() - middle.getDiag(),
  187.                         script);
  188.             for (int i = middle.getStart(); i < middle.getEnd(); ++i) {
  189.                 script.append(new KeepCommand<>(sequence1.get(i)));
  190.             }
  191.             buildScript(middle.getEnd(), end1,
  192.                         middle.getEnd() - middle.getDiag(), end2,
  193.                         script);
  194.         }
  195.     }

  196.     /**
  197.      * Build a snake.
  198.      *
  199.      * @param start  the value of the start of the snake
  200.      * @param diag  the value of the diagonal of the snake
  201.      * @param end1  the value of the end of the first sequence to be compared
  202.      * @param end2  the value of the end of the second sequence to be compared
  203.      * @return the snake built
  204.      */
  205.     private Snake buildSnake(final int start, final int diag, final int end1, final int end2) {
  206.         int end = start;
  207.         while (end - diag < end2
  208.                 && end < end1
  209.                 && equator.equate(sequence1.get(end), sequence2.get(end - diag))) {
  210.             ++end;
  211.         }
  212.         return new Snake(start, end, diag);
  213.     }

  214.     /**
  215.      * Gets the middle snake corresponding to two subsequences of the
  216.      * main sequences.
  217.      * <p>
  218.      * The snake is found using the MYERS Algorithm (this algorithm has
  219.      * also been implemented in the GNU diff program). This algorithm is
  220.      * explained in Eugene Myers article:
  221.      * <a href="https://web.archive.org/web/20040719035900/http%3A//www.cs.arizona.edu/people/gene/PAPERS/diff.ps">
  222.      * An O(ND) Difference Algorithm and Its Variations</a>.
  223.      *
  224.      * @param start1  the start of the first sequence to be compared
  225.      * @param end1  the end of the first sequence to be compared
  226.      * @param start2  the start of the second sequence to be compared
  227.      * @param end2  the end of the second sequence to be compared
  228.      * @return the middle snake
  229.      */
  230.     private Snake getMiddleSnake(final int start1, final int end1, final int start2, final int end2) {
  231.         // Myers Algorithm
  232.         // Initializations
  233.         final int m = end1 - start1;
  234.         final int n = end2 - start2;
  235.         if (m == 0 || n == 0) {
  236.             return null;
  237.         }

  238.         final int delta = m - n;
  239.         final int sum = n + m;
  240.         final int offset = (sum % 2 == 0 ? sum : sum + 1) / 2;
  241.         vDown[1 + offset] = start1;
  242.         vUp[1 + offset] = end1 + 1;

  243.         for (int d = 0; d <= offset; ++d) {
  244.             // Down
  245.             for (int k = -d; k <= d; k += 2) {
  246.                 // First step

  247.                 final int i = k + offset;
  248.                 if (k == -d || k != d && vDown[i - 1] < vDown[i + 1]) {
  249.                     vDown[i] = vDown[i + 1];
  250.                 } else {
  251.                     vDown[i] = vDown[i - 1] + 1;
  252.                 }

  253.                 int x = vDown[i];
  254.                 int y = x - start1 + start2 - k;

  255.                 while (x < end1 && y < end2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
  256.                     vDown[i] = ++x;
  257.                     ++y;
  258.                 }
  259.                 // Second step
  260.                 if (delta % 2 != 0 && delta - d <= k && k <= delta + d && vUp[i - delta] <= vDown[i]) { // NOPMD
  261.                     return buildSnake(vUp[i - delta], k + start1 - start2, end1, end2);
  262.                 }
  263.             }

  264.             // Up
  265.             for (int k = delta - d; k <= delta + d; k += 2) {
  266.                 // First step
  267.                 final int i = k + offset - delta;
  268.                 if (k == delta - d || k != delta + d && vUp[i + 1] <= vUp[i - 1]) {
  269.                     vUp[i] = vUp[i + 1] - 1;
  270.                 } else {
  271.                     vUp[i] = vUp[i - 1];
  272.                 }

  273.                 int x = vUp[i] - 1;
  274.                 int y = x - start1 + start2 - k;
  275.                 while (x >= start1 && y >= start2 && equator.equate(sequence1.get(x), sequence2.get(y))) {
  276.                     vUp[i] = x--;
  277.                     y--;
  278.                 }
  279.                 // Second step
  280.                 if (delta % 2 == 0 && -d <= k && k <= d && vUp[i] <= vDown[i + delta]) { // NOPMD
  281.                     return buildSnake(vUp[i], k + start1 - start2, end1, end2);
  282.                 }
  283.             }
  284.         }

  285.         // this should not happen
  286.         throw new IllegalStateException("Internal Error");
  287.     }

  288.     /**
  289.      * Gets the {@link EditScript} object.
  290.      * <p>
  291.      * It is guaranteed that the objects embedded in the {@link InsertCommand
  292.      * insert commands} come from the second sequence and that the objects
  293.      * embedded in either the {@link DeleteCommand delete commands} or
  294.      * {@link KeepCommand keep commands} come from the first sequence. This can
  295.      * be important if subclassing is used for some elements in the first
  296.      * sequence and the {@code equals} method is specialized.
  297.      *
  298.      * @return the edit script resulting from the comparison of the two
  299.      *         sequences
  300.      */
  301.     public EditScript<T> getScript() {
  302.         final EditScript<T> script = new EditScript<>();
  303.         buildScript(0, sequence1.size(), 0, sequence2.size(), script);
  304.         return script;
  305.     }
  306. }