KolmogorovSmirnovTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.statistics.inference;

  18. import java.util.Arrays;
  19. import java.util.Objects;
  20. import java.util.function.DoubleSupplier;
  21. import java.util.function.DoubleUnaryOperator;
  22. import java.util.function.IntToDoubleFunction;
  23. import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
  24. import org.apache.commons.numbers.combinatorics.Factorial;
  25. import org.apache.commons.numbers.core.ArithmeticUtils;
  26. import org.apache.commons.numbers.core.Sum;
  27. import org.apache.commons.rng.UniformRandomProvider;

  28. /**
  29.  * Implements the Kolmogorov-Smirnov (K-S) test for equality of continuous distributions.
  30.  *
  31.  * <p>The one-sample test uses a D statistic based on the maximum deviation of the empirical
  32.  * distribution of sample data points from the distribution expected under the null hypothesis.
  33.  *
  34.  * <p>The two-sample test uses a D statistic based on the maximum deviation of the two empirical
  35.  * distributions of sample data points. The two-sample tests evaluate the null hypothesis that
  36.  * the two samples {@code x} and {@code y} come from the same underlying distribution.
  37.  *
  38.  * <p>References:
  39.  * <ol>
  40.  * <li>
  41.  * Marsaglia, G., Tsang, W. W., &amp; Wang, J. (2003).
  42.  * <a href="https://doi.org/10.18637/jss.v008.i18">Evaluating Kolmogorov's Distribution.</a>
  43.  * Journal of Statistical Software, 8(18), 1–4.
  44.  * <li>Simard, R., &amp; L’Ecuyer, P. (2011).
  45.  * <a href="https://doi.org/10.18637/jss.v039.i11">Computing the Two-Sided Kolmogorov-Smirnov Distribution.</a>
  46.  * Journal of Statistical Software, 39(11), 1–18.
  47.  * <li>Sekhon, J. S. (2011).
  48.  * <a href="https://doi.org/10.18637/jss.v042.i07">
  49.  * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
  50.  * The Matching package for R.</a>
  51.  * Journal of Statistical Software, 42(7), 1–52.
  52.  * <li>Viehmann, T (2021).
  53.  * <a href="https://doi.org/10.48550/arXiv.2102.08037">
  54.  * Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.</a>
  55.  * arXiv:2102.08037
  56.  * <li>Hodges, J. L. (1958).
  57.  * <a href="https://doi.org/10.1007/BF02589501">
  58.  * The significance probability of the Smirnov two-sample test.</a>
  59.  * Arkiv for Matematik, 3(5), 469-486.
  60.  * </ol>
  61.  *
  62.  * <p>Note that [1] contains an error in computing h, refer to <a
  63.  * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
  64.  *
  65.  * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
  66.  * Kolmogorov-Smirnov (K-S) test (Wikipedia)</a>
  67.  * @since 1.1
  68.  */
  69. public final class KolmogorovSmirnovTest {
    /** Name for sample 1 (used in error messages and validation). */
    private static final String SAMPLE_1_NAME = "Sample 1";
    /** Name for sample 2 (used in error messages and validation). */
    private static final String SAMPLE_2_NAME = "Sample 2";
    /** When the largest sample size exceeds this value, 2-sample test AUTO p-value
     * uses an asymptotic distribution to compute the p-value. */
    private static final int LARGE_SAMPLE = 10000;
    /** Maximum n for which n! is finite as a double. */
    private static final int MAX_FACTORIAL = 170;
    /** Maximum length of an array. This is used to determine if two arrays can be concatenated
     * to create a sampler from the joint distribution. The limit is copied from the limit
     * of java.util.ArrayList. */
    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
    /** The maximum least common multiple (lcm) to attempt the exact p-value computation.
     * The integral d value is in [0, n*m] in steps of the greatest common denominator (gcd),
     * thus lcm = n*m/gcd is the number of possible different p-values.
     * Some methods have a lower limit due to computation limits. This should be larger
     * than LARGE_SAMPLE^2 so all AUTO p-values attempt an exact computation, i.e.
     * at least 10000^2 ~ 2^26.56. */
    private static final long MAX_LCM_TWO_SAMPLE_EXACT_P = 1L << 31;
    /** Placeholder to use for the two-sample sign array when the value can be ignored. */
    private static final int[] IGNORED_SIGN = new int[1];
    /** Placeholder to use for the two-sample ties D array when the value can be ignored. */
    private static final long[] IGNORED_D = new long[2];
    /** Default instance: two-sided, AUTO p-value, non-strict inequality, no RNG,
     * 1000 estimation iterations. */
    private static final KolmogorovSmirnovTest DEFAULT = new KolmogorovSmirnovTest(
        AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, false, null, 1000);

    /** Alternative hypothesis. */
    private final AlternativeHypothesis alternative;
    /** Method to compute the p-value. */
    private final PValueMethod pValueMethod;
    /** Use a strict inequality for the two-sample exact p-value. */
    private final boolean strictInequality;
    /** Source of randomness (may be null; required only for the ESTIMATE p-value method). */
    private final UniformRandomProvider rng;
    /** Number of iterations used when estimating the p-value by resampling. */
    private final int iterations;

  107.     /**
  108.      * Result for the one-sample Kolmogorov-Smirnov test.
  109.      *
  110.      * <p>This class is immutable.
  111.      *
  112.      * @since 1.1
  113.      */
  114.     public static class OneResult extends BaseSignificanceResult {
  115.         /** Sign of the statistic. */
  116.         private final int sign;

  117.         /**
  118.          * Create an instance.
  119.          *
  120.          * @param statistic Test statistic.
  121.          * @param sign Sign of the statistic.
  122.          * @param p Result p-value.
  123.          */
  124.         OneResult(double statistic, int sign, double p) {
  125.             super(statistic, p);
  126.             this.sign = sign;
  127.         }

  128.         /**
  129.          * Gets the sign of the statistic. This is 1 for \(D^+\) and -1 for \(D^-\).
  130.          * For a two sided-test this is zero if the magnitude of \(D^+\) and \(D^-\) was equal;
  131.          * otherwise the sign indicates the larger of \(D^+\) or \(D^-\).
  132.          *
  133.          * @return the sign
  134.          */
  135.         public int getSign() {
  136.             return sign;
  137.         }
  138.     }

  139.     /**
  140.      * Result for the two-sample Kolmogorov-Smirnov test.
  141.      *
  142.      * <p>This class is immutable.
  143.      *
  144.      * @since 1.1
  145.      */
  146.     public static final class TwoResult extends OneResult {
  147.         /** Flag to indicate there were significant ties.
  148.          * Note that in extreme cases there may be significant ties despite {@code upperD == D}
  149.          * due to rounding when converting the integral statistic to a double. For this
  150.          * reason the presence of ties is stored as a flag. */
  151.         private final boolean significantTies;
  152.         /** Upper bound of the D statistic from all possible paths through regions with ties. */
  153.         private final double upperD;
  154.         /** The p-value of the upper D value. */
  155.         private final double upperP;

  156.         /**
  157.          * Create an instance.
  158.          *
  159.          * @param statistic Test statistic.
  160.          * @param sign Sign of the statistic.
  161.          * @param p Result p-value.
  162.          * @param significantTies Flag to indicate there were significant ties.
  163.          * @param upperD Upper bound of the D statistic from all possible paths through
  164.          * regions with ties.
  165.          * @param upperP The p-value of the upper D value.
  166.          */
  167.         TwoResult(double statistic, int sign, double p, boolean significantTies, double upperD, double upperP) {
  168.             super(statistic, sign, p);
  169.             this.significantTies = significantTies;
  170.             this.upperD = upperD;
  171.             this.upperP = upperP;
  172.         }

  173.         /**
  174.          * {@inheritDoc}
  175.          *
  176.          * <p><strong>Ties</strong>
  177.          *
  178.          * <p>The presence of ties in the data creates a distribution for the D values generated
  179.          * by all possible orderings of the tied regions. This statistic is computed using the
  180.          * path with the <em>minimum</em> effect on the D statistic.
  181.          *
  182.          * <p>For a one-sided statistic \(D^+\) or \(D^-\), this is the lower bound of the D statistic.
  183.          *
  184.          * <p>For a two-sided statistic D, this may be <em>below</em> the lower bound of the
  185.          * distribution of all possible D values. This case occurs when the number of ties
  186.          * is very high and is identified by a {@link #getPValue() p-value} of 1.
  187.          *
  188.          * <p>If the two-sided statistic is zero this only occurs in the presence of ties:
  189.          * either the two arrays are identical, are 'identical' data of a single value
  190.          * (sample sizes may be different), or have a sequence of ties of 'identical' data
  191.          * with a net zero effect on the D statistic, e.g.
  192.          * <pre>
  193.          *  [1,2,3]           vs [1,2,3]
  194.          *  [0,0,0,0]         vs [0,0,0]
  195.          *  [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
  196.          * </pre>
  197.          */
  198.         @Override
  199.         public double getStatistic() {
  200.             // Note: This method is here for documentation
  201.             return super.getStatistic();
  202.         }

  203.         /**
  204.          * Returns {@code true} if there were ties between samples that occurred
  205.          * in a region which could change the D statistic if the ties were resolved to
  206.          * a defined order.
  207.          *
  208.          * <p>Ties between the data can be interpreted as if the values were different
  209.          * but within machine epsilon. In this case the order within the tie region is not known.
  210.          * If the most extreme ordering of any tied regions (e.g. all tied values of {@code x}
  211.          * before all tied values of {@code y}) could create a larger D statistic this
  212.          * method will return {@code true}.
  213.          *
  214.          * <p>If there were no ties, or all possible orderings of tied regions create the same
  215.          * D statistic, this method returns {@code false}.
  216.          *
  217.          * <p>Note it is possible that this method returns {@code true} when {@code D == upperD}
  218.          * due to rounding when converting the computed D statistic to a double. This will
  219.          * only occur for large sample sizes {@code n} and {@code m} where the product
  220.          * {@code n*m >= 2^53}.
  221.          *
  222.          * @return true if the D statistic could be changed by resolution of ties
  223.          * @see #getUpperD()
  224.          */
  225.         public boolean hasSignificantTies() {
  226.             return significantTies;
  227.         }

  228.         /**
  229.          * Return the upper bound of the D statistic from all possible paths through regions with ties.
  230.          *
  231.          * <p>This will return a value equal to or greater than {@link #getStatistic()}.
  232.          *
  233.          * @return the upper bound of D
  234.          * @see #hasSignificantTies()
  235.          */
  236.         public double getUpperD() {
  237.             return upperD;
  238.         }

  239.         /**
  240.          * Return the p-value of the upper bound of the D statistic.
  241.          *
  242.          * <p>If computed, this will return a value equal to or less than
  243.          * {@link #getPValue() getPValue}. It may be orders of magnitude smaller.
  244.          *
  245.          * <p>Note: This p-value corresponds to the most extreme p-value from all possible
  246.          * orderings of tied regions. It is <strong>not</strong> recommended to use this to
  247.          * reject the null hypothesis. The upper bound of D and the corresponding p-value
  248.          * provide information that must be interpreted in the context of the sample data and
  249.          * used to inform a decision on the suitability of the test to the data.
  250.          *
  251.          * <p>This value is set to {@link Double#NaN NaN} if the {@link #getPValue() p-value} was
  252.          * {@linkplain PValueMethod#ESTIMATE estimated}. The estimated p-value will have been created
  253.          * using a distribution of possible D values given the underlying joint distribution of
  254.          * the sample data. Comparison of the p-value to the upper p-value is not applicable.
  255.          *
  256.          * @return the p-value of the upper bound of D (or NaN)
  257.          * @see #getUpperD()
  258.          */
  259.         public double getUpperPValue() {
  260.             return upperP;
  261.         }
  262.     }

  263.     /**
  264.      * @param alternative Alternative hypothesis.
  265.      * @param method P-value method.
  266.      * @param strict true to use a strict inequality.
  267.      * @param rng Source of randomness.
  268.      * @param iterations Number of iterations.
  269.      */
  270.     private KolmogorovSmirnovTest(AlternativeHypothesis alternative, PValueMethod method, boolean strict,
  271.         UniformRandomProvider rng, int iterations) {
  272.         this.alternative = alternative;
  273.         this.pValueMethod = method;
  274.         this.strictInequality = strict;
  275.         this.rng = rng;
  276.         this.iterations = iterations;
  277.     }

  278.     /**
  279.      * Return an instance using the default options.
  280.      *
  281.      * <ul>
  282.      * <li>{@link AlternativeHypothesis#TWO_SIDED}
  283.      * <li>{@link PValueMethod#AUTO}
  284.      * <li>{@link Inequality#NON_STRICT}
  285.      * <li>{@linkplain #with(UniformRandomProvider) RNG = none}
  286.      * <li>{@linkplain #withIterations(int) Iterations = 1000}
  287.      * </ul>
  288.      *
  289.      * @return default instance
  290.      */
  291.     public static KolmogorovSmirnovTest withDefaults() {
  292.         return DEFAULT;
  293.     }

  294.     /**
  295.      * Return an instance with the configured alternative hypothesis.
  296.      *
  297.      * @param v Value.
  298.      * @return an instance
  299.      */
  300.     public KolmogorovSmirnovTest with(AlternativeHypothesis v) {
  301.         return new KolmogorovSmirnovTest(Objects.requireNonNull(v), pValueMethod, strictInequality, rng, iterations);
  302.     }

  303.     /**
  304.      * Return an instance with the configured p-value method.
  305.      *
  306.      * <p>For the one-sample two-sided test Kolmogorov's asymptotic approximation can be used;
  307.      * otherwise the p-value uses the distribution of the D statistic.
  308.      *
  309.      * <p>For the two-sample test the exact p-value can be computed for small sample sizes;
  310.      * otherwise the p-value resorts to the asymptotic approximation. Alternatively a p-value
  311.      * can be estimated from the combined distribution of the samples. This requires a source
  312.      * of randomness.
  313.      *
  314.      * @param v Value.
  315.      * @return an instance
  316.      * @see #with(UniformRandomProvider)
  317.      */
  318.     public KolmogorovSmirnovTest with(PValueMethod v) {
  319.         return new KolmogorovSmirnovTest(alternative, Objects.requireNonNull(v), strictInequality, rng, iterations);
  320.     }

  321.     /**
  322.      * Return an instance with the configured inequality.
  323.      *
  324.      * <p>Computes the p-value for the two-sample test as \(P(D_{n,m} &gt; d)\) if strict;
  325.      * otherwise \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample
  326.      * Kolmogorov-Smirnov statistic, either the two-sided \(D_{n,m}\) or one-sided
  327.      * \(D_{n,m}^+\) or \(D_{n,m}^-\).
  328.      *
  329.      * @param v Value.
  330.      * @return an instance
  331.      */
  332.     public KolmogorovSmirnovTest with(Inequality v) {
  333.         return new KolmogorovSmirnovTest(alternative, pValueMethod,
  334.             Objects.requireNonNull(v) == Inequality.STRICT, rng, iterations);
  335.     }

  336.     /**
  337.      * Return an instance with the configured source of randomness.
  338.      *
  339.      * <p>Applies to the two-sample test when the p-value method is set to
  340.      * {@link PValueMethod#ESTIMATE ESTIMATE}. The randomness
  341.      * is used for sampling of the combined distribution.
  342.      *
  343.      * <p>There is no default source of randomness. If the randomness is not
  344.      * set then estimation is not possible and an {@link IllegalStateException} will be
  345.      * raised in the two-sample test.
  346.      *
  347.      * @param v Value.
  348.      * @return an instance
  349.      * @see #with(PValueMethod)
  350.      */
  351.     public KolmogorovSmirnovTest with(UniformRandomProvider v) {
  352.         return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality,
  353.             Objects.requireNonNull(v), iterations);
  354.     }

  355.     /**
  356.      * Return an instance with the configured number of iterations.
  357.      *
  358.      * <p>Applies to the two-sample test when the p-value method is set to
  359.      * {@link PValueMethod#ESTIMATE ESTIMATE}. This is the number of sampling iterations
  360.      * used to estimate the p-value. The p-value is a fraction using the {@code iterations}
  361.      * as the denominator. The number of significant digits in the p-value is
  362.      * upper bounded by log<sub>10</sub>(iterations); small p-values have fewer significant
  363.      * digits. A large number of iterations is recommended when using a small critical
  364.      * value to reject the null hypothesis.
  365.      *
  366.      * @param v Value.
  367.      * @return an instance
  368.      * @throws IllegalArgumentException if the number of iterations is not strictly positive
  369.      */
  370.     public KolmogorovSmirnovTest withIterations(int v) {
  371.         return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality, rng,
  372.             Arguments.checkStrictlyPositive(v));
  373.     }

  374.     /**
  375.      * Computes the one-sample Kolmogorov-Smirnov test statistic.
  376.      *
  377.      * <ul>
  378.      * <li>two-sided: \(D_n=\sup_x |F_n(x)-F(x)|\)
  379.      * <li>greater: \(D_n^+=\sup_x (F_n(x)-F(x))\)
  380.      * <li>less: \(D_n^-=\sup_x (F(x)-F_n(x))\)
  381.      * </ul>
  382.      *
  383.      * <p>where \(F\) is the distribution cumulative density function ({@code cdf}),
  384.      * \(n\) is the length of {@code x} and \(F_n\) is the empirical distribution that puts
  385.      * mass \(1/n\) at each of the values in {@code x}.
  386.      *
  387.      * <p>The cumulative distribution function should map a real value {@code x} to a probability
  388.      * in [0, 1]. To use a reference distribution the CDF can be passed using a method reference:
  389.      * <pre>
  390.      * UniformContinuousDistribution dist = UniformContinuousDistribution.of(0, 1);
  391.      * UniformRandomProvider rng = RandomSource.KISS.create(123);
  392.      * double[] x = dist.sampler(rng).samples(100);
  393.      * double d = KolmogorovSmirnovTest.withDefaults().statistic(x, dist::cumulativeProbability);
  394.      * </pre>
  395.      *
  396.      * @param cdf Reference cumulative distribution function.
  397.      * @param x Sample being evaluated.
  398.      * @return Kolmogorov-Smirnov statistic
  399.      * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
  400.      * @see #test(double[], DoubleUnaryOperator)
  401.      */
  402.     public double statistic(double[] x, DoubleUnaryOperator cdf) {
  403.         return computeStatistic(x, cdf, IGNORED_SIGN);
  404.     }

  405.     /**
  406.      * Computes the two-sample Kolmogorov-Smirnov test statistic.
  407.      *
  408.      * <ul>
  409.      * <li>two-sided: \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
  410.      * <li>greater: \(D_{n,m}^+=\sup_x (F_n(x)-F_m(x))\)
  411.      * <li>less: \(D_{n,m}^-=\sup_x (F_m(x)-F_n(x))\)
  412.      * </ul>
  413.      *
  414.      * <p>where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
  415.      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
  416.      * is the empirical distribution that puts mass \(1/m\) at each of the values in {@code y}.
  417.      *
  418.      * @param x First sample.
  419.      * @param y Second sample.
  420.      * @return Kolmogorov-Smirnov statistic
  421.      * @throws IllegalArgumentException if either {@code x} or {@code y} does not
  422.      * have length at least 2; or contain NaN values.
  423.      * @see #test(double[], double[])
  424.      */
  425.     public double statistic(double[] x, double[] y) {
  426.         final int n = checkArrayLength(x);
  427.         final int m = checkArrayLength(y);
  428.         // Clone to avoid destructive modification of input
  429.         final long dnm = computeIntegralKolmogorovSmirnovStatistic(x.clone(), y.clone(),
  430.                 IGNORED_SIGN, IGNORED_D);
  431.         // Re-use the method to compute D in [0, 1] for consistency
  432.         return computeD(dnm, n, m, ArithmeticUtils.gcd(n, m));
  433.     }

  434.     /**
  435.      * Performs a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis
  436.      * that {@code x} conforms to the distribution cumulative density function ({@code cdf}).
  437.      *
  438.      * <p>The test is defined by the {@link AlternativeHypothesis}:
  439.      * <ul>
  440.      * <li>Two-sided evaluates the null hypothesis that the two distributions are
  441.      * identical, \(F_n(i) = F(i)\) for all \( i \); the alternative is that the are not
  442.      * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.
  443.      * <li>Greater evaluates the null hypothesis that the \(F_n(i) &lt;= F(i)\) for all \( i \);
  444.      * the alternative is \(F_n(i) &gt; F(i)\) for at least one \( i \). The statistic is \( D_n^+ \).
  445.      * <li>Less evaluates the null hypothesis that the \(F_n(i) &gt;= F(i)\) for all \( i \);
  446.      * the alternative is \(F_n(i) &lt; F(i)\) for at least one \( i \). The statistic is \( D_n^- \).
  447.      * </ul>
  448.      *
  449.      * <p>The p-value method defaults to exact. The one-sided p-value uses Smirnov's stable formula:
  450.      *
  451.      * <p>\[ P(D_n^+ \ge x) = x \sum_{j=0}^{\lfloor n(1-x) \rfloor} \binom{n}{j} \left(\frac{j}{n} + x\right)^{j-1} \left(1-x-\frac{j}{n} \right)^{n-j} \]
  452.      *
  453.      * <p>The two-sided p-value is computed using methods described in
  454.      * Simard &amp; L’Ecuyer (2011). The two-sided test supports an asymptotic p-value
  455.      * using Kolmogorov's formula:
  456.      *
  457.      * <p>\[ \lim_{n\to\infty} P(\sqrt{n}D_n &gt; z) = 2 \sum_{i=1}^\infty (-1)^{i-1} e^{-2 i^2 z^2} \]
  458.      *
  459.      * @param x Sample being being evaluated.
  460.      * @param cdf Reference cumulative distribution function.
  461.      * @return test result
  462.      * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
  463.      * @see #statistic(double[], DoubleUnaryOperator)
  464.      */
  465.     public OneResult test(double[] x, DoubleUnaryOperator cdf) {
  466.         final int[] sign = {0};
  467.         final double d = computeStatistic(x, cdf, sign);
  468.         final double p;
  469.         if (alternative == AlternativeHypothesis.TWO_SIDED) {
  470.             PValueMethod method = pValueMethod;
  471.             if (method == PValueMethod.AUTO) {
  472.                 // No switch to the asymptotic for large n
  473.                 method = PValueMethod.EXACT;
  474.             }
  475.             if (method == PValueMethod.ASYMPTOTIC) {
  476.                 // Kolmogorov's asymptotic formula using z = sqrt(n) * d
  477.                 p = KolmogorovSmirnovDistribution.ksSum(Math.sqrt(x.length) * d);
  478.             } else {
  479.                 // exact
  480.                 p = KolmogorovSmirnovDistribution.Two.sf(d, x.length);
  481.             }
  482.         } else {
  483.             // one-sided: always use exact
  484.             p = KolmogorovSmirnovDistribution.One.sf(d, x.length);
  485.         }
  486.         return new OneResult(d, sign[0], p);
  487.     }

  488.     /**
  489.      * Performs a two-sample Kolmogorov-Smirnov test on samples {@code x} and {@code y}.
  490.      * Test the empirical distributions \(F_n\) and \(F_m\) where \(n\) is the length
  491.      * of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the empirical distribution
  492.      * that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) is the empirical
  493.      * distribution that puts mass \(1/m\) of the {@code y} values.
  494.      *
  495.      * <p>The test is defined by the {@link AlternativeHypothesis}:
  496.      * <ul>
  497.      * <li>Two-sided evaluates the null hypothesis that the two distributions are
  498.      * identical, \(F_n(i) = F_m(i)\) for all \( i \); the alternative is that they are not
  499.      * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.
  500.      * <li>Greater evaluates the null hypothesis that the \(F_n(i) &lt;= F_m(i)\) for all \( i \);
  501.      * the alternative is \(F_n(i) &gt; F_m(i)\) for at least one \( i \). The statistic is \( D_n^+ \).
  502.      * <li>Less evaluates the null hypothesis that the \(F_n(i) &gt;= F_m(i)\) for all \( i \);
  503.      * the alternative is \(F_n(i) &lt; F_m(i)\) for at least one \( i \). The statistic is \( D_n^- \).
  504.      * </ul>
  505.      *
  506.      * <p>If the {@linkplain PValueMethod p-value method} is auto, then an exact p computation
  507.      * is attempted if both sample sizes are less than 10000 using the methods presented in
  508.      * Viehmann (2021) and Hodges (1958); otherwise an asymptotic p-value is returned.
  509.      * The two-sided p-value is \(\overline{F}(d, \sqrt{mn / (m + n)})\) where \(\overline{F}\)
  510.      * is the complementary cumulative density function of the two-sided one-sample
  511.      * Kolmogorov-Smirnov distribution. The one-sided p-value uses an approximation from
  512.      * Hodges (1958) Eq 5.3.
  513.      *
  514.      * <p>\(D_{n,m}\) has a discrete distribution. This makes the p-value associated with the
  515.      * null hypothesis \(H_0 : D_{n,m} \gt d \) differ from \(H_0 : D_{n,m} \ge d \)
  516.      * by the mass of the observed value \(d\). These can be distinguished using an
  517.      * {@link Inequality} parameter. This is ignored for large samples.
  518.      *
  519.      * <p>If the data contains ties there is no defined ordering in the tied region to use
  520.      * for the difference between the two empirical distributions. Each ordering of the
  521.      * tied region <em>may</em> create a different D statistic. All possible orderings
  522.      * generate a distribution for the D value. In this case the tied region is traversed
  523.      * entirely and the effect on the D value evaluated at the end of the tied region.
  524.      * This is the path with the least change on the D statistic. The path with the
  525.      * greatest change on the D statistic is also computed as the upper bound on D.
  526.      * If these two values are different then the tied region is known to generate a
  527.      * distribution for the D statistic and the p-value is an over estimate for the cases
  528.      * with a larger D statistic. The presence of any significant tied regions is returned
  529.      * in the result.
  530.      *
  531.      * <p>If the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE} then the p-value is
  532.      * estimated by repeat sampling of the joint distribution of {@code x} and {@code y}.
  533.      * The p-value is the frequency that a sample creates a D statistic
  534.      * greater than or equal to (or greater than for strict inequality) the observed value.
  535.      * In this case a source of randomness must be configured or an {@link IllegalStateException}
  536.      * will be raised. The p-value for the upper bound on D will not be estimated and is set to
  537.      * {@link Double#NaN NaN}. This estimation procedure is not affected by ties in the data
  538.      * and is increasingly robust for larger datasets. The method is modeled after
  539.      * <a href="https://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
  540.      * in the R {@code Matching} package (Sekhon (2011)).
  541.      *
  542.      * @param x First sample.
  543.      * @param y Second sample.
  544.      * @return test result
  545.      * @throws IllegalArgumentException if either {@code x} or {@code y} does not
  546.      * have length at least 2; or contain NaN values.
  547.      * @throws IllegalStateException if the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE}
  548.      * and there is no source of randomness.
  549.      * @see #statistic(double[], double[])
  550.      */
  551.     public TwoResult test(double[] x, double[] y) {
  552.         final int n = checkArrayLength(x);
  553.         final int m = checkArrayLength(y);
  554.         PValueMethod method = pValueMethod;
  555.         final int[] sign = {0};
  556.         final long[] tiesD = {0, 0};

  557.         final double[] sx = x.clone();
  558.         final double[] sy = y.clone();
  559.         final long dnm = computeIntegralKolmogorovSmirnovStatistic(sx, sy, sign, tiesD);

  560.         // Compute p-value. Note that the p-value is not invalidated by ties; it is the
  561.         // D statistic that could be invalidated by resolution of the ties. So compute
  562.         // the exact p even if ties are present.
  563.         if (method == PValueMethod.AUTO) {
  564.             // Use exact for small samples
  565.             method = Math.max(n, m) < LARGE_SAMPLE ?
  566.                 PValueMethod.EXACT :
  567.                 PValueMethod.ASYMPTOTIC;
  568.         }
  569.         final int gcd = ArithmeticUtils.gcd(n, m);
  570.         final double d = computeD(dnm, n, m, gcd);
  571.         final boolean significantTies = tiesD[1] > dnm;
  572.         final double d2 = significantTies ? computeD(tiesD[1], n, m, gcd) : d;

  573.         final double p;
  574.         double p2;

  575.         // Allow bootstrap estimation of the p-value
  576.         if (method == PValueMethod.ESTIMATE) {
  577.             p = estimateP(sx, sy, dnm);
  578.             p2 = Double.NaN;
  579.         } else {
  580.             final boolean exact = method == PValueMethod.EXACT;
  581.             p = p2 = twoSampleP(dnm, n, m, gcd, d, exact);
  582.             if (significantTies) {
  583.                 // Compute the upper bound on D.
  584.                 // The p-value is also computed. The alternative is to save the options
  585.                 // in the result with (upper dnm, n, m) and compute it on-demand.
  586.                 // Note detection of whether the exact P computation is possible is based on
  587.                 // n and m, thus this will use the same computation.
  588.                 p2 = twoSampleP(tiesD[1], n, m, gcd, d2, exact);
  589.             }
  590.         }
  591.         return new TwoResult(d, sign[0], p, significantTies, d2, p2);
  592.     }

  593.     /**
  594.      * Estimates the <i>p-value</i> of a two-sample Kolmogorov-Smirnov test evaluating the
  595.      * null hypothesis that {@code x} and {@code y} are samples drawn from the same
  596.      * probability distribution.
  597.      *
  598.      * <p>This method will destructively modify the input arrays (via a sort).
  599.      *
  600.      * <p>This method estimates the p-value by repeatedly sampling sets of size
  601.      * {@code x.length} and {@code y.length} from the empirical distribution
  602.      * of the combined sample. The memory requirement is an array copy of each of
  603.      * the input arguments.
  604.      *
  605.      * <p>When using strict inequality, this is equivalent to the algorithm implemented in
  606.      * the R function {@code ks.boot} as described in Sekhon (2011) Journal of Statistical
  607.      * Software, 42(7), 1–52 [3].
  608.      *
  609.      * @param x First sample.
  610.      * @param y Second sample.
  611.      * @param dnm Integral D statistic.
  612.      * @return p-value
  613.      * @throws IllegalStateException if the source of randomness is null.
  614.      */
  615.     private double estimateP(double[] x, double[] y, long dnm) {
  616.         if (rng == null) {
  617.             throw new IllegalStateException("No source of randomness");
  618.         }

  619.         // Test if the random statistic is greater (strict), or greater or equal to d
  620.         final long d = strictInequality ? dnm : dnm - 1;

  621.         final long plus;
  622.         final long minus;
  623.         if (alternative == AlternativeHypothesis.GREATER_THAN) {
  624.             plus = d;
  625.             minus = Long.MIN_VALUE;
  626.         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
  627.             plus = Long.MAX_VALUE;
  628.             minus = -d;
  629.         } else {
  630.             // two-sided
  631.             plus = d;
  632.             minus = -d;
  633.         }

  634.         // Test dnm=0. This occurs for example when x == y.
  635.         if (0 < minus || 0 > plus) {
  636.             // Edge case where all possible d will be outside the inclusive bounds
  637.             return 1;
  638.         }

  639.         // Sample randomly with replacement from the combined distribution.
  640.         final DoubleSupplier gen = createSampler(x, y, rng);
  641.         int count = 0;
  642.         for (int i = iterations; i > 0; i--) {
  643.             for (int j = 0; j < x.length; j++) {
  644.                 x[j] = gen.getAsDouble();
  645.             }
  646.             for (int j = 0; j < y.length; j++) {
  647.                 y[j] = gen.getAsDouble();
  648.             }
  649.             if (testIntegralKolmogorovSmirnovStatistic(x, y, plus, minus)) {
  650.                 count++;
  651.             }
  652.         }
  653.         return count / (double) iterations;
  654.     }

  655.     /**
  656.      * Computes the magnitude of the one-sample Kolmogorov-Smirnov test statistic.
  657.      * The sign of the statistic is optionally returned in {@code sign}. For the two-sided case
  658.      * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
  659.      * had the larger magnitude.
  660.      *
  661.      * @param x Sample being evaluated.
  662.      * @param cdf Reference cumulative distribution function.
  663.      * @param sign Sign of the statistic (non-zero length).
  664.      * @return Kolmogorov-Smirnov statistic
  665.      * @throws IllegalArgumentException if {@code data} does not have length at least 2;
  666.      * or contains NaN values.
  667.      */
  668.     private double computeStatistic(double[] x, DoubleUnaryOperator cdf, int[] sign) {
  669.         final int n = checkArrayLength(x);
  670.         final double nd = n;
  671.         final double[] sx = sort(x.clone(), "Sample");
  672.         // Note: ties in the data do not matter as we compare the empirical CDF
  673.         // immediately before the value (i/n) and at the value (i+1)/n. For ties
  674.         // of length m this would be (i-m+1)/n and (i+1)/n and the result is the same.
  675.         double d = 0;
  676.         if (alternative == AlternativeHypothesis.GREATER_THAN) {
  677.             // Compute D+
  678.             for (int i = 0; i < n; i++) {
  679.                 final double yi = cdf.applyAsDouble(sx[i]);
  680.                 final double dp = (i + 1) / nd - yi;
  681.                 d = dp > d ? dp : d;
  682.             }
  683.             sign[0] = 1;
  684.         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
  685.             // Compute D-
  686.             for (int i = 0; i < n; i++) {
  687.                 final double yi = cdf.applyAsDouble(sx[i]);
  688.                 final double dn = yi - i / nd;
  689.                 d = dn > d ? dn : d;
  690.             }
  691.             sign[0] = -1;
  692.         } else {
  693.             // Two sided.
  694.             // Compute both (as unsigned) and return the sign indicating the largest result.
  695.             double plus = 0;
  696.             double minus = 0;
  697.             for (int i = 0; i < n; i++) {
  698.                 final double yi = cdf.applyAsDouble(sx[i]);
  699.                 final double dn = yi - i / nd;
  700.                 final double dp = (i + 1) / nd - yi;
  701.                 minus = dn > minus ? dn : minus;
  702.                 plus = dp > plus ? dp : plus;
  703.             }
  704.             sign[0] = Double.compare(plus, minus);
  705.             d = Math.max(plus, minus);
  706.         }
  707.         return d;
  708.     }

  709.     /**
  710.      * Computes the two-sample Kolmogorov-Smirnov test statistic. The statistic is integral
  711.      * and has a value in {@code [0, n*m]}. Not all values are possible; the smallest
  712.      * increment is the greatest common divisor of {@code n} and {@code m}.
  713.      *
  714.      * <p>This method will destructively modify the input arrays (via a sort).
  715.      *
  716.      * <p>The sign of the statistic is returned in {@code sign}. For the two-sided case
  717.      * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
  718.      * had the larger magnitude. If the two-sided statistic is zero the two arrays are
  719.      * identical, or are 'identical' data of a single value (sample sizes may be different),
  720.      * or have a sequence of ties of 'identical' data with a net zero effect on the D statistic
  721.      * e.g.
  722.      * <pre>
  723.      *  [1,2,3]           vs [1,2,3]
  724.      *  [0,0,0,0]         vs [0,0,0]
  725.      *  [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
  726.      * </pre>
  727.      *
  728.      * <p>This method detects ties in the input data. Not all ties will invalidate the returned
  729.      * statistic. Ties between the data can be interpreted as if the values were different
  730.      * but within machine epsilon. In this case the path through the tie region is not known.
  731.      * All paths through the tie region end at the same point. This method will compute the
  732.      * most extreme path through each tie region and the D statistic for these paths. If the
  733.      * ties D statistic is a larger magnitude than the returned D statistic then at least
  734.      * one tie region lies at a point on the full path that could result in a different
  735.      * statistic in the absence of ties. This signals the P-value computed using the returned
  736.      * D statistic is one of many possible p-values given the data and how ties are resolved.
  737.      * Note: The tiesD value may be smaller than the returned D statistic as it is not set
  738.      * to the maximum of D or tiesD. The only result of interest is when {@code tiesD > D}
  739.      * due to a tie region that can change the output D. On output {@code tiesD[0] != 0}
  740.      * if there were ties between samples and {@code tiesD[1] = D} of the most extreme path
  741.      * through the ties.
  742.      *
  743.      * @param x First sample (destructively modified by sort).
  744.      * @param y Second sample  (destructively modified by sort).
  745.      * @param sign Sign of the statistic (non-zero length).
  746.      * @param tiesD Integral statistic for the most extreme path through any ties (length at least 2).
  747.      * @return integral Kolmogorov-Smirnov statistic
  748.      * @throws IllegalArgumentException if either {@code x} or {@code y} contain NaN values.
  749.      */
  750.     private long computeIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, int[] sign, long[] tiesD) {
  751.         // Sort the sample arrays
  752.         sort(x, SAMPLE_1_NAME);
  753.         sort(y, SAMPLE_2_NAME);
  754.         final int n = x.length;
  755.         final int m = y.length;

  756.         // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
  757.         // Scale by n*m to use increments of m and n for x and y.
  758.         // Find the max difference between cdf_x and cdf_y.
  759.         int i = 0;
  760.         int j = 0;
  761.         long d = 0;
  762.         long plus = 0;
  763.         long minus = 0;
  764.         // Ties: store the D+,D- for most extreme path though tie region(s)
  765.         long tplus = 0;
  766.         long tminus = 0;
  767.         do {
  768.             // No NaNs so compare using < and >
  769.             if (x[i] < y[j]) {
  770.                 final double z = x[i];
  771.                 do {
  772.                     i++;
  773.                     d += m;
  774.                 } while (i < n && x[i] == z);
  775.                 plus = d > plus ? d : plus;
  776.             } else if (x[i] > y[j]) {
  777.                 final double z = y[j];
  778.                 do {
  779.                     j++;
  780.                     d -= n;
  781.                 } while (j < m && y[j] == z);
  782.                 minus = d < minus ? d : minus;
  783.             } else {
  784.                 // Traverse to the end of the tied section for d.
  785.                 // Also compute the most extreme path through the tied region.
  786.                 final double z = x[i];
  787.                 final long dd = d;
  788.                 int k = i;
  789.                 do {
  790.                     i++;
  791.                 } while (i < n && x[i] == z);
  792.                 k = i - k;
  793.                 d += k * (long) m;
  794.                 // Extreme D+ path
  795.                 tplus = d > tplus ? d : tplus;
  796.                 k = j;
  797.                 do {
  798.                     j++;
  799.                 } while (j < m && y[j] == z);
  800.                 k = j - k;
  801.                 d -= k * (long) n;
  802.                 // Extreme D- path must start at the original d
  803.                 tminus = Math.min(tminus, dd - k * (long) n);
  804.                 // End of tied section
  805.                 if (d > plus) {
  806.                     plus = d;
  807.                 } else if (d < minus) {
  808.                     minus = d;
  809.                 }
  810.             }
  811.         } while (i < n && j < m);
  812.         // The presence of any ties is flagged by a non-zero value for D+ or D-.
  813.         // Note we cannot use the selected tiesD value as in the one-sided case it may be zero
  814.         // and the non-selected D value will be non-zero.
  815.         tiesD[0] = tplus | tminus;
  816.         // For simplicity the correct tiesD is not returned (correct value is commented).
  817.         // The only case that matters is tiesD > D which is evaluated by the caller.
  818.         // Note however that the distance of tiesD < D is a measure of how little the
  819.         // tied region matters.
  820.         if (alternative == AlternativeHypothesis.GREATER_THAN) {
  821.             sign[0] = 1;
  822.             // correct = max(tplus, plus)
  823.             tiesD[1] = tplus;
  824.             return plus;
  825.         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
  826.             sign[0] = -1;
  827.             // correct = -min(tminus, minus)
  828.             tiesD[1] = -tminus;
  829.             return -minus;
  830.         } else {
  831.             // Two sided.
  832.             sign[0] = Double.compare(plus, -minus);
  833.             d = Math.max(plus, -minus);
  834.             // correct = max(d, max(tplus, -tminus))
  835.             tiesD[1] = Math.max(tplus, -tminus);
  836.             return d;
  837.         }
  838.     }

  839.     /**
  840.      * Test if the two-sample integral Kolmogorov-Smirnov statistic is strictly greater
  841.      * than the specified values for D+ and D-. Note that D- should have a negative sign
  842.      * to impose an inclusive lower bound.
  843.      *
  844.      * <p>This method will destructively modify the input arrays (via a sort).
  845.      *
  846.      * <p>For a two-sided alternative hypothesis {@code plus} and {@code minus} should have the
  847.      * same magnitude with opposite signs.
  848.      *
  849.      * <p>For a one-sided alternative hypothesis the value of {@code plus} or {@code minus}
  850.      * should have the expected value of the statistic, and the opposite D should have the maximum
  851.      * or minimum long value. The {@code minus} should be negatively signed:
  852.      *
  853.      * <ul>
  854.      * <li>greater: {@code plus} = D, {@code minus} = {@link Long#MIN_VALUE}
  855.      * <li>greater: {@code minus} = -D, {@code plus} = {@link Long#MAX_VALUE}
  856.      * </ul>
  857.      *
  858.      * <p>Note: This method has not been specialized for the one-sided case. Specialization
  859.      * would eliminate a conditional branch for {@code d > Long.MAX_VALUE} or
  860.      * {@code d < Long.MIN_VALUE}. Since these branches are never possible in the one-sided case
  861.      * this should be efficiently chosen by branch prediction in a processor pipeline.
  862.      *
  863.      * @param x First sample (destructively modified by sort; must not contain NaN).
  864.      * @param y Second sample (destructively modified by sort; must not contain NaN).
  865.      * @param plus Limit on D+ (inclusive upper bound).
  866.      * @param minus Limit on D- (inclusive lower bound).
  867.      * @return true if the D value exceeds the provided limits
  868.      */
  869.     private static boolean testIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, long plus, long minus) {
  870.         // Sort the sample arrays
  871.         Arrays.sort(x);
  872.         Arrays.sort(y);
  873.         final int n = x.length;
  874.         final int m = y.length;

  875.         // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
  876.         // Scale by n*m to use increments of m and n for x and y.
  877.         // Find the any difference that exceeds the specified bounds.
  878.         int i = 0;
  879.         int j = 0;
  880.         long d = 0;
  881.         do {
  882.             // No NaNs so compare using < and >
  883.             if (x[i] < y[j]) {
  884.                 final double z = x[i];
  885.                 do {
  886.                     i++;
  887.                     d += m;
  888.                 } while (i < n && x[i] == z);
  889.                 if (d > plus) {
  890.                     return true;
  891.                 }
  892.             } else if (x[i] > y[j]) {
  893.                 final double z = y[j];
  894.                 do {
  895.                     j++;
  896.                     d -= n;
  897.                 } while (j < m && y[j] == z);
  898.                 if (d < minus) {
  899.                     return true;
  900.                 }
  901.             } else {
  902.                 // Traverse to the end of the tied section for d.
  903.                 final double z = x[i];
  904.                 do {
  905.                     i++;
  906.                     d += m;
  907.                 } while (i < n && x[i] == z);
  908.                 do {
  909.                     j++;
  910.                     d -= n;
  911.                 } while (j < m && y[j] == z);
  912.                 // End of tied section
  913.                 if (d > plus || d < minus) {
  914.                     return true;
  915.                 }
  916.             }
  917.         } while (i < n && j < m);
  918.         // Note: Here d requires returning to zero. This is applicable to the one-sided
  919.         // cases since d may have always been above zero (favours D+) or always below zero
  920.         // (favours D-). This is ignored as the method is not called when dnm=0 is
  921.         // outside the inclusive bounds.
  922.         return false;
  923.     }

  924.     /**
  925.      * Creates a sampler to sample randomly from the combined distribution of the two samples.
  926.      *
  927.      * @param x First sample.
  928.      * @param y Second sample.
  929.      * @param rng Source of randomness.
  930.      * @return the sampler
  931.      */
  932.     private static DoubleSupplier createSampler(double[] x, double[] y,
  933.                                                 UniformRandomProvider rng) {
  934.         return createSampler(x, y, rng, MAX_ARRAY_SIZE);
  935.     }

  936.     /**
  937.      * Creates a sampler to sample randomly from the combined distribution of the two
  938.      * samples. This will copy the input data for the sampler.
  939.      *
  940.      * @param x First sample.
  941.      * @param y Second sample.
  942.      * @param rng Source of randomness.
  943.      * @param maxArraySize Maximum size of a single array.
  944.      * @return the sampler
  945.      */
  946.     static DoubleSupplier createSampler(double[] x, double[] y,
  947.                                         UniformRandomProvider rng,
  948.                                         int maxArraySize) {
  949.         final int n = x.length;
  950.         final int m = y.length;
  951.         final int len = n + m;
  952.         // Overflow safe: len > maxArraySize
  953.         if (len - maxArraySize > 0) {
  954.             // Support sampling with maximum length arrays
  955.             // (where a concatenated array is not possible)
  956.             // by choosing one or the other.
  957.             // - generate i in [-n, m)
  958.             // - return i < 0 ? x[n + i] : y[i]
  959.             // The sign condition is a 50-50 branch.
  960.             // Perform branchless by extracting the sign bit to pick the array.
  961.             // Copy the source data.
  962.             final double[] xx = x.clone();
  963.             final double[] yy = y.clone();
  964.             final IntToDoubleFunction nextX = i -> xx[n + i];
  965.             final IntToDoubleFunction nextY = i -> yy[i];
  966.             // Arrange function which accepts the negative index at position [1]
  967.             final IntToDoubleFunction[] next = {nextY, nextX};
  968.             return () -> {
  969.                 final int i = rng.nextInt(-n, m);
  970.                 return next[i >>> 31].applyAsDouble(i);
  971.             };
  972.         }
  973.         // Concatenate arrays
  974.         final double[] z = new double[len];
  975.         System.arraycopy(x, 0, z, 0, n);
  976.         System.arraycopy(y, 0, z, n, m);
  977.         return () -> z[rng.nextInt(len)];
  978.     }

  979.     /**
  980.      * Compute the D statistic from the integral D value.
  981.      *
  982.      * @param dnm Integral D-statistic value (in [0, n*m]).
  983.      * @param n First sample size.
  984.      * @param m Second sample size.
  985.      * @param gcd Greatest common divisor of n and m.
  986.      * @return D-statistic value (in [0, 1]).
  987.      */
  988.     private static double computeD(long dnm, int n, int m, int gcd) {
  989.         // Note: Integer division using the gcd is intentional
  990.         final long a = dnm / gcd;
  991.         final int b = m / gcd;
  992.         return a / ((double) n * b);
  993.     }

  994.     /**
  995.      * Computes \(P(D_{n,m} &gt; d)\) for the 2-sample Kolmogorov-Smirnov statistic.
  996.      *
  997.      * @param dnm Integral D-statistic value (in [0, n*m]).
  998.      * @param n First sample size.
  999.      * @param m Second sample size.
  1000.      * @param gcd Greatest common divisor of n and m.
  1001.      * @param d D-statistic value (in [0, 1]).
  1002.      * @param exact whether to compute the exact probability; otherwise approximate.
  1003.      * @return probability
  1004.      * @see #twoSampleExactP(long, int, int, int, boolean, boolean)
  1005.      * @see #twoSampleApproximateP(double, int, int, boolean)
  1006.      */
  1007.     private double twoSampleP(long dnm, int n, int m, int gcd, double d, boolean exact) {
  1008.         // Exact computation returns -1 if it cannot compute.
  1009.         double p = -1;
  1010.         if (exact) {
  1011.             p = twoSampleExactP(dnm, n, m, gcd, strictInequality, alternative == AlternativeHypothesis.TWO_SIDED);
  1012.         }
  1013.         if (p < 0) {
  1014.             p = twoSampleApproximateP(d, n, m, alternative == AlternativeHypothesis.TWO_SIDED);
  1015.         }
  1016.         return p;
  1017.     }

  1018.     /**
  1019.      * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
  1020.      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic, either the two-sided
  1021.      * \(D_{n,m}\) or one-sided \(D_{n,m}^+\}. See
  1022.      * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
  1023.      *
  1024.      * <p>The returned probability is exact. If the value cannot be computed this returns -1.
  1025.      *
  1026.      * <p>Note: This requires the greatest common divisor of n and m. The integral D statistic
  1027.      * in the range [0, n*m] is separated by increments of the gcd. The method will only
  1028.      * compute p-values for valid values of D by calculating for D/gcd.
  1029.      * Strict inquality is performed using the next valid value for D.
  1030.      *
  1031.      * @param dnm Integral D-statistic value (in [0, n*m]).
  1032.      * @param n First sample size.
  1033.      * @param m Second sample size.
  1034.      * @param gcd Greatest common divisor of n and m.
  1035.      * @param strict whether or not the probability to compute is expressed as a strict inequality.
  1036.      * @param twoSided whether D refers to D or D+.
  1037.      * @return probability that a randomly selected m-n partition of m + n generates D
  1038.      *         greater than (resp. greater than or equal to) {@code d} (or -1)
  1039.      */
  1040.     static double twoSampleExactP(long dnm, int n, int m, int gcd, boolean strict, boolean twoSided) {
  1041.         // Create the statistic in [0, lcm]
  1042.         // For strict inequality D > d the result is the same if we compute for D >= (d+1)
  1043.         final long d = dnm / gcd + (strict ? 1 : 0);

  1044.         // P-value methods compute for d <= lcm (least common multiple)
  1045.         final long lcm = (long) n * (m / gcd);
  1046.         if (d > lcm) {
  1047.             return 0;
  1048.         }

  1049.         // Note: Some methods require m >= n, others n >= m
  1050.         final int a = Math.min(n, m);
  1051.         final int b = Math.max(n, m);

  1052.         if (twoSided) {
  1053.             // Any two-sided statistic dnm cannot be less than min(n, m) in the absence of ties.
  1054.             if (d * gcd <= a) {
  1055.                 return 1;
  1056.             }
  1057.             // Here d in [2, lcm]
  1058.             if (n == m) {
  1059.                 return twoSampleTwoSidedPOutsideSquare(d, n);
  1060.             }
  1061.             return twoSampleTwoSidedPStabilizedInner(d, b, a, gcd);
  1062.         }
  1063.         // Any one-sided statistic cannot be less than 0
  1064.         if (d <= 0) {
  1065.             return 1;
  1066.         }
  1067.         // Here d in [1, lcm]
  1068.         if (n == m) {
  1069.             return twoSampleOneSidedPOutsideSquare(d, n);
  1070.         }
  1071.         return twoSampleOneSidedPOutside(d, a, b, gcd);
  1072.     }

  1073.     /**
  1074.      * Computes \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
  1075.      *
  1076.      * <p>The returned probability is exact, implemented using the stabilized inner method
  1077.      * presented in Viehmann (2021).
  1078.      *
  1079.      * <p>This is optimized for {@code m <= n}. If {@code m > n} and index-out-of-bounds
  1080.      * exception can occur.
  1081.      *
  1082.      * @param d Integral D-statistic value (in [2, lcm])
  1083.      * @param n Larger sample size.
  1084.      * @param m Smaller sample size.
  1085.      * @param gcd Greatest common divisor of n and m.
  1086.      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
  1087.      *         greater than or equal to {@code d}
  1088.      */
  1089.     private static double twoSampleTwoSidedPStabilizedInner(long d, int n, int m, int gcd) {
  1090.         // Check the computation is possible.
  1091.         // Note that the possible paths is binom(m+n, n).
  1092.         // However the computation is stable above this limit.
  1093.         // Possible d values (requiring a unique p-value) = max(dnm) / gcd = lcm(n, m).
  1094.         // Possible terms to compute <= n * size(cij)
  1095.         // Simple limit based on the number of possible different p-values
  1096.         if ((long) n * (m / gcd) > MAX_LCM_TWO_SAMPLE_EXACT_P) {
  1097.             return -1;
  1098.         }

  1099.         // This could be updated to use d in [1, lcm].
  1100.         // Currently it uses d in [gcd, n*m].
  1101.         // Largest intermediate value is (dnm + im + n) which is within 2^63
  1102.         // if n and m are 2^31-1, i = n, dnm = n*m: (2^31-1)^2 + (2^31-1)^2 + 2^31-1 < 2^63
  1103.         final long dnm = d * gcd;

  1104.         // Viehmann (2021): Updated for i in [0, n], j in [0, m]
  1105.         // C_i,j = 1                                      if |i/n - j/m| >= d
  1106.         //       = 0                                      if |i/n - j/m| < d and (i=0 or j=0)
  1107.         //       = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)  otherwise
  1108.         // P2 = C_n,m
  1109.         // Note: The python listing in Viehmann used d in [0, 1]. This uses dnm in [0, nm]
  1110.         // so updates the scaling to compute the ranges. Also note that the listing uses
  1111.         // dist > d or dist < -d where this uses |dist| >= d to compute P(D >= d) (non-strict inequality).
  1112.         // The provided listing is explicit in the values for each j in the range.
  1113.         // It can be optimized given the known start and end j for each iteration as only
  1114.         // j where |i/n - j/m| < d must be processed:
  1115.         // startJ where: im - jn < dnm : jn > im - dnm
  1116.         // j = floor((im - dnm) / n) + 1      in [0, m]
  1117.         // endJ where: jn - im >= dnm
  1118.         // j = ceil((dnm + im) / n)           in [0, m+1]

  1119.         // First iteration with i = 0
  1120.         // j = ceil(dnm / n)
  1121.         int endJ = Math.min(m + 1, (int) ((dnm + n - 1) / n));

  1122.         // Only require 1 array to store C_i-1,j as the startJ only ever increases
  1123.         // and we update lower indices using higher ones.
  1124.         // The maximum value *written* is j=m or less using j/m <= 2*d : j = ceil(2*d*m)
  1125.         // Viehmann uses: size = int(2*m*d + 2) == ceil(2*d*m) + 1
  1126.         // The maximum value *read* is j/m <= 2*d. This may be above m. This occurs when
  1127.         // j - lastStartJ > m and C_i-1,j = 1. This can be avoided if (startJ - lastStartJ) <= 1
  1128.         // which occurs if m <= n, i.e. the window only slides 0 or 1 in j for each increment i
  1129.         // and we can maintain Cij as 1 larger than ceil(2*d*m) + 1.
  1130.         final double[] cij = new double[Math.min(m + 1, 2 * endJ + 2)];

  1131.         // Each iteration fills C_i,j with values and the remaining values are
  1132.         // kept as 1 for |i/n - j/m| >= d
  1133.         //assert (endJ - 1) * (long) n < dnm : "jn >= dnm for j < endJ";
  1134.         for (int j = endJ; j < cij.length; j++) {
  1135.             //assert j * (long) n >= dnm : "jn < dnm for j >= endJ";
  1136.             cij[j] = 1;
  1137.         }

  1138.         int startJ = 0;
  1139.         int length = endJ;
  1140.         double val = -1;
  1141.         long im = 0;
  1142.         for (int i = 1; i <= n; i++) {
  1143.             im += m;
  1144.             final int lastStartJ = startJ;

  1145.             // Compute C_i,j for startJ <= j < endJ
  1146.             // startJ = floor((im - dnm) / n) + 1      in [0, m]
  1147.             // endJ   = ceil((dnm + im) / n)           in [0, m+1]
  1148.             startJ = im < dnm ? 0 : Math.min(m, (int) ((im - dnm) / n) + 1);
  1149.             endJ = Math.min(m + 1, (int) ((dnm + im + n - 1) / n));

  1150.             if (startJ >= endJ) {
  1151.                 // No possible paths inside the boundary
  1152.                 return 1;
  1153.             }

  1154.             //assert startJ - lastStartJ <= 1 : "startJ - lastStartJ > 1";

  1155.             // Initialize previous value C_i,j-1
  1156.             val = startJ == 0 ? 0 : 1;

  1157.             //assert startJ == 0 || Math.abs(im - (startJ - 1) * (long) n) >= dnm : "|im - jn| < dnm for j < startJ";
  1158.             //assert endJ > m || Math.abs(im - endJ * (long) n) >= dnm : "|im - jn| < dnm for j >= endJ";
  1159.             for (int j = startJ; j < endJ; j++) {
  1160.                 //assert j == 0 || Math.abs(im - j * (long) n) < dnm : "|im - jn| >= dnm for startJ <= j < endJ";
  1161.                 // C_i,j = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)
  1162.                 // Note: if (j - lastStartJ) >= cij.length this creates an IOOB exception.
  1163.                 // In this case cij[j - lastStartJ] == 1. Only happens when m >= n.
  1164.                 // Fixed using c_i-1,j = (j - lastStartJ >= cij.length ? 1 : cij[j - lastStartJ]
  1165.                 val = (cij[j - lastStartJ] * i + val * j) / ((double) i + j);
  1166.                 cij[j - startJ] = val;
  1167.             }

  1168.             // Must keep the remaining values in C_i,j as 1 to allow
  1169.             // cij[j - lastStartJ] * i == i when (j - lastStartJ) > lastLength
  1170.             final int lastLength = length;
  1171.             length = endJ - startJ;
  1172.             for (int j = lastLength - length - 1; j >= 0; j--) {
  1173.                 cij[length + j] = 1;
  1174.             }
  1175.         }
  1176.         // Return the most recently written value: C_n,m
  1177.         return val;
  1178.     }

  1179.     /**
  1180.      * Computes \(P(D_{n,m}^+ \ge d)\), where \(D_{n,m}^+\) is the 2-sample one-sided
  1181.      * Kolmogorov-Smirnov statistic.
  1182.      *
  1183.      * <p>The returned probability is exact, implemented using the outer method
  1184.      * presented in Hodges (1958).
  1185.      *
  1186.      * <p>This method will fail-fast and return -1 if the computation of the
  1187.      * numbers of paths overflows.
  1188.      *
  1189.      * @param d Integral D-statistic value (in [0, lcm])
  1190.      * @param n First sample size.
  1191.      * @param m Second sample size.
  1192.      * @param gcd Greatest common divisor of n and m.
  1193.      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
  1194.      *         greater than or equal to {@code d}
  1195.      */
  1196.     private static double twoSampleOneSidedPOutside(long d, int n, int m, int gcd) {
  1197.         // Hodges, Fig.2
  1198.         // Lower boundary: (nx - my)/nm >= d : (nx - my) >= dnm
  1199.         // B(x, y) is the number of ways from (0, 0) to (x, y) without previously
  1200.         // reaching the boundary.
  1201.         // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
  1202.         // Total paths:
  1203.         // sum_y { B(x, y) binom(m+n-x-y, n-y) }

  1204.         // Normalized by binom(m+n, n). Check this is possible.
  1205.         final long lm = m;
  1206.         if (n + lm > Integer.MAX_VALUE) {
  1207.             return -1;
  1208.         }
  1209.         final double binom = binom(m + n, n);
  1210.         if (binom == Double.POSITIVE_INFINITY) {
  1211.             return -1;
  1212.         }

  1213.         // This could be updated to use d in [1, lcm].
  1214.         // Currently it uses d in [gcd, n*m].
  1215.         final long dnm = d * gcd;

  1216.         // Visit all x in [0, m] where (nx - my) >= d for each increasing y in [0, n].
  1217.         // x = ceil( (d + my) / n ) = (d + my + n - 1) / n
  1218.         // y = ceil( (nx - d) / m ) = (nx - d + m - 1) / m
  1219.         // Note: n m integer, d in [0, nm], the intermediate cannot overflow a long.
  1220.         // x | y=0 = (d + n - 1) / n
  1221.         final int x0 = (int) ((dnm + n - 1) / n);
  1222.         if (x0 >= m) {
  1223.             return 1 / binom;
  1224.         }
  1225.         // The y above is the y *on* the boundary. Set the limit as the next y above:
  1226.         // y | x=m = 1 + floor( (nx - d) / m ) = 1 + (nm - d) / m
  1227.         final int maxy = (int) ((n * lm - dnm + m) / m);
  1228.         // Compute x and B(x, y) for visited B(x,y)
  1229.         final int[] xy = new int[maxy];
  1230.         final double[] bxy = new double[maxy];
  1231.         xy[0] = x0;
  1232.         bxy[0] = 1;
  1233.         for (int y = 1; y < maxy; y++) {
  1234.             final int x = (int) ((dnm + lm * y + n - 1) / n);
  1235.             // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
  1236.             // Add the terms to subtract as a negative sum.
  1237.             final Sum b = Sum.create();
  1238.             for (int yy = 0; yy < y; yy++) {
  1239.                 // Here: previousX = x - xy[yy] : previousY = y - yy
  1240.                 // bxy[yy] is the paths to (previousX, previousY)
  1241.                 // binom represent the paths from (previousX, previousY) to (x, y)
  1242.                 b.addProduct(bxy[yy], -binom(x - xy[yy] + y - yy, y - yy));
  1243.             }
  1244.             b.add(binom(x + y, y));
  1245.             xy[y] = x;
  1246.             bxy[y] = b.getAsDouble();
  1247.         }
  1248.         // sum_y { B(x, y) binom(m+n-x-y, n-y) }
  1249.         final Sum sum = Sum.create();
  1250.         for (int y = 0; y < maxy; y++) {
  1251.             sum.addProduct(bxy[y], binom(m + n - xy[y] - y, n - y));
  1252.         }
  1253.         // No individual term should have overflowed since binom is finite.
  1254.         // Any sum above 1 is floating-point error.
  1255.         return KolmogorovSmirnovDistribution.clipProbability(sum.getAsDouble() / binom);
  1256.     }

  1257.     /**
  1258.      * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample one-sided
  1259.      * Kolmogorov-Smirnov statistic.
  1260.      *
  1261.      * <p>The returned probability is exact, implemented using the outer method
  1262.      * presented in Hodges (1958).
  1263.      *
  1264.      * @param d Integral D-statistic value (in [1, lcm])
  1265.      * @param n Sample size.
  1266.      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
  1267.      *         greater than or equal to {@code d}
  1268.      */
  1269.     private static double twoSampleOneSidedPOutsideSquare(long d, int n) {
  1270.         // Hodges (1958) Eq. 2.3:
  1271.         // p = binom(2n, n-a) / binom(2n, n)
  1272.         // a in [1, n] == d * n == dnm / n
  1273.         final int a = (int) d;

  1274.         // Rearrange:
  1275.         // p = ( 2n! / ((n-a)! (n+a)!) ) / ( 2n! / (n! n!) )
  1276.         //   = n! n! / ( (n-a)! (n+a)! )
  1277.         // Perform using pre-computed factorials if possible.
  1278.         if (n + a <= MAX_FACTORIAL) {
  1279.             final double x = Factorial.doubleValue(n);
  1280.             final double y = Factorial.doubleValue(n - a);
  1281.             final double z = Factorial.doubleValue(n + a);
  1282.             return (x / y) * (x / z);
  1283.         }
  1284.         // p = n! / (n-a)!  *  n! / (n+a)!
  1285.         //       n * (n-1) * ... * (n-a+1)
  1286.         //   = -----------------------------
  1287.         //     (n+a) * (n+a-1) * ... * (n+1)

  1288.         double p = 1;
  1289.         for (int i = 0; i < a && p != 0; i++) {
  1290.             p *= (n - i) / (1.0 + n + i);
  1291.         }
  1292.         return p;
  1293.     }

  1294.     /**
  1295.      * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample two-sided
  1296.      * Kolmogorov-Smirnov statistic.
  1297.      *
  1298.      * <p>The returned probability is exact, implemented using the outer method
  1299.      * presented in Hodges (1958).
  1300.      *
  1301.      * @param d Integral D-statistic value (in [1, n])
  1302.      * @param n Sample size.
  1303.      * @return probability that a randomly selected m-n partition of n + n generates \(D_{n,n}\)
  1304.      *         greater than or equal to {@code d}
  1305.      */
  1306.     private static double twoSampleTwoSidedPOutsideSquare(long d, int n) {
  1307.         // Hodges (1958) Eq. 2.4:
  1308.         // p = 2 [ binom(2n, n-a) - binom(2n, n-2a) + binom(2n, n-3a) - ... ] / binom(2n, n)
  1309.         // a in [1, n] == d * n == dnm / n

  1310.         // As per twoSampleOneSidedPOutsideSquare, divide by binom(2n, n) and each term
  1311.         // can be expressed as a product:
  1312.         //         (             n - i                    n - i                   n - i         )
  1313.         // p = 2 * ( prod_i=0^a --------- - prod_i=0^2a --------- + prod_i=0^3a --------- + ... )
  1314.         //         (           1 + n + i                1 + n + i               1 + n + i       )
  1315.         // for ja in [1, ..., n/a]
  1316.         // Avoid repeat computation of terms by extracting common products:
  1317.         // p = 2 * ( p0a * (1 - p1a * (1 - p2a * (1 - ... ))) )
  1318.         // where each term pja is prod_i={ja}^{ja+a} for all j in [1, n / a]

  1319.         // The first term is the one-sided p.
  1320.         final double p0a = twoSampleOneSidedPOutsideSquare(d, n);
  1321.         if (p0a == 0) {
  1322.             // Underflow - nothing more to do
  1323.             return 0;
  1324.         }
  1325.         // Compute the inner-terms from small to big.
  1326.         // j = n / (d/n) ~ n*n / d
  1327.         // j is a measure of how extreme the d value is (small j is extreme d).
  1328.         // When j is above 0 a path may traverse from the lower boundary to the upper boundary.
  1329.         final int a = (int) d;
  1330.         double p = 0;
  1331.         for (int j = n / a; j > 0; j--) {
  1332.             double pja = 1;
  1333.             final int jaa = j * a + a;
  1334.             // Since p0a did not underflow we avoid the check for pj != 0
  1335.             for (int i = j * a; i < jaa; i++) {
  1336.                 pja *= (n - i) / (1.0 + n + i);
  1337.             }
  1338.             p = pja * (1 - p);
  1339.         }
  1340.         p = p0a * (1 - p);
  1341.         return Math.min(1, 2 * p);
  1342.     }

  1343.     /**
  1344.      * Compute the binomial coefficient binom(n, k).
  1345.      *
  1346.      * @param n N.
  1347.      * @param k K.
  1348.      * @return binom(n, k)
  1349.      */
  1350.     private static double binom(int n, int k) {
  1351.         return BinomialCoefficientDouble.value(n, k);
  1352.     }

  1353.     /**
  1354.      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
  1355.      * is the 2-sample Kolmogorov-Smirnov statistic. See
  1356.      * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
  1357.      *
  1358.      * <p>Specifically, what is returned is \(1 - CDF(d, \sqrt{mn / (m + n)})\) where CDF
  1359.      * is the cumulative density function of the two-sided one-sample Kolmogorov-Smirnov
  1360.      * distribution.
  1361.      *
  1362.      * @param d D-statistic value.
  1363.      * @param n First sample size.
  1364.      * @param m Second sample size.
  1365.      * @param twoSided True to compute the two-sided p-value; else one-sided.
  1366.      * @return approximate probability that a randomly selected m-n partition of m + n generates
  1367.      *         \(D_{n,m}\) greater than {@code d}
  1368.      */
  1369.     static double twoSampleApproximateP(double d, int n, int m, boolean twoSided) {
  1370.         final double nn = Math.min(n, m);
  1371.         final double mm = Math.max(n, m);
  1372.         if (twoSided) {
  1373.             // Smirnov's asymptotic formula:
  1374.             // P(sqrt(N) D_n > x)
  1375.             // N = m*n/(m+n)
  1376.             return KolmogorovSmirnovDistribution.Two.sf(d, (int) Math.round(mm * nn / (mm + nn)));
  1377.         }
  1378.         // one-sided
  1379.         // Use Hodges Eq 5.3. Requires m >= n
  1380.         // Correct for m=n, m an integral multiple of n, and 'on the average' for m nearly equal to n
  1381.         final double z = d * Math.sqrt(nn * mm / (nn + mm));
  1382.         return Math.exp(-2 * z * z - 2 * z * (mm + 2 * nn) / Math.sqrt(mm * nn * (mm + nn)) / 3);
  1383.     }

  1384.     /**
  1385.      * Verifies that {@code array} has length at least 2.
  1386.      *
  1387.      * @param array Array to test.
  1388.      * @return the length
  1389.      * @throws IllegalArgumentException if array is too short
  1390.      */
  1391.     private static int checkArrayLength(double[] array) {
  1392.         final int n = array.length;
  1393.         if (n <= 1) {
  1394.             throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
  1395.         }
  1396.         return n;
  1397.     }

  1398.     /**
  1399.      * Sort the input array. Throws an exception if NaN values are
  1400.      * present. It is assumed the array is non-zero length.
  1401.      *
  1402.      * @param x Input array.
  1403.      * @param name Name of the array.
  1404.      * @return a reference to the input (sorted) array
  1405.      * @throws IllegalArgumentException if {@code x} contains NaN values.
  1406.      */
  1407.     private static double[] sort(double[] x, String name) {
  1408.         Arrays.sort(x);
  1409.         // NaN will be at the end
  1410.         if (Double.isNaN(x[x.length - 1])) {
  1411.             throw new InferenceException(name + " contains NaN");
  1412.         }
  1413.         return x;
  1414.     }
  1415. }