View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.statistics.inference;
19  
20  import java.util.Arrays;
21  import java.util.Objects;
22  import java.util.function.DoubleSupplier;
23  import java.util.function.DoubleUnaryOperator;
24  import java.util.function.IntToDoubleFunction;
25  import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
26  import org.apache.commons.numbers.combinatorics.Factorial;
27  import org.apache.commons.numbers.core.ArithmeticUtils;
28  import org.apache.commons.numbers.core.Sum;
29  import org.apache.commons.rng.UniformRandomProvider;
30  
31  /**
32   * Implements the Kolmogorov-Smirnov (K-S) test for equality of continuous distributions.
33   *
34   * <p>The one-sample test uses a D statistic based on the maximum deviation of the empirical
35   * distribution of sample data points from the distribution expected under the null hypothesis.
36   *
37   * <p>The two-sample test uses a D statistic based on the maximum deviation of the two empirical
38   * distributions of sample data points. The two-sample tests evaluate the null hypothesis that
39   * the two samples {@code x} and {@code y} come from the same underlying distribution.
40   *
41   * <p>References:
42   * <ol>
43   * <li>
44   * Marsaglia, G., Tsang, W. W., &amp; Wang, J. (2003).
45   * <a href="https://doi.org/10.18637/jss.v008.i18">Evaluating Kolmogorov's Distribution.</a>
46   * Journal of Statistical Software, 8(18), 1–4.
47   * <li>Simard, R., &amp; L’Ecuyer, P. (2011).
48   * <a href="https://doi.org/10.18637/jss.v039.i11">Computing the Two-Sided Kolmogorov-Smirnov Distribution.</a>
49   * Journal of Statistical Software, 39(11), 1–18.
50   * <li>Sekhon, J. S. (2011).
51   * <a href="https://doi.org/10.18637/jss.v042.i07">
52   * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
53   * The Matching package for R.</a>
54   * Journal of Statistical Software, 42(7), 1–52.
55   * <li>Viehmann, T (2021).
56   * <a href="https://doi.org/10.48550/arXiv.2102.08037">
57   * Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.</a>
58   * arXiv:2102.08037
59   * <li>Hodges, J. L. (1958).
60   * <a href="https://doi.org/10.1007/BF02589501">
61   * The significance probability of the smirnov two-sample test.</a>
62   * Arkiv for Matematik, 3(5), 469-486.
63   * </ol>
64   *
65   * <p>Note that [1] contains an error in computing h, refer to <a
66   * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
67   *
68   * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
69   * Kolmogorov-Smirnov (K-S) test (Wikipedia)</a>
70   * @since 1.1
71   */
72  public final class KolmogorovSmirnovTest {
73      /** Name for sample 1. */
74      private static final String SAMPLE_1_NAME = "Sample 1";
75      /** Name for sample 2. */
76      private static final String SAMPLE_2_NAME = "Sample 2";
77      /** When the largest sample size exceeds this value, 2-sample test AUTO p-value
78       * uses an asymptotic distribution to compute the p-value. */
79      private static final int LARGE_SAMPLE = 10000;
80      /** Maximum finite factorial. */
81      private static final int MAX_FACTORIAL = 170;
82      /** Maximum length of an array. This is used to determine if two arrays can be concatenated
83       * to create a sampler from the joint distribution. The limit is copied from the limit
84       * of java.util.ArrayList. */
85      private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
86      /** The maximum least common multiple (lcm) to attempt the exact p-value computation.
87       * The integral d value is in [0, n*m] in steps of the greatest common denominator (gcd),
88       * thus lcm = n*m/gcd is the number of possible different p-values.
89       * Some methods have a lower limit due to computation limits. This should be larger
90       * than LARGE_SAMPLE^2 so all AUTO p-values attempt an exact computation, i.e.
91       * at least 10000^2 ~ 2^26.56. */
92      private static final long MAX_LCM_TWO_SAMPLE_EXACT_P = 1L << 31;
93      /** Placeholder to use for the two-sample sign array when the value can be ignored. */
94      private static final int[] IGNORED_SIGN = new int[1];
95      /** Placeholder to use for the two-sample ties D array when the value can be ignored. */
96      private static final long[] IGNORED_D = new long[2];
97      /** Default instance. */
98      private static final KolmogorovSmirnovTest DEFAULT = new KolmogorovSmirnovTest(
99          AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, false, null, 1000);
100 
101     /** Alternative hypothesis. */
102     private final AlternativeHypothesis alternative;
103     /** Method to compute the p-value. */
104     private final PValueMethod pValueMethod;
105     /** Use a strict inequality for the two-sample exact p-value. */
106     private final boolean strictInequality;
107     /** Source of randomness. */
108     private final UniformRandomProvider rng;
109     /** Number of iterations . */
110     private final int iterations;
111 
112     /**
113      * Result for the one-sample Kolmogorov-Smirnov test.
114      *
115      * <p>This class is immutable.
116      *
117      * @since 1.1
118      */
119     public static class OneResult extends BaseSignificanceResult {
120         /** Sign of the statistic. */
121         private final int sign;
122 
123         /**
124          * Create an instance.
125          *
126          * @param statistic Test statistic.
127          * @param sign Sign of the statistic.
128          * @param p Result p-value.
129          */
130         OneResult(double statistic, int sign, double p) {
131             super(statistic, p);
132             this.sign = sign;
133         }
134 
135         /**
136          * Gets the sign of the statistic. This is 1 for \(D^+\) and -1 for \(D^-\).
137          * For a two sided-test this is zero if the magnitude of \(D^+\) and \(D^-\) was equal;
138          * otherwise the sign indicates the larger of \(D^+\) or \(D^-\).
139          *
140          * @return the sign
141          */
142         public int getSign() {
143             return sign;
144         }
145     }
146 
147     /**
148      * Result for the two-sample Kolmogorov-Smirnov test.
149      *
150      * <p>This class is immutable.
151      *
152      * @since 1.1
153      */
154     public static final class TwoResult extends OneResult {
155         /** Flag to indicate there were significant ties.
156          * Note that in extreme cases there may be significant ties despite {@code upperD == D}
157          * due to rounding when converting the integral statistic to a double. For this
158          * reason the presence of ties is stored as a flag. */
159         private final boolean significantTies;
160         /** Upper bound of the D statistic from all possible paths through regions with ties. */
161         private final double upperD;
162         /** The p-value of the upper D value. */
163         private final double upperP;
164 
165         /**
166          * Create an instance.
167          *
168          * @param statistic Test statistic.
169          * @param sign Sign of the statistic.
170          * @param p Result p-value.
171          * @param significantTies Flag to indicate there were significant ties.
172          * @param upperD Upper bound of the D statistic from all possible paths through
173          * regions with ties.
174          * @param upperP The p-value of the upper D value.
175          */
176         TwoResult(double statistic, int sign, double p, boolean significantTies, double upperD, double upperP) {
177             super(statistic, sign, p);
178             this.significantTies = significantTies;
179             this.upperD = upperD;
180             this.upperP = upperP;
181         }
182 
183         /**
184          * {@inheritDoc}
185          *
186          * <p><strong>Ties</strong>
187          *
188          * <p>The presence of ties in the data creates a distribution for the D values generated
189          * by all possible orderings of the tied regions. This statistic is computed using the
190          * path with the <em>minimum</em> effect on the D statistic.
191          *
192          * <p>For a one-sided statistic \(D^+\) or \(D^-\), this is the lower bound of the D statistic.
193          *
194          * <p>For a two-sided statistic D, this may be <em>below</em> the lower bound of the
195          * distribution of all possible D values. This case occurs when the number of ties
196          * is very high and is identified by a {@link #getPValue() p-value} of 1.
197          *
198          * <p>If the two-sided statistic is zero this only occurs in the presence of ties:
199          * either the two arrays are identical, are 'identical' data of a single value
200          * (sample sizes may be different), or have a sequence of ties of 'identical' data
201          * with a net zero effect on the D statistic, e.g.
202          * <pre>
203          *  [1,2,3]           vs [1,2,3]
204          *  [0,0,0,0]         vs [0,0,0]
205          *  [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
206          * </pre>
207          */
208         @Override
209         public double getStatistic() {
210             // Note: This method is here for documentation
211             return super.getStatistic();
212         }
213 
214         /**
215          * Returns {@code true} if there were ties between samples that occurred
216          * in a region which could change the D statistic if the ties were resolved to
217          * a defined order.
218          *
219          * <p>Ties between the data can be interpreted as if the values were different
220          * but within machine epsilon. In this case the order within the tie region is not known.
221          * If the most extreme ordering of any tied regions (e.g. all tied values of {@code x}
222          * before all tied values of {@code y}) could create a larger D statistic this
223          * method will return {@code true}.
224          *
225          * <p>If there were no ties, or all possible orderings of tied regions create the same
226          * D statistic, this method returns {@code false}.
227          *
228          * <p>Note it is possible that this method returns {@code true} when {@code D == upperD}
229          * due to rounding when converting the computed D statistic to a double. This will
230          * only occur for large sample sizes {@code n} and {@code m} where the product
231          * {@code n*m >= 2^53}.
232          *
233          * @return true if the D statistic could be changed by resolution of ties
234          * @see #getUpperD()
235          */
236         public boolean hasSignificantTies() {
237             return significantTies;
238         }
239 
240         /**
241          * Return the upper bound of the D statistic from all possible paths through regions with ties.
242          *
243          * <p>This will return a value equal to or greater than {@link #getStatistic()}.
244          *
245          * @return the upper bound of D
246          * @see #hasSignificantTies()
247          */
248         public double getUpperD() {
249             return upperD;
250         }
251 
252         /**
253          * Return the p-value of the upper bound of the D statistic.
254          *
255          * <p>If computed, this will return a value equal to or less than
256          * {@link #getPValue() getPValue}. It may be orders of magnitude smaller.
257          *
258          * <p>Note: This p-value corresponds to the most extreme p-value from all possible
259          * orderings of tied regions. It is <strong>not</strong> recommended to use this to
260          * reject the null hypothesis. The upper bound of D and the corresponding p-value
261          * provide information that must be interpreted in the context of the sample data and
262          * used to inform a decision on the suitability of the test to the data.
263          *
264          * <p>This value is set to {@link Double#NaN NaN} if the {@link #getPValue() p-value} was
265          * {@linkplain PValueMethod#ESTIMATE estimated}. The estimated p-value will have been created
266          * using a distribution of possible D values given the underlying joint distribution of
267          * the sample data. Comparison of the p-value to the upper p-value is not applicable.
268          *
269          * @return the p-value of the upper bound of D (or NaN)
270          * @see #getUpperD()
271          */
272         public double getUpperPValue() {
273             return upperP;
274         }
275     }
276 
277     /**
278      * @param alternative Alternative hypothesis.
279      * @param method P-value method.
280      * @param strict true to use a strict inequality.
281      * @param rng Source of randomness.
282      * @param iterations Number of iterations.
283      */
284     private KolmogorovSmirnovTest(AlternativeHypothesis alternative, PValueMethod method, boolean strict,
285         UniformRandomProvider rng, int iterations) {
286         this.alternative = alternative;
287         this.pValueMethod = method;
288         this.strictInequality = strict;
289         this.rng = rng;
290         this.iterations = iterations;
291     }
292 
293     /**
294      * Return an instance using the default options.
295      *
296      * <ul>
297      * <li>{@link AlternativeHypothesis#TWO_SIDED}
298      * <li>{@link PValueMethod#AUTO}
299      * <li>{@link Inequality#NON_STRICT}
300      * <li>{@linkplain #with(UniformRandomProvider) RNG = none}
301      * <li>{@linkplain #withIterations(int) Iterations = 1000}
302      * </ul>
303      *
304      * @return default instance
305      */
306     public static KolmogorovSmirnovTest withDefaults() {
307         return DEFAULT;
308     }
309 
310     /**
311      * Return an instance with the configured alternative hypothesis.
312      *
313      * @param v Value.
314      * @return an instance
315      */
316     public KolmogorovSmirnovTest with(AlternativeHypothesis v) {
317         return new KolmogorovSmirnovTest(Objects.requireNonNull(v), pValueMethod, strictInequality, rng, iterations);
318     }
319 
320     /**
321      * Return an instance with the configured p-value method.
322      *
323      * <p>For the one-sample two-sided test Kolmogorov's asymptotic approximation can be used;
324      * otherwise the p-value uses the distribution of the D statistic.
325      *
326      * <p>For the two-sample test the exact p-value can be computed for small sample sizes;
327      * otherwise the p-value resorts to the asymptotic approximation. Alternatively a p-value
328      * can be estimated from the combined distribution of the samples. This requires a source
329      * of randomness.
330      *
331      * @param v Value.
332      * @return an instance
333      * @see #with(UniformRandomProvider)
334      */
335     public KolmogorovSmirnovTest with(PValueMethod v) {
336         return new KolmogorovSmirnovTest(alternative, Objects.requireNonNull(v), strictInequality, rng, iterations);
337     }
338 
339     /**
340      * Return an instance with the configured inequality.
341      *
342      * <p>Computes the p-value for the two-sample test as \(P(D_{n,m} &gt; d)\) if strict;
343      * otherwise \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample
344      * Kolmogorov-Smirnov statistic, either the two-sided \(D_{n,m}\) or one-sided
345      * \(D_{n,m}^+\) or \(D_{n,m}^-\).
346      *
347      * @param v Value.
348      * @return an instance
349      */
350     public KolmogorovSmirnovTest with(Inequality v) {
351         return new KolmogorovSmirnovTest(alternative, pValueMethod,
352             Objects.requireNonNull(v) == Inequality.STRICT, rng, iterations);
353     }
354 
355     /**
356      * Return an instance with the configured source of randomness.
357      *
358      * <p>Applies to the two-sample test when the p-value method is set to
359      * {@link PValueMethod#ESTIMATE ESTIMATE}. The randomness
360      * is used for sampling of the combined distribution.
361      *
362      * <p>There is no default source of randomness. If the randomness is not
363      * set then estimation is not possible and an {@link IllegalStateException} will be
364      * raised in the two-sample test.
365      *
366      * @param v Value.
367      * @return an instance
368      * @see #with(PValueMethod)
369      */
370     public KolmogorovSmirnovTest with(UniformRandomProvider v) {
371         return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality,
372             Objects.requireNonNull(v), iterations);
373     }
374 
375     /**
376      * Return an instance with the configured number of iterations.
377      *
378      * <p>Applies to the two-sample test when the p-value method is set to
379      * {@link PValueMethod#ESTIMATE ESTIMATE}. This is the number of sampling iterations
380      * used to estimate the p-value. The p-value is a fraction using the {@code iterations}
381      * as the denominator. The number of significant digits in the p-value is
382      * upper bounded by log<sub>10</sub>(iterations); small p-values have fewer significant
383      * digits. A large number of iterations is recommended when using a small critical
384      * value to reject the null hypothesis.
385      *
386      * @param v Value.
387      * @return an instance
388      * @throws IllegalArgumentException if the number of iterations is not strictly positive
389      */
390     public KolmogorovSmirnovTest withIterations(int v) {
391         return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality, rng,
392             Arguments.checkStrictlyPositive(v));
393     }
394 
395     /**
396      * Computes the one-sample Kolmogorov-Smirnov test statistic.
397      *
398      * <ul>
399      * <li>two-sided: \(D_n=\sup_x |F_n(x)-F(x)|\)
400      * <li>greater: \(D_n^+=\sup_x (F_n(x)-F(x))\)
401      * <li>less: \(D_n^-=\sup_x (F(x)-F_n(x))\)
402      * </ul>
403      *
404      * <p>where \(F\) is the distribution cumulative density function ({@code cdf}),
405      * \(n\) is the length of {@code x} and \(F_n\) is the empirical distribution that puts
406      * mass \(1/n\) at each of the values in {@code x}.
407      *
408      * <p>The cumulative distribution function should map a real value {@code x} to a probability
409      * in [0, 1]. To use a reference distribution the CDF can be passed using a method reference:
410      * <pre>
411      * UniformContinuousDistribution dist = UniformContinuousDistribution.of(0, 1);
412      * UniformRandomProvider rng = RandomSource.KISS.create(123);
413      * double[] x = dist.sampler(rng).samples(100);
414      * double d = KolmogorovSmirnovTest.withDefaults().statistic(x, dist::cumulativeProbability);
415      * </pre>
416      *
417      * @param cdf Reference cumulative distribution function.
418      * @param x Sample being evaluated.
419      * @return Kolmogorov-Smirnov statistic
420      * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
421      * @see #test(double[], DoubleUnaryOperator)
422      */
423     public double statistic(double[] x, DoubleUnaryOperator cdf) {
424         return computeStatistic(x, cdf, IGNORED_SIGN);
425     }
426 
427     /**
428      * Computes the two-sample Kolmogorov-Smirnov test statistic.
429      *
430      * <ul>
431      * <li>two-sided: \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
432      * <li>greater: \(D_{n,m}^+=\sup_x (F_n(x)-F_m(x))\)
433      * <li>less: \(D_{n,m}^-=\sup_x (F_m(x)-F_n(x))\)
434      * </ul>
435      *
436      * <p>where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
437      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
438      * is the empirical distribution that puts mass \(1/m\) at each of the values in {@code y}.
439      *
440      * @param x First sample.
441      * @param y Second sample.
442      * @return Kolmogorov-Smirnov statistic
443      * @throws IllegalArgumentException if either {@code x} or {@code y} does not
444      * have length at least 2; or contain NaN values.
445      * @see #test(double[], double[])
446      */
447     public double statistic(double[] x, double[] y) {
448         final int n = checkArrayLength(x);
449         final int m = checkArrayLength(y);
450         // Clone to avoid destructive modification of input
451         final long dnm = computeIntegralKolmogorovSmirnovStatistic(x.clone(), y.clone(),
452                 IGNORED_SIGN, IGNORED_D);
453         // Re-use the method to compute D in [0, 1] for consistency
454         return computeD(dnm, n, m, ArithmeticUtils.gcd(n, m));
455     }
456 
457     /**
458      * Performs a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis
459      * that {@code x} conforms to the distribution cumulative density function ({@code cdf}).
460      *
461      * <p>The test is defined by the {@link AlternativeHypothesis}:
462      * <ul>
463      * <li>Two-sided evaluates the null hypothesis that the two distributions are
464      * identical, \(F_n(i) = F(i)\) for all \( i \); the alternative is that the are not
465      * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.
466      * <li>Greater evaluates the null hypothesis that the \(F_n(i) &lt;= F(i)\) for all \( i \);
467      * the alternative is \(F_n(i) &gt; F(i)\) for at least one \( i \). The statistic is \( D_n^+ \).
468      * <li>Less evaluates the null hypothesis that the \(F_n(i) &gt;= F(i)\) for all \( i \);
469      * the alternative is \(F_n(i) &lt; F(i)\) for at least one \( i \). The statistic is \( D_n^- \).
470      * </ul>
471      *
472      * <p>The p-value method defaults to exact. The one-sided p-value uses Smirnov's stable formula:
473      *
474      * <p>\[ P(D_n^+ \ge x) = x \sum_{j=0}^{\lfloor n(1-x) \rfloor} \binom{n}{j} \left(\frac{j}{n} + x\right)^{j-1} \left(1-x-\frac{j}{n} \right)^{n-j} \]
475      *
476      * <p>The two-sided p-value is computed using methods described in
477      * Simard &amp; L’Ecuyer (2011). The two-sided test supports an asymptotic p-value
478      * using Kolmogorov's formula:
479      *
480      * <p>\[ \lim_{n\to\infty} P(\sqrt{n}D_n &gt; z) = 2 \sum_{i=1}^\infty (-1)^{i-1} e^{-2 i^2 z^2} \]
481      *
482      * @param x Sample being being evaluated.
483      * @param cdf Reference cumulative distribution function.
484      * @return test result
485      * @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
486      * @see #statistic(double[], DoubleUnaryOperator)
487      */
488     public OneResult test(double[] x, DoubleUnaryOperator cdf) {
489         final int[] sign = {0};
490         final double d = computeStatistic(x, cdf, sign);
491         final double p;
492         if (alternative == AlternativeHypothesis.TWO_SIDED) {
493             PValueMethod method = pValueMethod;
494             if (method == PValueMethod.AUTO) {
495                 // No switch to the asymptotic for large n
496                 method = PValueMethod.EXACT;
497             }
498             if (method == PValueMethod.ASYMPTOTIC) {
499                 // Kolmogorov's asymptotic formula using z = sqrt(n) * d
500                 p = KolmogorovSmirnovDistribution.ksSum(Math.sqrt(x.length) * d);
501             } else {
502                 // exact
503                 p = KolmogorovSmirnovDistribution.Two.sf(d, x.length);
504             }
505         } else {
506             // one-sided: always use exact
507             p = KolmogorovSmirnovDistribution.One.sf(d, x.length);
508         }
509         return new OneResult(d, sign[0], p);
510     }
511 
512     /**
513      * Performs a two-sample Kolmogorov-Smirnov test on samples {@code x} and {@code y}.
514      * Test the empirical distributions \(F_n\) and \(F_m\) where \(n\) is the length
515      * of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the empirical distribution
516      * that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) is the empirical
517      * distribution that puts mass \(1/m\) of the {@code y} values.
518      *
519      * <p>The test is defined by the {@link AlternativeHypothesis}:
520      * <ul>
521      * <li>Two-sided evaluates the null hypothesis that the two distributions are
522      * identical, \(F_n(i) = F_m(i)\) for all \( i \); the alternative is that they are not
523      * identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.
524      * <li>Greater evaluates the null hypothesis that the \(F_n(i) &lt;= F_m(i)\) for all \( i \);
525      * the alternative is \(F_n(i) &gt; F_m(i)\) for at least one \( i \). The statistic is \( D_n^+ \).
526      * <li>Less evaluates the null hypothesis that the \(F_n(i) &gt;= F_m(i)\) for all \( i \);
527      * the alternative is \(F_n(i) &lt; F_m(i)\) for at least one \( i \). The statistic is \( D_n^- \).
528      * </ul>
529      *
530      * <p>If the {@linkplain PValueMethod p-value method} is auto, then an exact p computation
531      * is attempted if both sample sizes are less than 10000 using the methods presented in
532      * Viehmann (2021) and Hodges (1958); otherwise an asymptotic p-value is returned.
533      * The two-sided p-value is \(\overline{F}(d, \sqrt{mn / (m + n)})\) where \(\overline{F}\)
534      * is the complementary cumulative density function of the two-sided one-sample
535      * Kolmogorov-Smirnov distribution. The one-sided p-value uses an approximation from
536      * Hodges (1958) Eq 5.3.
537      *
538      * <p>\(D_{n,m}\) has a discrete distribution. This makes the p-value associated with the
539      * null hypothesis \(H_0 : D_{n,m} \gt d \) differ from \(H_0 : D_{n,m} \ge d \)
540      * by the mass of the observed value \(d\). These can be distinguished using an
541      * {@link Inequality} parameter. This is ignored for large samples.
542      *
543      * <p>If the data contains ties there is no defined ordering in the tied region to use
544      * for the difference between the two empirical distributions. Each ordering of the
545      * tied region <em>may</em> create a different D statistic. All possible orderings
546      * generate a distribution for the D value. In this case the tied region is traversed
547      * entirely and the effect on the D value evaluated at the end of the tied region.
548      * This is the path with the least change on the D statistic. The path with the
549      * greatest change on the D statistic is also computed as the upper bound on D.
550      * If these two values are different then the tied region is known to generate a
551      * distribution for the D statistic and the p-value is an over estimate for the cases
552      * with a larger D statistic. The presence of any significant tied regions is returned
553      * in the result.
554      *
555      * <p>If the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE} then the p-value is
556      * estimated by repeat sampling of the joint distribution of {@code x} and {@code y}.
557      * The p-value is the frequency that a sample creates a D statistic
558      * greater than or equal to (or greater than for strict inequality) the observed value.
559      * In this case a source of randomness must be configured or an {@link IllegalStateException}
560      * will be raised. The p-value for the upper bound on D will not be estimated and is set to
561      * {@link Double#NaN NaN}. This estimation procedure is not affected by ties in the data
562      * and is increasingly robust for larger datasets. The method is modeled after
563      * <a href="https://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
564      * in the R {@code Matching} package (Sekhon (2011)).
565      *
566      * @param x First sample.
567      * @param y Second sample.
568      * @return test result
569      * @throws IllegalArgumentException if either {@code x} or {@code y} does not
570      * have length at least 2; or contain NaN values.
571      * @throws IllegalStateException if the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE}
572      * and there is no source of randomness.
573      * @see #statistic(double[], double[])
574      */
575     public TwoResult test(double[] x, double[] y) {
576         final int n = checkArrayLength(x);
577         final int m = checkArrayLength(y);
578         PValueMethod method = pValueMethod;
579         final int[] sign = {0};
580         final long[] tiesD = {0, 0};
581 
582         final double[] sx = x.clone();
583         final double[] sy = y.clone();
584         final long dnm = computeIntegralKolmogorovSmirnovStatistic(sx, sy, sign, tiesD);
585 
586         // Compute p-value. Note that the p-value is not invalidated by ties; it is the
587         // D statistic that could be invalidated by resolution of the ties. So compute
588         // the exact p even if ties are present.
589         if (method == PValueMethod.AUTO) {
590             // Use exact for small samples
591             method = Math.max(n, m) < LARGE_SAMPLE ?
592                 PValueMethod.EXACT :
593                 PValueMethod.ASYMPTOTIC;
594         }
595         final int gcd = ArithmeticUtils.gcd(n, m);
596         final double d = computeD(dnm, n, m, gcd);
597         final boolean significantTies = tiesD[1] > dnm;
598         final double d2 = significantTies ? computeD(tiesD[1], n, m, gcd) : d;
599 
600         final double p;
601         double p2;
602 
603         // Allow bootstrap estimation of the p-value
604         if (method == PValueMethod.ESTIMATE) {
605             p = estimateP(sx, sy, dnm);
606             p2 = Double.NaN;
607         } else {
608             final boolean exact = method == PValueMethod.EXACT;
609             p = p2 = twoSampleP(dnm, n, m, gcd, d, exact);
610             if (significantTies) {
611                 // Compute the upper bound on D.
612                 // The p-value is also computed. The alternative is to save the options
613                 // in the result with (upper dnm, n, m) and compute it on-demand.
614                 // Note detection of whether the exact P computation is possible is based on
615                 // n and m, thus this will use the same computation.
616                 p2 = twoSampleP(tiesD[1], n, m, gcd, d2, exact);
617             }
618         }
619         return new TwoResult(d, sign[0], p, significantTies, d2, p2);
620     }
621 
622     /**
623      * Estimates the <i>p-value</i> of a two-sample Kolmogorov-Smirnov test evaluating the
624      * null hypothesis that {@code x} and {@code y} are samples drawn from the same
625      * probability distribution.
626      *
627      * <p>This method will destructively modify the input arrays (via a sort).
628      *
629      * <p>This method estimates the p-value by repeatedly sampling sets of size
630      * {@code x.length} and {@code y.length} from the empirical distribution
631      * of the combined sample. The memory requirement is an array copy of each of
632      * the input arguments.
633      *
634      * <p>When using strict inequality, this is equivalent to the algorithm implemented in
635      * the R function {@code ks.boot} as described in Sekhon (2011) Journal of Statistical
636      * Software, 42(7), 1–52 [3].
637      *
638      * @param x First sample.
639      * @param y Second sample.
640      * @param dnm Integral D statistic.
641      * @return p-value
642      * @throws IllegalStateException if the source of randomness is null.
643      */
644     private double estimateP(double[] x, double[] y, long dnm) {
645         if (rng == null) {
646             throw new IllegalStateException("No source of randomness");
647         }
648 
649         // Test if the random statistic is greater (strict), or greater or equal to d
650         final long d = strictInequality ? dnm : dnm - 1;
651 
652         final long plus;
653         final long minus;
654         if (alternative == AlternativeHypothesis.GREATER_THAN) {
655             plus = d;
656             minus = Long.MIN_VALUE;
657         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
658             plus = Long.MAX_VALUE;
659             minus = -d;
660         } else {
661             // two-sided
662             plus = d;
663             minus = -d;
664         }
665 
666         // Test dnm=0. This occurs for example when x == y.
667         if (0 < minus || 0 > plus) {
668             // Edge case where all possible d will be outside the inclusive bounds
669             return 1;
670         }
671 
672         // Sample randomly with replacement from the combined distribution.
673         final DoubleSupplier gen = createSampler(x, y, rng);
674         int count = 0;
675         for (int i = iterations; i > 0; i--) {
676             for (int j = 0; j < x.length; j++) {
677                 x[j] = gen.getAsDouble();
678             }
679             for (int j = 0; j < y.length; j++) {
680                 y[j] = gen.getAsDouble();
681             }
682             if (testIntegralKolmogorovSmirnovStatistic(x, y, plus, minus)) {
683                 count++;
684             }
685         }
686         return count / (double) iterations;
687     }
688 
689     /**
690      * Computes the magnitude of the one-sample Kolmogorov-Smirnov test statistic.
691      * The sign of the statistic is optionally returned in {@code sign}. For the two-sided case
692      * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
693      * had the larger magnitude.
694      *
695      * @param x Sample being evaluated.
696      * @param cdf Reference cumulative distribution function.
697      * @param sign Sign of the statistic (non-zero length).
698      * @return Kolmogorov-Smirnov statistic
699      * @throws IllegalArgumentException if {@code data} does not have length at least 2;
700      * or contains NaN values.
701      */
702     private double computeStatistic(double[] x, DoubleUnaryOperator cdf, int[] sign) {
703         final int n = checkArrayLength(x);
704         final double nd = n;
705         final double[] sx = sort(x.clone(), "Sample");
706         // Note: ties in the data do not matter as we compare the empirical CDF
707         // immediately before the value (i/n) and at the value (i+1)/n. For ties
708         // of length m this would be (i-m+1)/n and (i+1)/n and the result is the same.
709         double d = 0;
710         if (alternative == AlternativeHypothesis.GREATER_THAN) {
711             // Compute D+
712             for (int i = 0; i < n; i++) {
713                 final double yi = cdf.applyAsDouble(sx[i]);
714                 final double dp = (i + 1) / nd - yi;
715                 d = dp > d ? dp : d;
716             }
717             sign[0] = 1;
718         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
719             // Compute D-
720             for (int i = 0; i < n; i++) {
721                 final double yi = cdf.applyAsDouble(sx[i]);
722                 final double dn = yi - i / nd;
723                 d = dn > d ? dn : d;
724             }
725             sign[0] = -1;
726         } else {
727             // Two sided.
728             // Compute both (as unsigned) and return the sign indicating the largest result.
729             double plus = 0;
730             double minus = 0;
731             for (int i = 0; i < n; i++) {
732                 final double yi = cdf.applyAsDouble(sx[i]);
733                 final double dn = yi - i / nd;
734                 final double dp = (i + 1) / nd - yi;
735                 minus = dn > minus ? dn : minus;
736                 plus = dp > plus ? dp : plus;
737             }
738             sign[0] = Double.compare(plus, minus);
739             d = Math.max(plus, minus);
740         }
741         return d;
742     }
743 
744     /**
745      * Computes the two-sample Kolmogorov-Smirnov test statistic. The statistic is integral
746      * and has a value in {@code [0, n*m]}. Not all values are possible; the smallest
747      * increment is the greatest common divisor of {@code n} and {@code m}.
748      *
749      * <p>This method will destructively modify the input arrays (via a sort).
750      *
751      * <p>The sign of the statistic is returned in {@code sign}. For the two-sided case
752      * the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
753      * had the larger magnitude. If the two-sided statistic is zero the two arrays are
754      * identical, or are 'identical' data of a single value (sample sizes may be different),
755      * or have a sequence of ties of 'identical' data with a net zero effect on the D statistic
756      * e.g.
757      * <pre>
758      *  [1,2,3]           vs [1,2,3]
759      *  [0,0,0,0]         vs [0,0,0]
760      *  [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
761      * </pre>
762      *
763      * <p>This method detects ties in the input data. Not all ties will invalidate the returned
764      * statistic. Ties between the data can be interpreted as if the values were different
765      * but within machine epsilon. In this case the path through the tie region is not known.
766      * All paths through the tie region end at the same point. This method will compute the
767      * most extreme path through each tie region and the D statistic for these paths. If the
768      * ties D statistic is a larger magnitude than the returned D statistic then at least
769      * one tie region lies at a point on the full path that could result in a different
770      * statistic in the absence of ties. This signals the P-value computed using the returned
771      * D statistic is one of many possible p-values given the data and how ties are resolved.
772      * Note: The tiesD value may be smaller than the returned D statistic as it is not set
773      * to the maximum of D or tiesD. The only result of interest is when {@code tiesD > D}
774      * due to a tie region that can change the output D. On output {@code tiesD[0] != 0}
775      * if there were ties between samples and {@code tiesD[1] = D} of the most extreme path
776      * through the ties.
777      *
778      * @param x First sample (destructively modified by sort).
779      * @param y Second sample  (destructively modified by sort).
780      * @param sign Sign of the statistic (non-zero length).
781      * @param tiesD Integral statistic for the most extreme path through any ties (length at least 2).
782      * @return integral Kolmogorov-Smirnov statistic
783      * @throws IllegalArgumentException if either {@code x} or {@code y} contain NaN values.
784      */
785     private long computeIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, int[] sign, long[] tiesD) {
786         // Sort the sample arrays
787         sort(x, SAMPLE_1_NAME);
788         sort(y, SAMPLE_2_NAME);
789         final int n = x.length;
790         final int m = y.length;
791 
792         // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
793         // Scale by n*m to use increments of m and n for x and y.
794         // Find the max difference between cdf_x and cdf_y.
795         int i = 0;
796         int j = 0;
797         long d = 0;
798         long plus = 0;
799         long minus = 0;
800         // Ties: store the D+,D- for most extreme path though tie region(s)
801         long tplus = 0;
802         long tminus = 0;
803         do {
804             // No NaNs so compare using < and >
805             if (x[i] < y[j]) {
806                 final double z = x[i];
807                 do {
808                     i++;
809                     d += m;
810                 } while (i < n && x[i] == z);
811                 plus = d > plus ? d : plus;
812             } else if (x[i] > y[j]) {
813                 final double z = y[j];
814                 do {
815                     j++;
816                     d -= n;
817                 } while (j < m && y[j] == z);
818                 minus = d < minus ? d : minus;
819             } else {
820                 // Traverse to the end of the tied section for d.
821                 // Also compute the most extreme path through the tied region.
822                 final double z = x[i];
823                 final long dd = d;
824                 int k = i;
825                 do {
826                     i++;
827                 } while (i < n && x[i] == z);
828                 k = i - k;
829                 d += k * (long) m;
830                 // Extreme D+ path
831                 tplus = d > tplus ? d : tplus;
832                 k = j;
833                 do {
834                     j++;
835                 } while (j < m && y[j] == z);
836                 k = j - k;
837                 d -= k * (long) n;
838                 // Extreme D- path must start at the original d
839                 tminus = Math.min(tminus, dd - k * (long) n);
840                 // End of tied section
841                 if (d > plus) {
842                     plus = d;
843                 } else if (d < minus) {
844                     minus = d;
845                 }
846             }
847         } while (i < n && j < m);
848         // The presence of any ties is flagged by a non-zero value for D+ or D-.
849         // Note we cannot use the selected tiesD value as in the one-sided case it may be zero
850         // and the non-selected D value will be non-zero.
851         tiesD[0] = tplus | tminus;
852         // For simplicity the correct tiesD is not returned (correct value is commented).
853         // The only case that matters is tiesD > D which is evaluated by the caller.
854         // Note however that the distance of tiesD < D is a measure of how little the
855         // tied region matters.
856         if (alternative == AlternativeHypothesis.GREATER_THAN) {
857             sign[0] = 1;
858             // correct = max(tplus, plus)
859             tiesD[1] = tplus;
860             return plus;
861         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
862             sign[0] = -1;
863             // correct = -min(tminus, minus)
864             tiesD[1] = -tminus;
865             return -minus;
866         } else {
867             // Two sided.
868             sign[0] = Double.compare(plus, -minus);
869             d = Math.max(plus, -minus);
870             // correct = max(d, max(tplus, -tminus))
871             tiesD[1] = Math.max(tplus, -tminus);
872             return d;
873         }
874     }
875 
876     /**
877      * Test if the two-sample integral Kolmogorov-Smirnov statistic is strictly greater
878      * than the specified values for D+ and D-. Note that D- should have a negative sign
879      * to impose an inclusive lower bound.
880      *
881      * <p>This method will destructively modify the input arrays (via a sort).
882      *
883      * <p>For a two-sided alternative hypothesis {@code plus} and {@code minus} should have the
884      * same magnitude with opposite signs.
885      *
886      * <p>For a one-sided alternative hypothesis the value of {@code plus} or {@code minus}
887      * should have the expected value of the statistic, and the opposite D should have the maximum
888      * or minimum long value. The {@code minus} should be negatively signed:
889      *
890      * <ul>
891      * <li>greater: {@code plus} = D, {@code minus} = {@link Long#MIN_VALUE}
892      * <li>greater: {@code minus} = -D, {@code plus} = {@link Long#MAX_VALUE}
893      * </ul>
894      *
895      * <p>Note: This method has not been specialized for the one-sided case. Specialization
896      * would eliminate a conditional branch for {@code d > Long.MAX_VALUE} or
897      * {@code d < Long.MIN_VALUE}. Since these branches are never possible in the one-sided case
898      * this should be efficiently chosen by branch prediction in a processor pipeline.
899      *
900      * @param x First sample (destructively modified by sort; must not contain NaN).
901      * @param y Second sample (destructively modified by sort; must not contain NaN).
902      * @param plus Limit on D+ (inclusive upper bound).
903      * @param minus Limit on D- (inclusive lower bound).
904      * @return true if the D value exceeds the provided limits
905      */
906     private static boolean testIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, long plus, long minus) {
907         // Sort the sample arrays
908         Arrays.sort(x);
909         Arrays.sort(y);
910         final int n = x.length;
911         final int m = y.length;
912 
913         // CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
914         // Scale by n*m to use increments of m and n for x and y.
915         // Find the any difference that exceeds the specified bounds.
916         int i = 0;
917         int j = 0;
918         long d = 0;
919         do {
920             // No NaNs so compare using < and >
921             if (x[i] < y[j]) {
922                 final double z = x[i];
923                 do {
924                     i++;
925                     d += m;
926                 } while (i < n && x[i] == z);
927                 if (d > plus) {
928                     return true;
929                 }
930             } else if (x[i] > y[j]) {
931                 final double z = y[j];
932                 do {
933                     j++;
934                     d -= n;
935                 } while (j < m && y[j] == z);
936                 if (d < minus) {
937                     return true;
938                 }
939             } else {
940                 // Traverse to the end of the tied section for d.
941                 final double z = x[i];
942                 do {
943                     i++;
944                     d += m;
945                 } while (i < n && x[i] == z);
946                 do {
947                     j++;
948                     d -= n;
949                 } while (j < m && y[j] == z);
950                 // End of tied section
951                 if (d > plus || d < minus) {
952                     return true;
953                 }
954             }
955         } while (i < n && j < m);
956         // Note: Here d requires returning to zero. This is applicable to the one-sided
957         // cases since d may have always been above zero (favours D+) or always below zero
958         // (favours D-). This is ignored as the method is not called when dnm=0 is
959         // outside the inclusive bounds.
960         return false;
961     }
962 
963     /**
964      * Creates a sampler to sample randomly from the combined distribution of the two samples.
965      *
966      * @param x First sample.
967      * @param y Second sample.
968      * @param rng Source of randomness.
969      * @return the sampler
970      */
971     private static DoubleSupplier createSampler(double[] x, double[] y,
972                                                 UniformRandomProvider rng) {
973         return createSampler(x, y, rng, MAX_ARRAY_SIZE);
974     }
975 
976     /**
977      * Creates a sampler to sample randomly from the combined distribution of the two
978      * samples. This will copy the input data for the sampler.
979      *
980      * @param x First sample.
981      * @param y Second sample.
982      * @param rng Source of randomness.
983      * @param maxArraySize Maximum size of a single array.
984      * @return the sampler
985      */
986     static DoubleSupplier createSampler(double[] x, double[] y,
987                                         UniformRandomProvider rng,
988                                         int maxArraySize) {
989         final int n = x.length;
990         final int m = y.length;
991         final int len = n + m;
992         // Overflow safe: len > maxArraySize
993         if (len - maxArraySize > 0) {
994             // Support sampling with maximum length arrays
995             // (where a concatenated array is not possible)
996             // by choosing one or the other.
997             // - generate i in [-n, m)
998             // - return i < 0 ? x[n + i] : y[i]
999             // The sign condition is a 50-50 branch.
1000             // Perform branchless by extracting the sign bit to pick the array.
1001             // Copy the source data.
1002             final double[] xx = x.clone();
1003             final double[] yy = y.clone();
1004             final IntToDoubleFunction nextX = i -> xx[n + i];
1005             final IntToDoubleFunction nextY = i -> yy[i];
1006             // Arrange function which accepts the negative index at position [1]
1007             final IntToDoubleFunction[] next = {nextY, nextX};
1008             return () -> {
1009                 final int i = rng.nextInt(-n, m);
1010                 return next[i >>> 31].applyAsDouble(i);
1011             };
1012         }
1013         // Concatenate arrays
1014         final double[] z = new double[len];
1015         System.arraycopy(x, 0, z, 0, n);
1016         System.arraycopy(y, 0, z, n, m);
1017         return () -> z[rng.nextInt(len)];
1018     }
1019 
1020     /**
1021      * Compute the D statistic from the integral D value.
1022      *
1023      * @param dnm Integral D-statistic value (in [0, n*m]).
1024      * @param n First sample size.
1025      * @param m Second sample size.
1026      * @param gcd Greatest common divisor of n and m.
1027      * @return D-statistic value (in [0, 1]).
1028      */
1029     private static double computeD(long dnm, int n, int m, int gcd) {
1030         // Note: Integer division using the gcd is intentional
1031         final long a = dnm / gcd;
1032         final int b = m / gcd;
1033         return a / ((double) n * b);
1034     }
1035 
1036     /**
1037      * Computes \(P(D_{n,m} &gt; d)\) for the 2-sample Kolmogorov-Smirnov statistic.
1038      *
1039      * @param dnm Integral D-statistic value (in [0, n*m]).
1040      * @param n First sample size.
1041      * @param m Second sample size.
1042      * @param gcd Greatest common divisor of n and m.
1043      * @param d D-statistic value (in [0, 1]).
1044      * @param exact whether to compute the exact probability; otherwise approximate.
1045      * @return probability
1046      * @see #twoSampleExactP(long, int, int, int, boolean, boolean)
1047      * @see #twoSampleApproximateP(double, int, int, boolean)
1048      */
1049     private double twoSampleP(long dnm, int n, int m, int gcd, double d, boolean exact) {
1050         // Exact computation returns -1 if it cannot compute.
1051         double p = -1;
1052         if (exact) {
1053             p = twoSampleExactP(dnm, n, m, gcd, strictInequality, alternative == AlternativeHypothesis.TWO_SIDED);
1054         }
1055         if (p < 0) {
1056             p = twoSampleApproximateP(d, n, m, alternative == AlternativeHypothesis.TWO_SIDED);
1057         }
1058         return p;
1059     }
1060 
1061     /**
1062      * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
1063      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic, either the two-sided
1064      * \(D_{n,m}\) or one-sided \(D_{n,m}^+\}. See
1065      * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
1066      *
1067      * <p>The returned probability is exact. If the value cannot be computed this returns -1.
1068      *
1069      * <p>Note: This requires the greatest common divisor of n and m. The integral D statistic
1070      * in the range [0, n*m] is separated by increments of the gcd. The method will only
1071      * compute p-values for valid values of D by calculating for D/gcd.
1072      * Strict inquality is performed using the next valid value for D.
1073      *
1074      * @param dnm Integral D-statistic value (in [0, n*m]).
1075      * @param n First sample size.
1076      * @param m Second sample size.
1077      * @param gcd Greatest common divisor of n and m.
1078      * @param strict whether or not the probability to compute is expressed as a strict inequality.
1079      * @param twoSided whether D refers to D or D+.
1080      * @return probability that a randomly selected m-n partition of m + n generates D
1081      *         greater than (resp. greater than or equal to) {@code d} (or -1)
1082      */
1083     static double twoSampleExactP(long dnm, int n, int m, int gcd, boolean strict, boolean twoSided) {
1084         // Create the statistic in [0, lcm]
1085         // For strict inequality D > d the result is the same if we compute for D >= (d+1)
1086         final long d = dnm / gcd + (strict ? 1 : 0);
1087 
1088         // P-value methods compute for d <= lcm (least common multiple)
1089         final long lcm = (long) n * (m / gcd);
1090         if (d > lcm) {
1091             return 0;
1092         }
1093 
1094         // Note: Some methods require m >= n, others n >= m
1095         final int a = Math.min(n, m);
1096         final int b = Math.max(n, m);
1097 
1098         if (twoSided) {
1099             // Any two-sided statistic dnm cannot be less than min(n, m) in the absence of ties.
1100             if (d * gcd <= a) {
1101                 return 1;
1102             }
1103             // Here d in [2, lcm]
1104             if (n == m) {
1105                 return twoSampleTwoSidedPOutsideSquare(d, n);
1106             }
1107             return twoSampleTwoSidedPStabilizedInner(d, b, a, gcd);
1108         }
1109         // Any one-sided statistic cannot be less than 0
1110         if (d <= 0) {
1111             return 1;
1112         }
1113         // Here d in [1, lcm]
1114         if (n == m) {
1115             return twoSampleOneSidedPOutsideSquare(d, n);
1116         }
1117         return twoSampleOneSidedPOutside(d, a, b, gcd);
1118     }
1119 
1120     /**
1121      * Computes \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
1122      *
1123      * <p>The returned probability is exact, implemented using the stabilized inner method
1124      * presented in Viehmann (2021).
1125      *
1126      * <p>This is optimized for {@code m <= n}. If {@code m > n} and index-out-of-bounds
1127      * exception can occur.
1128      *
1129      * @param d Integral D-statistic value (in [2, lcm])
1130      * @param n Larger sample size.
1131      * @param m Smaller sample size.
1132      * @param gcd Greatest common divisor of n and m.
1133      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1134      *         greater than or equal to {@code d}
1135      */
1136     private static double twoSampleTwoSidedPStabilizedInner(long d, int n, int m, int gcd) {
1137         // Check the computation is possible.
1138         // Note that the possible paths is binom(m+n, n).
1139         // However the computation is stable above this limit.
1140         // Possible d values (requiring a unique p-value) = max(dnm) / gcd = lcm(n, m).
1141         // Possible terms to compute <= n * size(cij)
1142         // Simple limit based on the number of possible different p-values
1143         if ((long) n * (m / gcd) > MAX_LCM_TWO_SAMPLE_EXACT_P) {
1144             return -1;
1145         }
1146 
1147         // This could be updated to use d in [1, lcm].
1148         // Currently it uses d in [gcd, n*m].
1149         // Largest intermediate value is (dnm + im + n) which is within 2^63
1150         // if n and m are 2^31-1, i = n, dnm = n*m: (2^31-1)^2 + (2^31-1)^2 + 2^31-1 < 2^63
1151         final long dnm = d * gcd;
1152 
1153         // Viehmann (2021): Updated for i in [0, n], j in [0, m]
1154         // C_i,j = 1                                      if |i/n - j/m| >= d
1155         //       = 0                                      if |i/n - j/m| < d and (i=0 or j=0)
1156         //       = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)  otherwise
1157         // P2 = C_n,m
1158         // Note: The python listing in Viehmann used d in [0, 1]. This uses dnm in [0, nm]
1159         // so updates the scaling to compute the ranges. Also note that the listing uses
1160         // dist > d or dist < -d where this uses |dist| >= d to compute P(D >= d) (non-strict inequality).
1161         // The provided listing is explicit in the values for each j in the range.
1162         // It can be optimized given the known start and end j for each iteration as only
1163         // j where |i/n - j/m| < d must be processed:
1164         // startJ where: im - jn < dnm : jn > im - dnm
1165         // j = floor((im - dnm) / n) + 1      in [0, m]
1166         // endJ where: jn - im >= dnm
1167         // j = ceil((dnm + im) / n)           in [0, m+1]
1168 
1169         // First iteration with i = 0
1170         // j = ceil(dnm / n)
1171         int endJ = Math.min(m + 1, (int) ((dnm + n - 1) / n));
1172 
1173         // Only require 1 array to store C_i-1,j as the startJ only ever increases
1174         // and we update lower indices using higher ones.
1175         // The maximum value *written* is j=m or less using j/m <= 2*d : j = ceil(2*d*m)
1176         // Viehmann uses: size = int(2*m*d + 2) == ceil(2*d*m) + 1
1177         // The maximum value *read* is j/m <= 2*d. This may be above m. This occurs when
1178         // j - lastStartJ > m and C_i-1,j = 1. This can be avoided if (startJ - lastStartJ) <= 1
1179         // which occurs if m <= n, i.e. the window only slides 0 or 1 in j for each increment i
1180         // and we can maintain Cij as 1 larger than ceil(2*d*m) + 1.
1181         final double[] cij = new double[Math.min(m + 1, 2 * endJ + 2)];
1182 
1183         // Each iteration fills C_i,j with values and the remaining values are
1184         // kept as 1 for |i/n - j/m| >= d
1185         //assert (endJ - 1) * (long) n < dnm : "jn >= dnm for j < endJ";
1186         for (int j = endJ; j < cij.length; j++) {
1187             //assert j * (long) n >= dnm : "jn < dnm for j >= endJ";
1188             cij[j] = 1;
1189         }
1190 
1191         int startJ = 0;
1192         int length = endJ;
1193         double val = -1;
1194         long im = 0;
1195         for (int i = 1; i <= n; i++) {
1196             im += m;
1197             final int lastStartJ = startJ;
1198 
1199             // Compute C_i,j for startJ <= j < endJ
1200             // startJ = floor((im - dnm) / n) + 1      in [0, m]
1201             // endJ   = ceil((dnm + im) / n)           in [0, m+1]
1202             startJ = im < dnm ? 0 : Math.min(m, (int) ((im - dnm) / n) + 1);
1203             endJ = Math.min(m + 1, (int) ((dnm + im + n - 1) / n));
1204 
1205             if (startJ >= endJ) {
1206                 // No possible paths inside the boundary
1207                 return 1;
1208             }
1209 
1210             //assert startJ - lastStartJ <= 1 : "startJ - lastStartJ > 1";
1211 
1212             // Initialize previous value C_i,j-1
1213             val = startJ == 0 ? 0 : 1;
1214 
1215             //assert startJ == 0 || Math.abs(im - (startJ - 1) * (long) n) >= dnm : "|im - jn| < dnm for j < startJ";
1216             //assert endJ > m || Math.abs(im - endJ * (long) n) >= dnm : "|im - jn| < dnm for j >= endJ";
1217             for (int j = startJ; j < endJ; j++) {
1218                 //assert j == 0 || Math.abs(im - j * (long) n) < dnm : "|im - jn| >= dnm for startJ <= j < endJ";
1219                 // C_i,j = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)
1220                 // Note: if (j - lastStartJ) >= cij.length this creates an IOOB exception.
1221                 // In this case cij[j - lastStartJ] == 1. Only happens when m >= n.
1222                 // Fixed using c_i-1,j = (j - lastStartJ >= cij.length ? 1 : cij[j - lastStartJ]
1223                 val = (cij[j - lastStartJ] * i + val * j) / ((double) i + j);
1224                 cij[j - startJ] = val;
1225             }
1226 
1227             // Must keep the remaining values in C_i,j as 1 to allow
1228             // cij[j - lastStartJ] * i == i when (j - lastStartJ) > lastLength
1229             final int lastLength = length;
1230             length = endJ - startJ;
1231             for (int j = lastLength - length - 1; j >= 0; j--) {
1232                 cij[length + j] = 1;
1233             }
1234         }
1235         // Return the most recently written value: C_n,m
1236         return val;
1237     }
1238 
1239     /**
1240      * Computes \(P(D_{n,m}^+ \ge d)\), where \(D_{n,m}^+\) is the 2-sample one-sided
1241      * Kolmogorov-Smirnov statistic.
1242      *
1243      * <p>The returned probability is exact, implemented using the outer method
1244      * presented in Hodges (1958).
1245      *
1246      * <p>This method will fail-fast and return -1 if the computation of the
1247      * numbers of paths overflows.
1248      *
1249      * @param d Integral D-statistic value (in [0, lcm])
1250      * @param n First sample size.
1251      * @param m Second sample size.
1252      * @param gcd Greatest common divisor of n and m.
1253      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1254      *         greater than or equal to {@code d}
1255      */
1256     private static double twoSampleOneSidedPOutside(long d, int n, int m, int gcd) {
1257         // Hodges, Fig.2
1258         // Lower boundary: (nx - my)/nm >= d : (nx - my) >= dnm
1259         // B(x, y) is the number of ways from (0, 0) to (x, y) without previously
1260         // reaching the boundary.
1261         // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
1262         // Total paths:
1263         // sum_y { B(x, y) binom(m+n-x-y, n-y) }
1264 
1265         // Normalized by binom(m+n, n). Check this is possible.
1266         final long lm = m;
1267         if (n + lm > Integer.MAX_VALUE) {
1268             return -1;
1269         }
1270         final double binom = binom(m + n, n);
1271         if (binom == Double.POSITIVE_INFINITY) {
1272             return -1;
1273         }
1274 
1275         // This could be updated to use d in [1, lcm].
1276         // Currently it uses d in [gcd, n*m].
1277         final long dnm = d * gcd;
1278 
1279         // Visit all x in [0, m] where (nx - my) >= d for each increasing y in [0, n].
1280         // x = ceil( (d + my) / n ) = (d + my + n - 1) / n
1281         // y = ceil( (nx - d) / m ) = (nx - d + m - 1) / m
1282         // Note: n m integer, d in [0, nm], the intermediate cannot overflow a long.
1283         // x | y=0 = (d + n - 1) / n
1284         final int x0 = (int) ((dnm + n - 1) / n);
1285         if (x0 >= m) {
1286             return 1 / binom;
1287         }
1288         // The y above is the y *on* the boundary. Set the limit as the next y above:
1289         // y | x=m = 1 + floor( (nx - d) / m ) = 1 + (nm - d) / m
1290         final int maxy = (int) ((n * lm - dnm + m) / m);
1291         // Compute x and B(x, y) for visited B(x,y)
1292         final int[] xy = new int[maxy];
1293         final double[] bxy = new double[maxy];
1294         xy[0] = x0;
1295         bxy[0] = 1;
1296         for (int y = 1; y < maxy; y++) {
1297             final int x = (int) ((dnm + lm * y + n - 1) / n);
1298             // B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
1299             // Add the terms to subtract as a negative sum.
1300             final Sum b = Sum.create();
1301             for (int yy = 0; yy < y; yy++) {
1302                 // Here: previousX = x - xy[yy] : previousY = y - yy
1303                 // bxy[yy] is the paths to (previousX, previousY)
1304                 // binom represent the paths from (previousX, previousY) to (x, y)
1305                 b.addProduct(bxy[yy], -binom(x - xy[yy] + y - yy, y - yy));
1306             }
1307             b.add(binom(x + y, y));
1308             xy[y] = x;
1309             bxy[y] = b.getAsDouble();
1310         }
1311         // sum_y { B(x, y) binom(m+n-x-y, n-y) }
1312         final Sum sum = Sum.create();
1313         for (int y = 0; y < maxy; y++) {
1314             sum.addProduct(bxy[y], binom(m + n - xy[y] - y, n - y));
1315         }
1316         // No individual term should have overflowed since binom is finite.
1317         // Any sum above 1 is floating-point error.
1318         return KolmogorovSmirnovDistribution.clipProbability(sum.getAsDouble() / binom);
1319     }
1320 
1321     /**
1322      * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample one-sided
1323      * Kolmogorov-Smirnov statistic.
1324      *
1325      * <p>The returned probability is exact, implemented using the outer method
1326      * presented in Hodges (1958).
1327      *
1328      * @param d Integral D-statistic value (in [1, lcm])
1329      * @param n Sample size.
1330      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1331      *         greater than or equal to {@code d}
1332      */
1333     private static double twoSampleOneSidedPOutsideSquare(long d, int n) {
1334         // Hodges (1958) Eq. 2.3:
1335         // p = binom(2n, n-a) / binom(2n, n)
1336         // a in [1, n] == d * n == dnm / n
1337         final int a = (int) d;
1338 
1339         // Rearrange:
1340         // p = ( 2n! / ((n-a)! (n+a)!) ) / ( 2n! / (n! n!) )
1341         //   = n! n! / ( (n-a)! (n+a)! )
1342         // Perform using pre-computed factorials if possible.
1343         if (n + a <= MAX_FACTORIAL) {
1344             final double x = Factorial.doubleValue(n);
1345             final double y = Factorial.doubleValue(n - a);
1346             final double z = Factorial.doubleValue(n + a);
1347             return (x / y) * (x / z);
1348         }
1349         // p = n! / (n-a)!  *  n! / (n+a)!
1350         //       n * (n-1) * ... * (n-a+1)
1351         //   = -----------------------------
1352         //     (n+a) * (n+a-1) * ... * (n+1)
1353 
1354         double p = 1;
1355         for (int i = 0; i < a && p != 0; i++) {
1356             p *= (n - i) / (1.0 + n + i);
1357         }
1358         return p;
1359     }
1360 
1361     /**
1362      * Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample two-sided
1363      * Kolmogorov-Smirnov statistic.
1364      *
1365      * <p>The returned probability is exact, implemented using the outer method
1366      * presented in Hodges (1958).
1367      *
1368      * @param d Integral D-statistic value (in [1, n])
1369      * @param n Sample size.
1370      * @return probability that a randomly selected m-n partition of n + n generates \(D_{n,n}\)
1371      *         greater than or equal to {@code d}
1372      */
1373     private static double twoSampleTwoSidedPOutsideSquare(long d, int n) {
1374         // Hodges (1958) Eq. 2.4:
1375         // p = 2 [ binom(2n, n-a) - binom(2n, n-2a) + binom(2n, n-3a) - ... ] / binom(2n, n)
1376         // a in [1, n] == d * n == dnm / n
1377 
1378         // As per twoSampleOneSidedPOutsideSquare, divide by binom(2n, n) and each term
1379         // can be expressed as a product:
1380         //         (             n - i                    n - i                   n - i         )
1381         // p = 2 * ( prod_i=0^a --------- - prod_i=0^2a --------- + prod_i=0^3a --------- + ... )
1382         //         (           1 + n + i                1 + n + i               1 + n + i       )
1383         // for ja in [1, ..., n/a]
1384         // Avoid repeat computation of terms by extracting common products:
1385         // p = 2 * ( p0a * (1 - p1a * (1 - p2a * (1 - ... ))) )
1386         // where each term pja is prod_i={ja}^{ja+a} for all j in [1, n / a]
1387 
1388         // The first term is the one-sided p.
1389         final double p0a = twoSampleOneSidedPOutsideSquare(d, n);
1390         if (p0a == 0) {
1391             // Underflow - nothing more to do
1392             return 0;
1393         }
1394         // Compute the inner-terms from small to big.
1395         // j = n / (d/n) ~ n*n / d
1396         // j is a measure of how extreme the d value is (small j is extreme d).
1397         // When j is above 0 a path may traverse from the lower boundary to the upper boundary.
1398         final int a = (int) d;
1399         double p = 0;
1400         for (int j = n / a; j > 0; j--) {
1401             double pja = 1;
1402             final int jaa = j * a + a;
1403             // Since p0a did not underflow we avoid the check for pj != 0
1404             for (int i = j * a; i < jaa; i++) {
1405                 pja *= (n - i) / (1.0 + n + i);
1406             }
1407             p = pja * (1 - p);
1408         }
1409         p = p0a * (1 - p);
1410         return Math.min(1, 2 * p);
1411     }
1412 
1413     /**
1414      * Compute the binomial coefficient binom(n, k).
1415      *
1416      * @param n N.
1417      * @param k K.
1418      * @return binom(n, k)
1419      */
1420     private static double binom(int n, int k) {
1421         return BinomialCoefficientDouble.value(n, k);
1422     }
1423 
1424     /**
1425      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
1426      * is the 2-sample Kolmogorov-Smirnov statistic. See
1427      * {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
1428      *
1429      * <p>Specifically, what is returned is \(1 - CDF(d, \sqrt{mn / (m + n)})\) where CDF
1430      * is the cumulative density function of the two-sided one-sample Kolmogorov-Smirnov
1431      * distribution.
1432      *
1433      * @param d D-statistic value.
1434      * @param n First sample size.
1435      * @param m Second sample size.
1436      * @param twoSided True to compute the two-sided p-value; else one-sided.
1437      * @return approximate probability that a randomly selected m-n partition of m + n generates
1438      *         \(D_{n,m}\) greater than {@code d}
1439      */
1440     static double twoSampleApproximateP(double d, int n, int m, boolean twoSided) {
1441         final double nn = Math.min(n, m);
1442         final double mm = Math.max(n, m);
1443         if (twoSided) {
1444             // Smirnov's asymptotic formula:
1445             // P(sqrt(N) D_n > x)
1446             // N = m*n/(m+n)
1447             return KolmogorovSmirnovDistribution.Two.sf(d, (int) Math.round(mm * nn / (mm + nn)));
1448         }
1449         // one-sided
1450         // Use Hodges Eq 5.3. Requires m >= n
1451         // Correct for m=n, m an integral multiple of n, and 'on the average' for m nearly equal to n
1452         final double z = d * Math.sqrt(nn * mm / (nn + mm));
1453         return Math.exp(-2 * z * z - 2 * z * (mm + 2 * nn) / Math.sqrt(mm * nn * (mm + nn)) / 3);
1454     }
1455 
1456     /**
1457      * Verifies that {@code array} has length at least 2.
1458      *
1459      * @param array Array to test.
1460      * @return the length
1461      * @throws IllegalArgumentException if array is too short
1462      */
1463     private static int checkArrayLength(double[] array) {
1464         final int n = array.length;
1465         if (n <= 1) {
1466             throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
1467         }
1468         return n;
1469     }
1470 
1471     /**
1472      * Sort the input array. Throws an exception if NaN values are
1473      * present. It is assumed the array is non-zero length.
1474      *
1475      * @param x Input array.
1476      * @param name Name of the array.
1477      * @return a reference to the input (sorted) array
1478      * @throws IllegalArgumentException if {@code x} contains NaN values.
1479      */
1480     private static double[] sort(double[] x, String name) {
1481         Arrays.sort(x);
1482         // NaN will be at the end
1483         if (Double.isNaN(x[x.length - 1])) {
1484             throw new InferenceException(name + " contains NaN");
1485         }
1486         return x;
1487     }
1488 }