Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math4.legacy.stat.inference;
019
020import java.math.RoundingMode;
021import java.util.Arrays;
022
023import org.apache.commons.rng.simple.RandomSource;
024import org.apache.commons.rng.UniformRandomProvider;
025import org.apache.commons.statistics.distribution.ContinuousDistribution;
026import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
027import org.apache.commons.numbers.fraction.BigFraction;
028import org.apache.commons.numbers.field.BigFractionField;
029import org.apache.commons.math4.legacy.distribution.EnumeratedRealDistribution;
030import org.apache.commons.math4.legacy.distribution.AbstractRealDistribution;
031import org.apache.commons.math4.legacy.exception.InsufficientDataException;
032import org.apache.commons.math4.legacy.exception.MathArithmeticException;
033import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
034import org.apache.commons.math4.legacy.exception.NullArgumentException;
035import org.apache.commons.math4.legacy.exception.NumberIsTooLargeException;
036import org.apache.commons.math4.legacy.exception.OutOfRangeException;
037import org.apache.commons.math4.legacy.exception.TooManyIterationsException;
038import org.apache.commons.math4.legacy.exception.NotANumberException;
039import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
040import org.apache.commons.math4.legacy.linear.MatrixUtils;
041import org.apache.commons.math4.legacy.linear.RealMatrix;
042import org.apache.commons.math4.core.jdkmath.JdkMath;
043import org.apache.commons.math4.legacy.core.MathArrays;
044import org.apache.commons.math4.legacy.field.linalg.FieldDenseMatrix;
045
046/**
047 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
048 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
049 * <p>
050 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
051 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
052 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
053 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
054 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
055 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
056 * given in [2].
057 * </p>
058 * <p>
059 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
060 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
061 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
062 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
063 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
064 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
065 * follows:
066 * <ul>
067 * <li>When the product of the sample sizes is less than 10000, the method presented in [4]
068 * is used to compute the exact p-value for the 2-sample test.</li>
069 * <li>When the product of the sample sizes is larger, the asymptotic
070 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
071 * the approximation.</li>
072 * </ul><p>
073 * For small samples (former case), if the data contains ties, random jitter is added
074 * to the sample data to break ties before applying the algorithm above. Alternatively,
075 * the {@link #bootstrap(double[],double[],int,boolean,UniformRandomProvider)}
076 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
077 * in the R Matching package [3], can be used if ties are known to be present in the data.
078 * </p>
079 * <p>
080 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
081 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} \ge d \)
082 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
083 * {@code strict} parameter. This parameter is ignored for large samples.
084 * </p>
085 * <p>
086 * The methods used by the 2-sample default implementation are also exposed directly:
087 * <ul>
088 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
089 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean}
090 * arguments in the first two methods allow the probability used to estimate the p-value to be
091 * expressed using strict or non-strict inequality. See
092 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
093 * </ul>
094 * <p>
095 * References:
096 * <ul>
097 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
098 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
099 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
100 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
101 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
102 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
103 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
104 * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
105 * Chapter 5, 3rd Ed. Academic Press.</li>
106 * </ul>
107 * <br>
108 * Note that [1] contains an error in computing h, refer to <a
109 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
110 *
111 * @since 3.3
112 */
113public class KolmogorovSmirnovTest {
114    /** pi^2. */
115    private static final double PI_SQUARED = JdkMath.PI * JdkMath.PI;
116    /**
117     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}.
118     */
119    private static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
120    /** Convergence criterion for {@link #ksSum(double, double, int)}. */
121    private static final double KS_SUM_CAUCHY_CRITERION = 1e-20;
122    /** Convergence criterion for the sums in {@link #pelzGood(double, int)}. */
123    private static final double PG_SUM_RELATIVE_ERROR = 1e-10;
124    /** 1/2. */
125    private static final BigFraction ONE_HALF = BigFraction.of(1, 2);
126
127    /**
128     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
129     * distribution to compute the p-value.
130     */
131    private static final int LARGE_SAMPLE_PRODUCT = 10000;
132
133    /**
134     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
135     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
136     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
137     * {@code exact} is true, the distribution used to compute the p-value is computed using
138     * extended precision. See {@link #cdfExact(double, int)}.
139     *
140     * @param distribution reference distribution
141     * @param data sample being being evaluated
142     * @param exact whether or not to force exact computation of the p-value
143     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
144     *         {@code distribution}
145     * @throws InsufficientDataException if {@code data} does not have length at least 2
146     * @throws NullArgumentException if {@code data} is null
147     */
148    public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, boolean exact) {
149        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
150    }
151
152    /**
153     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
154     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
155     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
156     * each of the values in {@code data}.
157     *
158     * @param distribution reference distribution
159     * @param data sample being evaluated
160     * @return Kolmogorov-Smirnov statistic \(D_n\)
161     * @throws InsufficientDataException if {@code data} does not have length at least 2
162     * @throws NullArgumentException if {@code data} is null
163     */
164    public double kolmogorovSmirnovStatistic(ContinuousDistribution distribution, double[] data) {
165        checkArray(data);
166        final int n = data.length;
167        final double nd = n;
168        final double[] dataCopy = new double[n];
169        System.arraycopy(data, 0, dataCopy, 0, n);
170        Arrays.sort(dataCopy);
171        double d = 0d;
172        for (int i = 1; i <= n; i++) {
173            final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
174            final double currD = JdkMath.max(yi - (i - 1) / nd, i / nd - yi);
175            if (currD > d) {
176                d = currD;
177            }
178        }
179        return d;
180    }
181
182    /**
183     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
184     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
185     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
186     * probability distribution. Specifically, what is returned is an estimate of the probability
187     * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
188     * selected partition of the combined sample into subsamples of sizes {@code x.length} and
189     * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
190     * large as (if {@code strict} is {@code false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
191     *
192     * @param x first sample dataset.
193     * @param y second sample dataset.
194     * @param strict whether or not the probability to compute is expressed as
195     * a strict inequality (ignored for large samples).
196     * @return p-value associated with the null hypothesis that {@code x} and
197     * {@code y} represent samples from the same distribution.
198     * @throws InsufficientDataException if either {@code x} or {@code y} does
199     * not have length at least 2.
200     * @throws NullArgumentException if either {@code x} or {@code y} is null.
201     * @throws NotANumberException if the input arrays contain NaN values.
202     *
203     * @see #bootstrap(double[],double[],int,boolean,UniformRandomProvider)
204     */
205    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
206        final long lengthProduct = (long) x.length * y.length;
207        double[] xa = null;
208        double[] ya = null;
209        if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
210            xa = Arrays.copyOf(x, x.length);
211            ya = Arrays.copyOf(y, y.length);
212            fixTies(xa, ya);
213        } else {
214            xa = x;
215            ya = y;
216        }
217        if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
218            return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
219        }
220        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
221    }
222
223    /**
224     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
225     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
226     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
227     * probability distribution. Assumes the strict form of the inequality used to compute the
228     * p-value. See {@link #kolmogorovSmirnovTest(ContinuousDistribution, double[], boolean)}.
229     *
230     * @param x first sample dataset
231     * @param y second sample dataset
232     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
233     *         samples from the same distribution
234     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
235     *         least 2
236     * @throws NullArgumentException if either {@code x} or {@code y} is null
237     */
238    public double kolmogorovSmirnovTest(double[] x, double[] y) {
239        return kolmogorovSmirnovTest(x, y, true);
240    }
241
242    /**
243     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
244     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
245     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
246     * is the empirical distribution of the {@code y} values.
247     *
248     * @param x first sample
249     * @param y second sample
250     * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
251     *         {@code y} represent samples from the same underlying distribution
252     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
253     *         least 2
254     * @throws NullArgumentException if either {@code x} or {@code y} is null
255     */
256    public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
257        return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
258    }
259
260    /**
261     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
262     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
263     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
264     * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
265     * as long value.
266     *
267     * @param x first sample
268     * @param y second sample
269     * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
270     *         {@code y} represent samples from the same underlying distribution
271     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
272     *         least 2
273     * @throws NullArgumentException if either {@code x} or {@code y} is null
274     */
275    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
276        checkArray(x);
277        checkArray(y);
278        // Copy and sort the sample arrays
279        final double[] sx = Arrays.copyOf(x, x.length);
280        final double[] sy = Arrays.copyOf(y, y.length);
281        Arrays.sort(sx);
282        Arrays.sort(sy);
283        final int n = sx.length;
284        final int m = sy.length;
285
286        int rankX = 0;
287        int rankY = 0;
288        long curD = 0L;
289
290        // Find the max difference between cdf_x and cdf_y
291        long supD = 0L;
292        do {
293            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
294            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
295                rankX += 1;
296                curD += m;
297            }
298            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
299                rankY += 1;
300                curD -= n;
301            }
302            if (curD > supD) {
303                supD = curD;
304            } else if (-curD > supD) {
305                supD = -curD;
306            }
307        } while(rankX < n && rankY < m);
308        return supD;
309    }
310
311    /**
312     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
313     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
314     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
315     *
316     * @param distribution reference distribution
317     * @param data sample being being evaluated
318     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
319     *         {@code distribution}
320     * @throws InsufficientDataException if {@code data} does not have length at least 2
321     * @throws NullArgumentException if {@code data} is null
322     */
323    public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data) {
324        return kolmogorovSmirnovTest(distribution, data, false);
325    }
326
327    /**
328     * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
329     * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
330     *
331     * @param distribution reference distribution
332     * @param data sample being being evaluated
333     * @param alpha significance level of the test
334     * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
335     *         can be rejected with confidence 1 - {@code alpha}
336     * @throws InsufficientDataException if {@code data} does not have length at least 2
337     * @throws NullArgumentException if {@code data} is null
338     */
339    public boolean kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, double alpha) {
340        if (alpha <= 0 || alpha > 0.5) {
341            throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
342        }
343        return kolmogorovSmirnovTest(distribution, data) < alpha;
344    }
345
346    /**
347     * Estimates the <i>p-value</i> of a two-sample
348     * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov test</a>
349     * evaluating the null hypothesis that {@code x} and {@code y} are samples
350     * drawn from the same probability distribution.
351     * This method estimates the p-value by repeatedly sampling sets of size
352     * {@code x.length} and {@code y.length} from the empirical distribution
353     * of the combined sample.
354     * When {@code strict} is true, this is equivalent to the algorithm implemented
355     * in the R function {@code ks.boot}, described in <pre>
356     * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
357     * Software with Automated Balance Optimization: The Matching package for R.'
358     * Journal of Statistical Software, 42(7): 1-52.
359     * </pre>
360     *
361     * @param x First sample.
362     * @param y Second sample.
363     * @param iterations Number of bootstrap resampling iterations.
364     * @param strict Whether or not the null hypothesis is expressed as a strict inequality.
365     * @param rng RNG for creating the sampling sets.
366     * @return the estimated p-value.
367     */
368    public double bootstrap(double[] x,
369                            double[] y,
370                            int iterations,
371                            boolean strict,
372                            UniformRandomProvider rng) {
373        final int xLength = x.length;
374        final int yLength = y.length;
375        final double[] combined = new double[xLength + yLength];
376        System.arraycopy(x, 0, combined, 0, xLength);
377        System.arraycopy(y, 0, combined, xLength, yLength);
378        final ContinuousDistribution.Sampler sampler = new EnumeratedRealDistribution(combined).createSampler(rng);
379        final long d = integralKolmogorovSmirnovStatistic(x, y);
380        int greaterCount = 0;
381        int equalCount = 0;
382        double[] curX;
383        double[] curY;
384        long curD;
385        for (int i = 0; i < iterations; i++) {
386            curX = AbstractRealDistribution.sample(xLength, sampler);
387            curY = AbstractRealDistribution.sample(yLength, sampler);
388            curD = integralKolmogorovSmirnovStatistic(curX, curY);
389            if (curD > d) {
390                greaterCount++;
391            } else if (curD == d) {
392                equalCount++;
393            }
394        }
395        return strict ? greaterCount / (double) iterations :
396            (greaterCount + equalCount) / (double) iterations;
397    }
398
399    /**
400     * Calculates \(P(D_n &lt; d)\) using the method described in [1] with quick decisions for extreme
401     * values given in [2] (see above). The result is not exact as with
402     * {@link #cdfExact(double, int)} because calculations are based on
403     * {@code double} rather than {@link BigFraction}.
404     *
405     * @param d statistic
406     * @param n sample size
407     * @return \(P(D_n &lt; d)\)
408     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
409     * {@link BigFraction} in expressing {@code d} as
410     * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
411     */
412    public double cdf(double d, int n) {
413        return cdf(d, n, false);
414    }
415
416    /**
417     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
418     * used everywhere at the expense of very slow execution time. Almost never choose this in real
419     * applications unless you are very sure; this is almost solely for verification purposes.
420     * Normally, you would choose {@link #cdf(double, int)}. See the class
421     * javadoc for definitions and algorithm description.
422     *
423     * @param d statistic
424     * @param n sample size
425     * @return \(P(D_n &lt; d)\)
426     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
427     * {@link BigFraction} in expressing {@code d} as
428     * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
429     */
430    public double cdfExact(double d, int n) {
431        return cdf(d, n, true);
432    }
433
434    /**
435     * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
436     * values given in [2] (see above).
437     *
438     * @param d statistic
439     * @param n sample size
440     * @param exact whether the probability should be calculated exact using
441     *        {@link BigFraction} everywhere at the expense of
442     *        very slow execution time, or if {@code double} should be used convenient places to
443     *        gain speed. Almost never choose {@code true} in real applications unless you are very
444     *        sure; {@code true} is almost solely for verification purposes.
445     * @return \(P(D_n &lt; d)\)
446     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
447     * {@link BigFraction} in expressing {@code d} as
448     * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
449     */
450    public double cdf(double d, int n, boolean exact) {
451        final double ninv = 1 / ((double) n);
452        final double ninvhalf = 0.5 * ninv;
453
454        if (d <= ninvhalf) {
455            return 0;
456        } else if (ninvhalf < d && d <= ninv) {
457            double res = 1;
458            final double f = 2 * d - ninv;
459            // n! f^n = n*f * (n-1)*f * ... * 1*x
460            for (int i = 1; i <= n; ++i) {
461                res *= i * f;
462            }
463            return res;
464        } else if (1 - ninv <= d && d < 1) {
465            return 1 - 2 * Math.pow(1 - d, n);
466        } else if (1 <= d) {
467            return 1;
468        }
469        if (exact) {
470            return exactK(d, n);
471        }
472        if (n <= 140) {
473            return roundedK(d, n);
474        }
475        return pelzGood(d, n);
476    }
477
478    /**
479     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
480     * in class javadoc above) and {@link BigFraction} (see above).
481     *
482     * @param d statistic
483     * @param n sample size
484     * @return the two-sided probability of \(P(D_n &lt; d)\)
485     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
486     * {@link BigFraction}.
487     */
488    private double exactK(double d, int n) {
489        final int k = (int) Math.ceil(n * d);
490
491        final FieldDenseMatrix<BigFraction> h;
492        try {
493            h = createExactH(d, n);
494        } catch (ArithmeticException e) {
495            throw new MathArithmeticException(LocalizedFormats.FRACTION);
496        }
497
498        final FieldDenseMatrix<BigFraction> hPower = h.pow(n);
499
500        BigFraction pFrac = hPower.get(k - 1, k - 1);
501
502        for (int i = 1; i <= n; ++i) {
503            pFrac = pFrac.multiply(i).divide(n);
504        }
505
506        /*
507         * BigFraction.doubleValue converts numerator to double and the denominator to double and
508         * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
509         * digits):
510         */
511        return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue();
512    }
513
514    /**
515     * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
516     *
517     * @param d statistic
518     * @param n sample size
519     * @return \(P(D_n &lt; d)\)
520     */
521    private double roundedK(double d, int n) {
522
523        final int k = (int) Math.ceil(n * d);
524        final RealMatrix h = this.createRoundedH(d, n);
525        final RealMatrix hPower = h.power(n);
526
527        double pFrac = hPower.getEntry(k - 1, k - 1);
528        for (int i = 1; i <= n; ++i) {
529            pFrac *= (double) i / (double) n;
530        }
531
532        return pFrac;
533    }
534
535    /**
536     * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
537     *
538     * @param d value of d-statistic (x in [2])
539     * @param n sample size
540     * @return \(P(D_n &lt; d)\)
541     * @since 3.4
542     */
543    public double pelzGood(double d, int n) {
544        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
545        final double sqrtN = JdkMath.sqrt(n);
546        final double z = d * sqrtN;
547        final double z2 = d * d * n;
548        final double z4 = z2 * z2;
549        final double z6 = z4 * z2;
550        final double z8 = z4 * z4;
551
552        // Eventual return value
553        double ret = 0;
554
555        // Compute K_0(z)
556        double sum = 0;
557        double increment = 0;
558        double kTerm = 0;
559        double z2Term = PI_SQUARED / (8 * z2);
560        int k = 1;
561        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
562            kTerm = 2 * k - 1;
563            increment = JdkMath.exp(-z2Term * kTerm * kTerm);
564            sum += increment;
565            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
566                break;
567            }
568        }
569        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
570            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
571        }
572        ret = sum * JdkMath.sqrt(2 * JdkMath.PI) / z;
573
574        // K_1(z)
575        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
576        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
577        final double twoZ2 = 2 * z2;
578        sum = 0;
579        kTerm = 0;
580        double kTerm2 = 0;
581        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
582            kTerm = k + 0.5;
583            kTerm2 = kTerm * kTerm;
584            increment = (PI_SQUARED * kTerm2 - z2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
585            sum += increment;
586            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
587                break;
588            }
589        }
590        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
591            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
592        }
593        final double sqrtHalfPi = JdkMath.sqrt(JdkMath.PI / 2);
594        // Instead of doubling sum, divide by 3 instead of 6
595        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
596
597        // K_2(z)
598        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
599        final double z4Term = 2 * z4;
600        final double z6Term = 6 * z6;
601        z2Term = 5 * z2;
602        final double pi4 = PI_SQUARED * PI_SQUARED;
603        sum = 0;
604        kTerm = 0;
605        kTerm2 = 0;
606        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
607            kTerm = k + 0.5;
608            kTerm2 = kTerm * kTerm;
609            increment =  (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
610                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
611            sum += increment;
612            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
613                break;
614            }
615        }
616        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
617            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
618        }
619        double sum2 = 0;
620        kTerm2 = 0;
621        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
622            kTerm2 = (double) k * k;
623            increment = PI_SQUARED * kTerm2 * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
624            sum2 += increment;
625            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
626                break;
627            }
628        }
629        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
630            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
631        }
632        // Again, adjust coefficients instead of doubling sum, sum2
633        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
634
635        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
636        // Multiply coefficient denominators by 2, so omit doubling sums.
637        final double pi6 = pi4 * PI_SQUARED;
638        sum = 0;
639        double kTerm4 = 0;
640        double kTerm6 = 0;
641        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
642            kTerm = k + 0.5;
643            kTerm2 = kTerm * kTerm;
644            kTerm4 = kTerm2 * kTerm2;
645            kTerm6 = kTerm4 * kTerm2;
646            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
647                            PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
648                    JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
649            sum += increment;
650            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
651                break;
652            }
653        }
654        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
655            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
656        }
657        sum2 = 0;
658        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
659            kTerm2 = (double) k * k;
660            kTerm4 = kTerm2 * kTerm2;
661            increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
662                    JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
663            sum2 += increment;
664            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
665                break;
666            }
667        }
668        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
669            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
670        }
671        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
672                sum2 / (108 * z6));
673    }
674
675    /**
676     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
677     *
678     * @param d statistic
679     * @param n sample size
680     * @return H matrix
681     * @throws NumberIsTooLargeException if fractional part is greater than 1.
682     * @throws ArithmeticException if algorithm fails to convert {@code h} to a
683     * {@link BigFraction}.
684     */
685    private FieldDenseMatrix<BigFraction> createExactH(double d,
686                                                       int n) {
687        final int k = (int) Math.ceil(n * d);
688        final int m = 2 * k - 1;
689        final double hDouble = k - n * d;
690        if (hDouble >= 1) {
691            throw new NumberIsTooLargeException(hDouble, 1.0, false);
692        }
693        BigFraction h = null;
694        try {
695            h = BigFraction.from(hDouble, 1e-20, 10000);
696        } catch (final ArithmeticException e1) {
697            try {
698                h = BigFraction.from(hDouble, 1e-10, 10000);
699            } catch (final ArithmeticException e2) {
700                h = BigFraction.from(hDouble, 1e-5, 10000);
701            }
702        }
703        final FieldDenseMatrix<BigFraction> hData = FieldDenseMatrix.create(BigFractionField.get(), m, m);
704
705        /*
706         * Start by filling everything with either 0 or 1.
707         */
708        for (int i = 0; i < m; ++i) {
709            for (int j = 0; j < m; ++j) {
710                if (i - j + 1 < 0) {
711                    hData.set(i, j, BigFraction.ZERO);
712                } else {
713                    hData.set(i, j, BigFraction.ONE);
714                }
715            }
716        }
717
718        /*
719         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
720         * hPowers[m-1] = h^m
721         */
722        final BigFraction[] hPowers = new BigFraction[m];
723        hPowers[0] = h;
724        for (int i = 1; i < m; i++) {
725            hPowers[i] = h.multiply(hPowers[i - 1]);
726        }
727
728        /*
729         * First column and last row has special values (each other reversed).
730         */
731        for (int i = 0; i < m; ++i) {
732            hData.set(i, 0,
733                      hData.get(i, 0).subtract(hPowers[i]));
734            hData.set(m - 1, i,
735                      hData.get(m - 1, i).subtract(hPowers[m - i - 1]));
736        }
737
738        /*
739         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
740         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
741         */
742        if (h.compareTo(ONE_HALF) > 0) {
743            hData.set(m - 1, 0,
744                      hData.get(m - 1, 0).add(h.multiply(2).subtract(1).pow(m)));
745        }
746
747        /*
748         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
749         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
750         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
751         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
752         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
753         * really necessary.
754         */
755        for (int i = 0; i < m; ++i) {
756            for (int j = 0; j < i + 1; ++j) {
757                if (i - j + 1 > 0) {
758                    for (int g = 2; g <= i - j + 1; ++g) {
759                        hData.set(i, j,
760                                  hData.get(i, j).divide(g));
761                    }
762                }
763            }
764        }
765        return hData;
766    }
767
768    /***
769     * Creates {@code H} of size {@code m x m} as described in [1] (see above)
770     * using double-precision.
771     *
772     * @param d statistic
773     * @param n sample size
774     * @return H matrix
775     * @throws NumberIsTooLargeException if fractional part is greater than 1
776     */
777    private RealMatrix createRoundedH(double d, int n)
778        throws NumberIsTooLargeException {
779
780        final int k = (int) Math.ceil(n * d);
781        final int m = 2 * k - 1;
782        final double h = k - n * d;
783        if (h >= 1) {
784            throw new NumberIsTooLargeException(h, 1.0, false);
785        }
786        final double[][] hData = new double[m][m];
787
788        /*
789         * Start by filling everything with either 0 or 1.
790         */
791        for (int i = 0; i < m; ++i) {
792            for (int j = 0; j < m; ++j) {
793                if (i - j + 1 < 0) {
794                    hData[i][j] = 0;
795                } else {
796                    hData[i][j] = 1;
797                }
798            }
799        }
800
801        /*
802         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
803         * hPowers[m-1] = h^m
804         */
805        final double[] hPowers = new double[m];
806        hPowers[0] = h;
807        for (int i = 1; i < m; ++i) {
808            hPowers[i] = h * hPowers[i - 1];
809        }
810
811        /*
812         * First column and last row has special values (each other reversed).
813         */
814        for (int i = 0; i < m; ++i) {
815            hData[i][0] = hData[i][0] - hPowers[i];
816            hData[m - 1][i] -= hPowers[m - i - 1];
817        }
818
819        /*
820         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
821         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
822         */
823        if (Double.compare(h, 0.5) > 0) {
824            hData[m - 1][0] += JdkMath.pow(2 * h - 1, m);
825        }
826
827        /*
828         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
829         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
830         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
831         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
832         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
833         * really necessary.
834         */
835        for (int i = 0; i < m; ++i) {
836            for (int j = 0; j < i + 1; ++j) {
837                if (i - j + 1 > 0) {
838                    for (int g = 2; g <= i - j + 1; ++g) {
839                        hData[i][j] /= g;
840                    }
841                }
842            }
843        }
844        return MatrixUtils.createRealMatrix(hData);
845    }
846
847    /**
848     * Verifies that {@code array} has length at least 2.
849     *
850     * @param array array to test
851     * @throws NullArgumentException if array is null
852     * @throws InsufficientDataException if array is too short
853     */
854    private void checkArray(double[] array) {
855        if (array == null) {
856            throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
857        }
858        if (array.length < 2) {
859            throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
860                                                2);
861        }
862    }
863
864    /**
865     * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
866     * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
867     * have been computed. If the sum does not converge before {@code maxIterations} iterations a
868     * {@link TooManyIterationsException} is thrown.
869     *
870     * @param t argument
871     * @param tolerance Cauchy criterion for partial sums
872     * @param maxIterations maximum number of partial sums to compute
873     * @return Kolmogorov sum evaluated at t
874     * @throws TooManyIterationsException if the series does not converge
875     */
876    public double ksSum(double t, double tolerance, int maxIterations) {
877        if (t == 0.0) {
878            return 0.0;
879        }
880
881        // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
882        // from class javadoc should be used.
883
884        final double x = -2 * t * t;
885        int sign = -1;
886        long i = 1;
887        double partialSum = 0.5d;
888        double delta = 1;
889        while (delta > tolerance && i < maxIterations) {
890            delta = JdkMath.exp(x * i * i);
891            partialSum += sign * delta;
892            sign *= -1;
893            i++;
894        }
895        if (i == maxIterations) {
896            throw new TooManyIterationsException(maxIterations);
897        }
898        return partialSum * 2;
899    }
900
901    /**
902     * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
903     * an integral d-statistic in the range [0, n*m] is calculated, that can be used for
904     * comparison with other integral d-statistics. Depending whether {@code strict} is
905     * {@code true} or not, the returned value divided by (n*m) is greater than
906     * (resp greater than or equal to) the given d value (allowing some tolerance).
907     *
908     * @param d a d-statistic in the range [0, 1]
909     * @param n first sample size
910     * @param m second sample size
911     * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
912     * @return the integral d-statistic in the range [0, n*m]
913     */
914    private static long calculateIntegralD(double d, int n, int m, boolean strict) {
915        final double tol = 1e-12;  // d-values within tol of one another are considered equal
916        long nm = n * (long)m;
917        long upperBound = (long)JdkMath.ceil((d - tol) * nm);
918        long lowerBound = (long)JdkMath.floor((d + tol) * nm);
919        if (strict && lowerBound == upperBound) {
920            return upperBound + 1L;
921        } else {
922            return upperBound;
923        }
924    }
925
926    /**
927     * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
928     * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
929     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
930     * <p>
931     * The returned probability is exact, implemented by unwinding the recursive function
932     * definitions presented in [4] (class javadoc).
933     * </p>
934     *
935     * @param d D-statistic value
936     * @param n first sample size
937     * @param m second sample size
938     * @param strict whether or not the probability to compute is expressed as a strict inequality
939     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
940     *         greater than (resp. greater than or equal to) {@code d}
941     */
942    public double exactP(double d, int n, int m, boolean strict) {
943       return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
944           BinomialCoefficientDouble.value(n + m, m);
945    }
946
947    /**
948     * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
949     * is the 2-sample Kolmogorov-Smirnov statistic. See
950     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
951     * <p>
952     * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
953     * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
954     * details on how convergence of the sum is determined.
955     * </p>
956     *
957     * @param d D-statistic value
958     * @param n first sample size
959     * @param m second sample size
960     * @return approximate probability that a randomly selected m-n partition of m + n generates
961     *         \(D_{n,m}\) greater than {@code d}
962     */
963    public double approximateP(double d, int n, int m) {
964        return 1 - ksSum(d * JdkMath.sqrt(((double) m * (double) n) / ((double) m + (double) n)),
965                         KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
966    }
967
968    /**
969     * Fills a boolean array randomly with a fixed number of {@code true} values.
970     * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
971     * By processing first the {@code true} values followed by the remaining {@code false} values
972     * less random numbers need to be generated. The method is optimized for the case
973     * that the number of {@code true} values is larger than or equal to the number of
974     * {@code false} values.
975     *
976     * @param b boolean array
977     * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
978     * @param rng random data generator
979     */
980    private static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final UniformRandomProvider rng) {
981        Arrays.fill(b, true);
982        for (int k = numberOfTrueValues; k < b.length; k++) {
983            final int r = rng.nextInt(k + 1);
984            b[(b[r]) ? r : k] = false;
985        }
986    }
987
988    /**
989     * Uses Monte Carlo simulation to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\) is the
990     * 2-sample Kolmogorov-Smirnov statistic. See
991     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
992     * <p>
993     * The simulation generates {@code iterations} random partitions of {@code m + n} into an
994     * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
995     * the proportion of values that are greater than {@code d}, or greater than or equal to
996     * {@code d} if {@code strict} is {@code false}.
997     * </p>
998     *
999     * @param d D-statistic value.
1000     * @param n First sample size.
1001     * @param m Second sample size.
1002     * @param iterations Number of random partitions to generate.
1003     * @param strict whether or not the probability to compute is expressed as a strict inequality
1004     * @param rng RNG used for generating the partitions.
1005     * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1006     * greater than (resp. greater than or equal to) {@code d}.
1007     */
1008    public double monteCarloP(final double d,
1009                              final int n,
1010                              final int m,
1011                              final boolean strict,
1012                              final int iterations,
1013                              UniformRandomProvider rng) {
1014        return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations, rng);
1015    }
1016
1017    /**
1018     * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d / (n * m))\)
1019     * where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
1020     * <p>
1021     * Here {@code d} is the D-statistic represented as long value.
1022     * The real D-statistic is obtained by dividing {@code d} by {@code n * m}.
1023     * See also {@link #monteCarloP(double,int,int,boolean,int,UniformRandomProvider)}.
1024     *
1025     * @param d Integral D-statistic.
1026     * @param n First sample size.
1027     * @param m Second sample size.
1028     * @param iterations Number of random partitions to generate.
1029     * @param rng RNG used for generating the partitions.
1030     * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1031     * greater than or equal to {@code d / (n * m))}.
1032     */
1033    private double integralMonteCarloP(final long d,
1034                                       final int n,
1035                                       final int m,
1036                                       final int iterations,
1037                                       UniformRandomProvider rng) {
1038        // ensure that nn is always the max of (n, m) to require fewer random numbers
1039        final int nn = JdkMath.max(n, m);
1040        final int mm = JdkMath.min(n, m);
1041        final int sum = nn + mm;
1042
1043        int tail = 0;
1044        final boolean[] b = new boolean[sum];
1045        for (int i = 0; i < iterations; i++) {
1046            fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
1047            long curD = 0L;
1048            for(int j = 0; j < b.length; ++j) {
1049                if (b[j]) {
1050                    curD += mm;
1051                    if (curD >= d) {
1052                        tail++;
1053                        break;
1054                    }
1055                } else {
1056                    curD -= nn;
1057                    if (curD <= -d) {
1058                        tail++;
1059                        break;
1060                    }
1061                }
1062            }
1063        }
1064        return (double) tail / iterations;
1065    }
1066
1067    /**
1068     * If there are no ties in the combined dataset formed from x and y,
1069     * this method is a no-op.
1070     * If there are ties, a uniform random deviate in
1071     * is added to each value in x and y, and this method overwrites the
1072     * data in x and y with the jittered values.
1073     *
1074     * @param x First sample.
1075     * @param y Second sample.
1076     * @throw NotANumberException if any of the input arrays contain
1077     * a NaN value.
1078     */
1079    private static void fixTies(double[] x, double[] y) {
1080        if (hasTies(x, y)) {
1081            // Add jitter using a fixed seed (so same arguments always give same results),
1082            // low-initialization-overhead generator.
1083            final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(876543217L);
1084
1085            // It is theoretically possible that jitter does not break ties, so repeat
1086            // until all ties are gone.  Bound the loop and throw MIE if bound is exceeded.
1087            int fixedRangeTrials = 10;
1088            int jitterRange = 10;
1089            int ct = 0;
1090            boolean ties = true;
1091            do {
1092                // Nudge the data using a fixed range.
1093                jitter(x, rng, jitterRange);
1094                jitter(y, rng, jitterRange);
1095                ties = hasTies(x, y);
1096                ++ct;
1097            } while (ties && ct < fixedRangeTrials);
1098
1099            if (ties) {
1100                throw new MaxCountExceededException(fixedRangeTrials);
1101            }
1102        }
1103    }
1104
1105    /**
1106     * Returns true iff there are ties in the combined sample formed from
1107     * x and y.
1108     *
1109     * @param x First sample.
1110     * @param y Second sample.
1111     * @return true if x and y together contain ties.
1112     * @throw InsufficientDataException if the input arrays only contain infinite values.
1113     * @throw NotANumberException if any of the input arrays contain a NaN value.
1114     */
1115    private static boolean hasTies(double[] x, double[] y) {
1116       final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
1117
1118       // "unique" moves NaN to the head of the output array.
1119       if (Double.isNaN(values[0])) {
1120           throw new NotANumberException();
1121       }
1122
1123       int infCount = 0;
1124       for (double v : values) {
1125           if (Double.isInfinite(v)) {
1126               ++infCount;
1127           }
1128       }
1129       if (infCount == values.length) {
1130           throw new InsufficientDataException();
1131       }
1132
1133        return values.length != x.length + y.length;  // There are no ties.
1134    }
1135
1136    /**
1137     * Adds random jitter to {@code data} using deviates sampled from {@code dist}.
1138     * <p>
1139     * Note that jitter is applied in-place - i.e., the array
1140     * values are overwritten with the result of applying jitter.</p>
1141     *
1142     * @param data input/output data array - entries overwritten by the method
1143     * @param rng probability distribution to sample for jitter values
1144     * @param ulp ulp used when generating random numbers
1145     * @throws NullPointerException if either of the parameters is null
1146     */
1147    private static void jitter(double[] data,
1148                               UniformRandomProvider rng,
1149                               int ulp) {
1150        final int range = ulp * 2;
1151        for (int i = 0; i < data.length; i++) {
1152            final double orig = data[i];
1153            if (Double.isInfinite(orig)) {
1154                continue; // Avoid NaN creation.
1155            }
1156
1157            final int rand = rng.nextInt(range) - ulp;
1158            final double ulps = Math.ulp(orig);
1159            final double r = rand * ulps;
1160
1161            data[i] += r;
1162        }
1163    }
1164
1165    /**
1166     * The function C(i, j) defined in [4] (class javadoc), formula (5.5).
1167     * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
1168     * and recoded as a long to avoid rounding errors in comparison tests, so what
1169     * is actually tested is |im - jn| <= cmn.
1170     *
1171     * @param i first path parameter
1172     * @param j second path paramter
1173     * @param m first sample size
1174     * @param n second sample size
1175     * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1176     * @param strict whether or not the null hypothesis uses strict inequality
1177     * @return C(i,j) for given m, n, c
1178     */
1179    private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
1180        if (strict) {
1181            return JdkMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
1182        }
1183        return JdkMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
1184    }
1185
1186    /**
1187     * The function N(i, j) defined in [4] (class javadoc).
1188     * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
1189     * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
1190     * The return value is integral, but subject to overflow, so it is maintained and
1191     * returned as a double.
1192     *
1193     * @param i first path parameter
1194     * @param j second path parameter
1195     * @param m first sample size
1196     * @param n second sample size
1197     * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1198     * @param strict whether or not the null hypothesis uses strict inequality
1199     * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
1200     */
1201    private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
1202        /*
1203         * Unwind the recursive definition given in [4].
1204         * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
1205         * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
1206         */
1207        final double[] lag = new double[n];
1208        double last = 0;
1209        for (int k = 0; k < n; k++) {
1210            lag[k] = c(0, k + 1, m, n, cnm, strict);
1211        }
1212        for (int k = 1; k <= i; k++) {
1213            last = c(k, 0, m, n, cnm, strict);
1214            for (int l = 1; l <= j; l++) {
1215                lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
1216                last = lag[l - 1];
1217            }
1218        }
1219        return last;
1220    }
1221}