/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.inference;

import java.math.BigDecimal;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.commons.math3.distribution.EnumeratedRealDistribution;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.distribution.UniformRealDistribution;
import org.apache.commons.math3.exception.InsufficientDataException;
import org.apache.commons.math3.exception.MathArithmeticException;
import org.apache.commons.math3.exception.MathInternalError;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.NumberIsTooLargeException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.TooManyIterationsException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.fraction.BigFraction;
import org.apache.commons.math3.fraction.BigFractionField;
import org.apache.commons.math3.fraction.FractionConversionException;
import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
import org.apache.commons.math3.linear.FieldMatrix;
import org.apache.commons.math3.linear.MatrixUtils;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.random.JDKRandomGenerator;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.MathArrays;
import org.apache.commons.math3.util.MathUtils;

/**
 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
 * <p>
 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
 * given in [2].
 * </p>
 * <p>
 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
 * follows:
 * <ul>
 * <li>For small samples (where the product of the sample sizes is less than
 * {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
 * exact p-value for the 2-sample test.</li>
 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
 * the approximation.</li>
 * </ul></p><p>
 * If the product of the sample sizes is less than {@value #LARGE_SAMPLE_PRODUCT} and the sample
 * data contains ties, random jitter is added to the sample data to break ties before applying
 * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)}
 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
 * in the R Matching package [3], can be used if ties are known to be present in the data.
 * </p>
 * <p>
 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
 * {@code strict} parameter. This parameter is ignored for large samples.
 * </p>
 * <p>
 * The methods used by the 2-sample default implementation are also exposed directly:
 * <ul>
 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution. The {@code boolean}
 * arguments in the first two methods allow the probability used to estimate the p-value to be
 * expressed using strict or non-strict inequality. See
 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
 * </ul>
 * </p>
 * <p>
 * References:
 * <ul>
 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
 * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
 * Chapter 5, 3rd Ed. Academic Press.</li>
 * </ul>
 * <br/>
 * Note that [1] contains an error in computing h, refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 * </p>
 *
 * @since 3.3
 */
public class KolmogorovSmirnovTest {

    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
     */
    protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;

    /** Convergence criterion for {@link #ksSum(double, double, int)} */
    protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;

    /** Convergence criterion for the sums in {@link #pelzGood(double, int)} */
    protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;

    /** No longer used. */
    @Deprecated
    protected static final int SMALL_SAMPLE_PRODUCT = 200;

    /**
     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
     * distribution to compute the p-value.
     */
    protected static final int LARGE_SAMPLE_PRODUCT = 10000;

    /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}.
     *  Deprecated as of version 3.6, as this method is no longer needed. */
    @Deprecated
    protected static final int MONTE_CARLO_ITERATIONS = 1000000;

    /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
    private final RandomGenerator rng;

    /**
     * Construct a KolmogorovSmirnovTest instance with a default random data generator.
     */
    public KolmogorovSmirnovTest() {
        rng = new Well19937c();
    }

    /**
     * Construct a KolmogorovSmirnovTest with the provided random data generator.
     * The {@link #monteCarloP(double, int, int, boolean, int)} method that uses the generator
     * supplied to this constructor is deprecated as of version 3.6.
     *
     * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
     */
    @Deprecated
    public KolmogorovSmirnovTest(RandomGenerator rng) {
        this.rng = rng;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
     * {@code exact} is true, the distribution used to compute the p-value is computed using
     * extended precision. See {@link #cdfExact(double, int)}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param exact whether or not to force exact computation of the p-value
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
        // p-value is the complement of the c.d.f. of D_n evaluated at the observed statistic
        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
    }

    /**
     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
     * each of the values in {@code data}.
194 * 195 * @param distribution reference distribution 196 * @param data sample being evaluated 197 * @return Kolmogorov-Smirnov statistic \(D_n\) 198 * @throws InsufficientDataException if {@code data} does not have length at least 2 199 * @throws NullArgumentException if {@code data} is null 200 */ 201 public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) { 202 checkArray(data); 203 final int n = data.length; 204 final double nd = n; 205 final double[] dataCopy = new double[n]; 206 System.arraycopy(data, 0, dataCopy, 0, n); 207 Arrays.sort(dataCopy); 208 double d = 0d; 209 for (int i = 1; i <= n; i++) { 210 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); 211 final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi); 212 if (currD > d) { 213 d = currD; 214 } 215 } 216 return d; 217 } 218 219 /** 220 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 221 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 222 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 223 * probability distribution. Specifically, what is returned is an estimate of the probability 224 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly 225 * selected partition of the combined sample into subsamples of sizes {@code x.length} and 226 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as 227 * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}. 228 * <ul> 229 * <li>For small samples (where the product of the sample sizes is less than 230 * {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented 231 * in [4], implemented in {@link #exactP(double, int, int, boolean)}. 
     * </li>
     * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
     * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
     * for details on the approximation.</li>
     * </ul><p>
     * If {@code x.length * y.length} &lt; {@value #LARGE_SAMPLE_PRODUCT} and the combined set of values in
     * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
     * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
     * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
     * values in the combined sample.</p>
     * <p>
     * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
     * may be used as an alternative method for estimating the p-value.</p>
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     *        (ignored for large samples)
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     * @see #bootstrap(double[], double[], int, boolean)
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
        final long lengthProduct = (long) x.length * y.length;
        double[] xa = null;
        double[] ya = null;
        // Only the small-sample (exact) path is sensitive to ties, so jitter is applied
        // to copies of the data only in that case; otherwise the originals are used as-is.
        if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
            xa = MathArrays.copyOf(x);
            ya = MathArrays.copyOf(y);
            fixTies(xa, ya);
        } else {
            xa = x;
            ya = y;
        }
        if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
            // Exact p-value from the (possibly jittered) samples
            return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
        }
        // Large samples: asymptotic distribution; strict is ignored here
        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Assumes the strict form of the inequality used to compute the
     * p-value. See {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y) {
        return kolmogorovSmirnovTest(x, y, true);
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values.
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
        // The integral statistic is n*m*D_{n,m}; divide by n*m (as long, to avoid int overflow).
        return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
     * as long value.
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = MathArrays.copyOf(x);
        final double[] sy = MathArrays.copyOf(y);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        // rankX/rankY count how many values of each sorted sample are <= the current point z;
        // curD tracks n*m*(F_n(z) - F_m(z)): each consumed x adds m, each consumed y subtracts n.
        int rankX = 0;
        int rankY = 0;
        long curD = 0l;

        // Find the max difference between cdf_x and cdf_y
        long supD = 0l;
        do {
            // Advance through the next distinct value in the merged order
            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                rankX += 1;
                curD += m;
            }
            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                rankY += 1;
                curD -= n;
            }
            // Track the supremum of |curD|
            if (curD > supD) {
                supD = curD;
            }
            else if (-curD > supD) {
                supD = -curD;
            }
        } while(rankX < n && rankY < m);
        return supD;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
        // Default to the fast, double-precision c.d.f. computation
        return kolmogorovSmirnovTest(distribution, data, false);
    }

    /**
     * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
     * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param alpha significance level of the test
     * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
     *         can be rejected with confidence 1 - {@code alpha}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
        }
        return kolmogorovSmirnovTest(distribution, data) < alpha;
    }

    /**
     * Estimates the <i>p-value</i> of a two-sample
     * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. This method estimates the p-value by repeatedly sampling sets of size
     * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
     * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
     * {@code ks.boot}, described in <pre>
     * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
     * Software with Automated Balance Optimization: The Matching package for R.'
     * Journal of Statistical Software, 42(7): 1-52.
     * </pre>
     * @param x first sample
     * @param y second sample
     * @param iterations number of bootstrap resampling iterations
     * @param strict whether or not the null hypothesis is expressed as a strict inequality
     * @return estimated p-value
     */
    public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
        final int xLength = x.length;
        final int yLength = y.length;
        // Pool both samples; resampling from the pooled empirical distribution
        // realizes the null hypothesis of a common underlying distribution.
        final double[] combined = new double[xLength + yLength];
        System.arraycopy(x, 0, combined, 0, xLength);
        System.arraycopy(y, 0, combined, xLength, yLength);
        final EnumeratedRealDistribution dist = new EnumeratedRealDistribution(rng, combined);
        // Observed integral statistic n*m*D_{n,m}; exact long comparison avoids float ties
        final long d = integralKolmogorovSmirnovStatistic(x, y);
        int greaterCount = 0;
        int equalCount = 0;
        double[] curX;
        double[] curY;
        long curD;
        for (int i = 0; i < iterations; i++) {
            curX = dist.sample(xLength);
            curY = dist.sample(yLength);
            curD = integralKolmogorovSmirnovStatistic(curX, curY);
            if (curD > d) {
                greaterCount++;
            } else if (curD == d) {
                equalCount++;
            }
        }
        // strict: proportion of resampled statistics strictly greater than the observed one;
        // non-strict additionally counts exact ties with the observed statistic
        return strict ? greaterCount / (double) iterations :
            (greaterCount + equalCount) / (double) iterations;
    }

    /**
     * Computes {@code bootstrap(x, y, iterations, true)}.
     * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
     * package function. See {@link #bootstrap(double[], double[], int, boolean)}.
     *
     * @param x first sample
     * @param y second sample
     * @param iterations number of bootstrap resampling iterations
     * @return estimated p-value
     */
    public double bootstrap(double[] x, double[] y, int iterations) {
        // strict = true matches the ks.boot default
        return bootstrap(x, y, iterations, true);
    }

    /**
     * Calculates \(P(D_n &lt; d)\) using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above). The result is not exact as with
     * {@link #cdfExact(double, int)} because calculations are based on
     * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
     */
    public double cdf(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, false);
    }

    /**
     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
     * used everywhere at the expense of very slow execution time. Almost never choose this in real
     * applications unless you are very sure; this is almost solely for verification purposes.
     * Normally, you would choose {@link #cdf(double, int)}. See the class
     * javadoc for definitions and algorithm description.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
     */
    public double cdfExact(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, true);
    }

    /**
     * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
     * values given in [2] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @param exact whether the probability should be calculated exact using
     *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
     *        very slow execution time, or if {@code double} should be used convenient places to
     *        gain speed. Almost never choose {@code true} in real applications unless you are very
     *        sure; {@code true} is almost solely for verification purposes.
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
     */
    public double cdf(double d, int n, boolean exact)
        throws MathArithmeticException {

        final double ninv = 1 / ((double) n);
        final double ninvhalf = 0.5 * ninv;

        if (d <= ninvhalf) {
            // d cannot be smaller than 1/(2n): probability is 0
            return 0;
        } else if (ninvhalf < d && d <= ninv) {
            // Extreme left region: closed form n! * (2d - 1/n)^n, computed incrementally
            double res = 1;
            final double f = 2 * d - ninv;
            // n! f^n = n*f * (n-1)*f * ... * 1*f
            for (int i = 1; i <= n; ++i) {
                res *= i * f;
            }
            return res;
        } else if (1 - ninv <= d && d < 1) {
            // Extreme right tail: closed form from [2]
            return 1 - 2 * Math.pow(1 - d, n);
        } else if (1 <= d) {
            return 1;
        }
        if (exact) {
            return exactK(d, n);
        }
        if (n <= 140) {
            return roundedK(d, n);
        }
        // For large n, use the Pelz-Good asymptotic approximation
        return pelzGood(d, n);
    }

    /**
     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
     * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
     * above).
     *
     * @param d statistic
     * @param n sample size
     * @return the two-sided probability of \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
     */
    private double exactK(double d, int n)
        throws MathArithmeticException {

        final int k = (int) Math.ceil(n * d);

        final FieldMatrix<BigFraction> H = this.createExactH(d, n);
        final FieldMatrix<BigFraction> Hpower = H.power(n);

        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);

        // Multiply by n!/n^n one factor at a time to keep the fraction manageable
        for (int i = 1; i <= n; ++i) {
            pFrac = pFrac.multiply(i).divide(n);
        }

        /*
         * BigFraction.doubleValue converts numerator to double and the denominator to double and
         * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
         * digits):
         */
        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
    }

    /**
     * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     */
    private double roundedK(double d, int n) {

        final int k = (int) Math.ceil(n * d);
        final RealMatrix H = this.createRoundedH(d, n);
        final RealMatrix Hpower = H.power(n);

        double pFrac = Hpower.getEntry(k - 1, k - 1);
        // Multiply by n!/n^n one factor at a time to limit overflow/underflow
        for (int i = 1; i <= n; ++i) {
            pFrac *= (double) i / (double) n;
        }

        return pFrac;
    }

    /**
     * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @since 3.4
     */
    public double pelzGood(double d, int n) {
        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = FastMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;
        final double z4 = z2 * z2;
        final double z6 = z4 * z2;
        final double z8 = z4 * z4;

        // Eventual return value
        double ret = 0;

        // Compute K_0(z)
        double sum = 0;
        double increment = 0;
        double kTerm = 0;
        double z2Term = MathUtils.PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = 2 * k - 1;
            increment = FastMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            // Terms are positive and decreasing; stop when relatively negligible
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        kTerm = 0;
        double kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
        sum = 0;
        kTerm = 0;
        kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double sum2 = 0;
        kTerm2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * MathUtils.PI_SQUARED;
        sum = 0;
        double kTerm4 = 0;
        double kTerm6 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            kTerm4 = kTerm2 * kTerm2;
            kTerm6 = kTerm4 * kTerm2;
            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                    MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            kTerm4 = kTerm2 * kTerm2;
            increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                + sum2 / (108 * z6));

    }

    /***
     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
738 * 739 * @param d statistic 740 * @param n sample size 741 * @return H matrix 742 * @throws NumberIsTooLargeException if fractional part is greater than 1 743 * @throws FractionConversionException if algorithm fails to convert {@code h} to a 744 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 745 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\). 746 */ 747 private FieldMatrix<BigFraction> createExactH(double d, int n) 748 throws NumberIsTooLargeException, FractionConversionException { 749 750 final int k = (int) Math.ceil(n * d); 751 final int m = 2 * k - 1; 752 final double hDouble = k - n * d; 753 if (hDouble >= 1) { 754 throw new NumberIsTooLargeException(hDouble, 1.0, false); 755 } 756 BigFraction h = null; 757 try { 758 h = new BigFraction(hDouble, 1.0e-20, 10000); 759 } catch (final FractionConversionException e1) { 760 try { 761 h = new BigFraction(hDouble, 1.0e-10, 10000); 762 } catch (final FractionConversionException e2) { 763 h = new BigFraction(hDouble, 1.0e-5, 10000); 764 } 765 } 766 final BigFraction[][] Hdata = new BigFraction[m][m]; 767 768 /* 769 * Start by filling everything with either 0 or 1. 770 */ 771 for (int i = 0; i < m; ++i) { 772 for (int j = 0; j < m; ++j) { 773 if (i - j + 1 < 0) { 774 Hdata[i][j] = BigFraction.ZERO; 775 } else { 776 Hdata[i][j] = BigFraction.ONE; 777 } 778 } 779 } 780 781 /* 782 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 783 * hPowers[m-1] = h^m 784 */ 785 final BigFraction[] hPowers = new BigFraction[m]; 786 hPowers[0] = h; 787 for (int i = 1; i < m; ++i) { 788 hPowers[i] = h.multiply(hPowers[i - 1]); 789 } 790 791 /* 792 * First column and last row has special values (each other reversed). 
793 */ 794 for (int i = 0; i < m; ++i) { 795 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); 796 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); 797 } 798 799 /* 800 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 801 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 802 */ 803 if (h.compareTo(BigFraction.ONE_HALF) == 1) { 804 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); 805 } 806 807 /* 808 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 809 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 810 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 811 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 812 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 813 * really necessary. 814 */ 815 for (int i = 0; i < m; ++i) { 816 for (int j = 0; j < i + 1; ++j) { 817 if (i - j + 1 > 0) { 818 for (int g = 2; g <= i - j + 1; ++g) { 819 Hdata[i][j] = Hdata[i][j].divide(g); 820 } 821 } 822 } 823 } 824 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); 825 } 826 827 /*** 828 * Creates {@code H} of size {@code m x m} as described in [1] (see above) 829 * using double-precision. 830 * 831 * @param d statistic 832 * @param n sample size 833 * @return H matrix 834 * @throws NumberIsTooLargeException if fractional part is greater than 1 835 */ 836 private RealMatrix createRoundedH(double d, int n) 837 throws NumberIsTooLargeException { 838 839 final int k = (int) Math.ceil(n * d); 840 final int m = 2 * k - 1; 841 final double h = k - n * d; 842 if (h >= 1) { 843 throw new NumberIsTooLargeException(h, 1.0, false); 844 } 845 final double[][] Hdata = new double[m][m]; 846 847 /* 848 * Start by filling everything with either 0 or 1. 
849 */ 850 for (int i = 0; i < m; ++i) { 851 for (int j = 0; j < m; ++j) { 852 if (i - j + 1 < 0) { 853 Hdata[i][j] = 0; 854 } else { 855 Hdata[i][j] = 1; 856 } 857 } 858 } 859 860 /* 861 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 862 * hPowers[m-1] = h^m 863 */ 864 final double[] hPowers = new double[m]; 865 hPowers[0] = h; 866 for (int i = 1; i < m; ++i) { 867 hPowers[i] = h * hPowers[i - 1]; 868 } 869 870 /* 871 * First column and last row has special values (each other reversed). 872 */ 873 for (int i = 0; i < m; ++i) { 874 Hdata[i][0] = Hdata[i][0] - hPowers[i]; 875 Hdata[m - 1][i] -= hPowers[m - i - 1]; 876 } 877 878 /* 879 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 880 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 881 */ 882 if (Double.compare(h, 0.5) > 0) { 883 Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m); 884 } 885 886 /* 887 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 888 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 889 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 890 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 891 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 892 * really necessary. 893 */ 894 for (int i = 0; i < m; ++i) { 895 for (int j = 0; j < i + 1; ++j) { 896 if (i - j + 1 > 0) { 897 for (int g = 2; g <= i - j + 1; ++g) { 898 Hdata[i][j] /= g; 899 } 900 } 901 } 902 } 903 return MatrixUtils.createRealMatrix(Hdata); 904 } 905 906 /** 907 * Verifies that {@code array} has length at least 2. 
908 * 909 * @param array array to test 910 * @throws NullArgumentException if array is null 911 * @throws InsufficientDataException if array is too short 912 */ 913 private void checkArray(double[] array) { 914 if (array == null) { 915 throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED); 916 } 917 if (array.length < 2) { 918 throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length, 919 2); 920 } 921 } 922 923 /** 924 * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial 925 * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums 926 * have been computed. If the sum does not converge before {@code maxIterations} iterations a 927 * {@link TooManyIterationsException} is thrown. 928 * 929 * @param t argument 930 * @param tolerance Cauchy criterion for partial sums 931 * @param maxIterations maximum number of partial sums to compute 932 * @return Kolmogorov sum evaluated at t 933 * @throws TooManyIterationsException if the series does not converge 934 */ 935 public double ksSum(double t, double tolerance, int maxIterations) { 936 if (t == 0.0) { 937 return 0.0; 938 } 939 940 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1] 941 // from class javadoc should be used. 942 943 final double x = -2 * t * t; 944 int sign = -1; 945 long i = 1; 946 double partialSum = 0.5d; 947 double delta = 1; 948 while (delta > tolerance && i < maxIterations) { 949 delta = FastMath.exp(x * i * i); 950 partialSum += sign * delta; 951 sign *= -1; 952 i++; 953 } 954 if (i == maxIterations) { 955 throw new TooManyIterationsException(maxIterations); 956 } 957 return partialSum * 2; 958 } 959 960 /** 961 * Given a d-statistic in the range [0, 1] and the two sample sizes n and m, 962 * an integral d-statistic in the range [0, n*m] is calculated, that can be used for 963 * comparison with other integral d-statistics. 
Depending whether {@code strict} is 964 * {@code true} or not, the returned value divided by (n*m) is greater than 965 * (resp greater than or equal to) the given d value (allowing some tolerance). 966 * 967 * @param d a d-statistic in the range [0, 1] 968 * @param n first sample size 969 * @param m second sample size 970 * @param strict whether the returned value divided by (n*m) is allowed to be equal to d 971 * @return the integral d-statistic in the range [0, n*m] 972 */ 973 private static long calculateIntegralD(double d, int n, int m, boolean strict) { 974 final double tol = 1e-12; // d-values within tol of one another are considered equal 975 long nm = n * (long)m; 976 long upperBound = (long)FastMath.ceil((d - tol) * nm); 977 long lowerBound = (long)FastMath.floor((d + tol) * nm); 978 if (strict && lowerBound == upperBound) { 979 return upperBound + 1l; 980 } 981 else { 982 return upperBound; 983 } 984 } 985 986 /** 987 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 988 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 989 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 990 * <p> 991 * The returned probability is exact, implemented by unwinding the recursive function 992 * definitions presented in [4] (class javadoc). 993 * </p> 994 * 995 * @param d D-statistic value 996 * @param n first sample size 997 * @param m second sample size 998 * @param strict whether or not the probability to compute is expressed as a strict inequality 999 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 1000 * greater than (resp. 
greater than or equal to) {@code d} 1001 */ 1002 public double exactP(double d, int n, int m, boolean strict) { 1003 return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) / 1004 CombinatoricsUtils.binomialCoefficientDouble(n + m, m); 1005 } 1006 1007 /** 1008 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 1009 * is the 2-sample Kolmogorov-Smirnov statistic. See 1010 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1011 * <p> 1012 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 1013 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for 1014 * details on how convergence of the sum is determined. This implementation passes {@code ksSum} 1015 * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and 1016 * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}. 1017 * </p> 1018 * 1019 * @param d D-statistic value 1020 * @param n first sample size 1021 * @param m second sample size 1022 * @return approximate probability that a randomly selected m-n partition of m + n generates 1023 * \(D_{n,m}\) greater than {@code d} 1024 */ 1025 public double approximateP(double d, int n, int m) { 1026 final double dm = m; 1027 final double dn = n; 1028 return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), 1029 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT); 1030 } 1031 1032 /** 1033 * Fills a boolean array randomly with a fixed number of {@code true} values. 1034 * The method uses a simplified version of the Fisher-Yates shuffle algorithm. 1035 * By processing first the {@code true} values followed by the remaining {@code false} values 1036 * less random numbers need to be generated. The method is optimized for the case 1037 * that the number of {@code true} values is larger than or equal to the number of 1038 * {@code false} values. 
1039 * 1040 * @param b boolean array 1041 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have 1042 * @param rng random data generator 1043 */ 1044 static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) { 1045 Arrays.fill(b, true); 1046 for (int k = numberOfTrueValues; k < b.length; k++) { 1047 final int r = rng.nextInt(k + 1); 1048 b[(b[r]) ? r : k] = false; 1049 } 1050 } 1051 1052 /** 1053 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 1054 * 2-sample Kolmogorov-Smirnov statistic. See 1055 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1056 * <p> 1057 * The simulation generates {@code iterations} random partitions of {@code m + n} into an 1058 * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning 1059 * the proportion of values that are greater than {@code d}, or greater than or equal to 1060 * {@code d} if {@code strict} is {@code false}. 1061 * </p> 1062 * 1063 * @param d D-statistic value 1064 * @param n first sample size 1065 * @param m second sample size 1066 * @param iterations number of random partitions to generate 1067 * @param strict whether or not the probability to compute is expressed as a strict inequality 1068 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1069 * greater than (resp. greater than or equal to) {@code d} 1070 */ 1071 public double monteCarloP(final double d, final int n, final int m, final boolean strict, 1072 final int iterations) { 1073 return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations); 1074 } 1075 1076 /** 1077 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the 1078 * 2-sample Kolmogorov-Smirnov statistic. 
1079 * <p> 1080 * Here d is the D-statistic represented as long value. 1081 * The real D-statistic is obtained by dividing d by n*m. 1082 * See also {@link #monteCarloP(double, int, int, boolean, int)}. 1083 * 1084 * @param d integral D-statistic 1085 * @param n first sample size 1086 * @param m second sample size 1087 * @param iterations number of random partitions to generate 1088 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1089 * greater than or equal to {@code d/(n*m))} 1090 */ 1091 private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) { 1092 1093 // ensure that nn is always the max of (n, m) to require fewer random numbers 1094 final int nn = FastMath.max(n, m); 1095 final int mm = FastMath.min(n, m); 1096 final int sum = nn + mm; 1097 1098 int tail = 0; 1099 final boolean b[] = new boolean[sum]; 1100 for (int i = 0; i < iterations; i++) { 1101 fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng); 1102 long curD = 0l; 1103 for(int j = 0; j < b.length; ++j) { 1104 if (b[j]) { 1105 curD += mm; 1106 if (curD >= d) { 1107 tail++; 1108 break; 1109 } 1110 } else { 1111 curD -= nn; 1112 if (curD <= -d) { 1113 tail++; 1114 break; 1115 } 1116 } 1117 } 1118 } 1119 return (double) tail / iterations; 1120 } 1121 1122 /** 1123 * If there are no ties in the combined dataset formed from x and y, this 1124 * method is a no-op. If there are ties, a uniform random deviate in 1125 * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where 1126 * minDelta is the minimum difference between unequal values in the combined 1127 * sample. A fixed seed is used to generate the jitter, so repeated activations 1128 * with the same input arrays result in the same values. 1129 * 1130 * NOTE: if there are ties in the data, this method overwrites the data in 1131 * x and y with the jittered values. 
     *
     * @param x first sample
     * @param y second sample
     */
    private static void fixTies(double[] x, double[] y) {
        // unique() removes duplicates; if nothing was removed, the combined sample has no ties.
        final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
        if (values.length == x.length + y.length) {
            return; // There are no ties
        }

        // Find the smallest difference between values, or 1 if all values are the same
        // NOTE(review): prev - values[i] is only non-negative if MathArrays.unique
        // returns its result in descending order - TODO confirm against its contract.
        double minDelta = 1;
        double prev = values[0];
        double delta = 1;
        for (int i = 1; i < values.length; i++) {
            delta = prev - values[i];
            if (delta < minDelta) {
                minDelta = delta;
            }
            prev = values[i];
        }
        // Halve so the jitter interval (-minDelta, minDelta) below stays strictly
        // inside half the smallest gap, per the method javadoc.
        minDelta /= 2;

        // Add jitter using a fixed seed (so same arguments always give same results),
        // low-initialization-overhead generator
        final RealDistribution dist =
                new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);

        // It is theoretically possible that jitter does not break ties, so repeat
        // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
        int ct = 0;
        boolean ties = true;
        do {
            jitter(x, dist);
            jitter(y, dist);
            ties = hasTies(x, y);
            ct++;
        } while (ties && ct < 1000);
        if (ties) {
            throw new MathInternalError(); // Should never happen
        }
    }

    /**
     * Returns true iff there are ties in the combined sample
     * formed from x and y.
     *
     * @param x first sample
     * @param y second sample
     * @return true if x and y together contain ties
     */
    private static boolean hasTies(double[] x, double[] y) {
        // HashSet.add returns false when the value is already present, i.e. a tie.
        final HashSet<Double> values = new HashSet<Double>();
        for (int i = 0; i < x.length; i++) {
            if (!values.add(x[i])) {
                return true;
            }
        }
        for (int i = 0; i < y.length; i++) {
            if (!values.add(y[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds random jitter to {@code data} using deviates sampled from {@code dist}.
     * <p>
     * Note that jitter is applied in-place - i.e., the array
     * values are overwritten with the result of applying jitter.</p>
     *
     * @param data input/output data array - entries overwritten by the method
     * @param dist probability distribution to sample for jitter values
     * @throws NullPointerException if either of the parameters is null
     */
    private static void jitter(double[] data, RealDistribution dist) {
        for (int i = 0; i < data.length; i++) {
            data[i] += dist.sample();
        }
    }

    /**
     * The function C(i, j) defined in [4] (class javadoc), formula (5.5).
     * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
     * and recoded as a long to avoid rounding errors in comparison tests, so what
     * is actually tested is |im - jn| <= cmn.
     *
     * @param i first path parameter
     * @param j second path paramter
     * @param m first sample size
     * @param n second sample size
     * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return C(i,j) for given m, n, c
     */
    private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
        // The (long) casts widen the products so i*n - j*m cannot overflow int.
        if (strict) {
            return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
        }
        return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
    }

    /**
     * The function N(i, j) defined in [4] (class javadoc).
     * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
     * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
     * The return value is integral, but subject to overflow, so it is maintained and
     * returned as a double.
     *
     * @param i first path parameter
     * @param j second path parameter
     * @param m first sample size
     * @param n second sample size
     * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
     */
    private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
        /*
         * Unwind the recursive definition given in [4].
         * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
         * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
         */
        final double[] lag = new double[n];
        double last = 0;
        // Seed row 0: paths that move only along the j axis from (0,0).
        for (int k = 0; k < n; k++) {
            lag[k] = c(0, k + 1, m, n, cnm, strict);
        }
        for (int k = 1; k <= i; k++) {
            last = c(k, 0, m, n, cnm, strict);
            for (int l = 1; l <= j; l++) {
                // A path reaches (k, l) from (k-1, l) or (k, l-1); the multiplier
                // c(k, l, ...) zeroes out cells that violate the D-value bound.
                lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
                last = lag[l - 1];
            }
        }
        return last;
    }
}