/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.inference;

import java.math.BigDecimal;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.commons.math3.util.Precision;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.exception.InsufficientDataException;
import org.apache.commons.math3.exception.MathArithmeticException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.NumberIsTooLargeException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.TooManyIterationsException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.fraction.BigFraction;
import org.apache.commons.math3.fraction.BigFractionField;
import org.apache.commons.math3.fraction.FractionConversionException;
import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
import org.apache.commons.math3.linear.FieldMatrix;
import org.apache.commons.math3.linear.MatrixUtils;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.MathArrays;

import static org.apache.commons.math3.util.MathUtils.PI_SQUARED;
import static org.apache.commons.math3.util.FastMath.PI;

/**
 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
 * <p>
 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
 * given in [2].
 * </p>
 * <p>
 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])}, works as
 * follows:
 * <ul>
 * <li>For very small samples (where the product of the sample sizes is less than
 * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value for the
 * 2-sample test.</li>
 * <li>For mid-size samples (product of sample sizes greater than or equal to
 * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
 * simulation is used to compute the p-value. The simulation randomly generates partitions of \(m +
 * n\) into an \(m\)-set and an \(n\)-set and reports the proportion that give \(D\) values
 * exceeding the observed value.</li>
 * <li>When the product of the sample sizes is at least {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
 * the approximation.</li>
 * </ul>
 * </p>
 * <p>
 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. As a consequence, the p-value
 * \(P(D_{n,m} \ge d)\) differs from \(P(D_{n,m} > d)\) by the probability mass of the observed
 * value \(d\). To distinguish between these two probabilities, the two-sample tests take a boolean
 * {@code strict} parameter. This parameter is ignored for large samples.
 * </p>
 * <p>
 * The methods used by the 2-sample default implementation are also exposed directly:
 * <ul>
 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
 * <li>{@link #monteCarloP(double, int, int, boolean, int)} computes 2-sample p-values by Monte
 * Carlo simulation</li>
 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution.</li>
 * </ul>
 * The {@code boolean} arguments in the first two methods allow the probability used to estimate
 * the p-value to be expressed using strict or non-strict inequality. See
 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
 * </p>
 * <p>
 * References:
 * <ul>
 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
 * </ul>
 * <br/>
 * Note that [1] contains an error in computing h, refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 * </p>
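 * <p>
 * Typical usage (a sketch only; {@code NormalDistribution} is
 * {@code org.apache.commons.math3.distribution.NormalDistribution} and the sample arrays are
 * placeholders supplied by the caller):
 * <pre>{@code
 * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
 *
 * // one-sample test: p-value for the hypothesis that "sample" is drawn from a standard normal
 * double pOneSample = test.kolmogorovSmirnovTest(new NormalDistribution(0, 1), sample);
 *
 * // two-sample test: p-value for the hypothesis that x and y come from the same distribution
 * double pTwoSample = test.kolmogorovSmirnovTest(x, y);
 * }</pre>
 * </p>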
 *
 * @since 3.3
 */
public class KolmogorovSmirnovTest {

    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
     */
    protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;

    /** Convergence criterion for {@link #ksSum(double, double, int)} */
    protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;

    /** Convergence criterion for the sums in {@link #pelzGood(double, int)} */
    protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;

    /** When product of sample sizes is less than this value, 2-sample K-S test is exact */
    protected static final int SMALL_SAMPLE_PRODUCT = 200;

    /**
     * When the product of sample sizes is at least this value, the 2-sample K-S test uses the asymptotic
     * distribution for strict inequality p-value.
     */
    protected static final int LARGE_SAMPLE_PRODUCT = 10000;

    /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)} */
    protected static final int MONTE_CARLO_ITERATIONS = 1000000;

    /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
    private final RandomGenerator rng;

    /**
     * Construct a KolmogorovSmirnovTest instance with a default random data generator.
     */
    public KolmogorovSmirnovTest() {
        rng = new Well19937c();
    }

    /**
     * Construct a KolmogorovSmirnovTest with the provided random data generator.
     *
     * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
     */
    public KolmogorovSmirnovTest(RandomGenerator rng) {
        this.rng = rng;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
     * {@code exact} is true, the distribution used to compute the p-value is computed using
     * extended precision. See {@link #cdfExact(double, int)}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param exact whether or not to force exact computation of the p-value
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
    }

    /**
     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
     * each of the values in {@code data}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @return Kolmogorov-Smirnov statistic \(D_n\)
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
        checkArray(data);
        final int n = data.length;
        final double nd = n;
        final double[] dataCopy = new double[n];
        System.arraycopy(data, 0, dataCopy, 0, n);
        Arrays.sort(dataCopy);
        double d = 0d;
        for (int i = 1; i <= n; i++) {
            final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
            final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
            if (currD > d) {
                d = currD;
            }
        }
        return d;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Specifically, what is returned is an estimate of the probability
     * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
     * selected partition of the combined sample into subsamples of sizes {@code x.length} and
     * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
     * large as (if {@code strict} is {@code false}) {@code kolmogorovSmirnovStatistic(x, y)}.
     * <ul>
     * <li>For very small samples (where the product of the sample sizes is less than
     * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
     * is accomplished by enumerating all partitions of the combined sample into two subsamples of
     * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
     * proportion of partitions that give \(D\) values exceeding the observed value.</li>
     * <li>For mid-size samples (product of sample sizes greater than or equal to
     * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
     * simulation is used to compute the p-value. The simulation randomly generates partitions and
     * reports the proportion that give \(D\) values exceeding the observed value.</li>
     * <li>When the product of the sample sizes is at least {@value #LARGE_SAMPLE_PRODUCT}, the
     * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
     * for details on the approximation.</li>
     * </ul>
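     * <p>
     * For example (a sketch; {@code x} and {@code y} are caller-supplied samples):
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * double pStrict    = test.kolmogorovSmirnovTest(x, y, true);   // based on P(D > d)
     * double pNonStrict = test.kolmogorovSmirnovTest(x, y, false);  // based on P(D >= d)
     * // the two values can differ only for small and mid-size samples, where the discrete
     * // (exact or Monte Carlo) distribution of D is used
     * }</pre>
     * </p>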
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     *        (ignored for large samples)
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
        final long lengthProduct = (long) x.length * y.length;
        if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
            return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
        }
        if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
            return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS);
        }
        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Assumes the strict form of the inequality used to compute the
     * p-value. See {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y) {
        return kolmogorovSmirnovTest(x, y, true);
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values.
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = MathArrays.copyOf(x);
        final double[] sy = MathArrays.copyOf(y);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        int rankX = 0;
        int rankY = 0;

        // Find the max difference between cdf_x and cdf_y
        double supD = 0d;
        do {
            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                rankX += 1;
            }
            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                rankY += 1;
            }
            final double cdf_x = rankX / (double) n;
            final double cdf_y = rankY / (double) m;
            final double curD = FastMath.abs(cdf_x - cdf_y);
            if (curD > supD) {
                supD = curD;
            }
        } while(rankX < n && rankY < m);
        return supD;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
        return kolmogorovSmirnovTest(distribution, data, false);
    }

    /**
     * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
     * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
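     * <p>
     * For example (a sketch; {@code NormalDistribution} here is
     * {@code org.apache.commons.math3.distribution.NormalDistribution} and {@code data} is a
     * caller-supplied sample), to test at the 1% significance level:
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * boolean reject = test.kolmogorovSmirnovTest(new NormalDistribution(0, 1), data, 0.01);
     * }</pre>
     * </p>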
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param alpha significance level of the test
     * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
     *         can be rejected with confidence 1 - {@code alpha}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
        }
        return kolmogorovSmirnovTest(distribution, data) < alpha;
    }

    /**
     * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above). The result is not exact as with
     * {@link #cdfExact(double, int)} because calculations are based on
     * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
     */
    public double cdf(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, false);
    }

    /**
     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
     * used everywhere at the expense of very slow execution time. Almost never choose this in real
     * applications unless you are very sure; this is almost solely for verification purposes.
     * Normally, you would choose {@link #cdf(double, int)}. See the class
     * javadoc for definitions and algorithm description.
     *
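     * <p>
     * A small cross-check sketch (the arguments are chosen only for illustration):
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * double fast  = test.cdf(0.3, 20);       // double-based evaluation
     * double exact = test.cdfExact(0.3, 20);  // BigFraction-based evaluation
     * // fast and exact should agree to many significant digits
     * }</pre>
     * </p>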
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
     */
    public double cdfExact(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, true);
    }

    /**
     * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @param exact whether the probability should be calculated exactly using
     *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere, at the expense of
     *        very slow execution time, or whether {@code double} should be used in convenient
     *        places to gain speed. Almost never choose {@code true} in real applications unless
     *        you are very sure; {@code true} is almost solely for verification purposes.
     * @return \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
     */
    public double cdf(double d, int n, boolean exact)
        throws MathArithmeticException {

        final double ninv = 1 / ((double) n);
        final double ninvhalf = 0.5 * ninv;

        if (d <= ninvhalf) {
            return 0;
        } else if (ninvhalf < d && d <= ninv) {
            double res = 1;
            final double f = 2 * d - ninv;
            // n! f^n = n*f * (n-1)*f * ... * 1*f
            for (int i = 1; i <= n; ++i) {
                res *= i * f;
            }
            return res;
        } else if (1 - ninv <= d && d < 1) {
            return 1 - 2 * Math.pow(1 - d, n);
        } else if (1 <= d) {
            return 1;
        }
        if (exact) {
            return exactK(d, n);
        }
        if (n <= 140) {
            return roundedK(d, n);
        }
        return pelzGood(d, n);
    }

    /**
     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
     * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
     * above).
     *
     * @param d statistic
     * @param n sample size
     * @return the two-sided probability of \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
     */
    private double exactK(double d, int n)
        throws MathArithmeticException {

        final int k = (int) Math.ceil(n * d);

        final FieldMatrix<BigFraction> H = this.createExactH(d, n);
        final FieldMatrix<BigFraction> Hpower = H.power(n);

        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);

        for (int i = 1; i <= n; ++i) {
            pFrac = pFrac.multiply(i).divide(n);
        }

        /*
         * BigFraction.doubleValue converts the numerator and the denominator to double separately
         * and divides afterwards, which yields NaN quite easily. This approach does not (the scale
         * is the number of digits):
         */
        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
    }

    /**
     * Calculates {@code P(D_n < d)} using the method described in [1] and double-precision arithmetic (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n < d)\)
     */
    private double roundedK(double d, int n) {

        final int k = (int) Math.ceil(n * d);
        final RealMatrix H = this.createRoundedH(d, n);
        final RealMatrix Hpower = H.power(n);

        double pFrac = Hpower.getEntry(k - 1, k - 1);
        for (int i = 1; i <= n; ++i) {
            pFrac *= (double) i / (double) n;
        }

        return pFrac;
    }

    /**
     * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @since 3.4
     */
    public double pelzGood(double d, int n) {

        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = FastMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;
        final double z4 = z2 * z2;
        final double z6 = z4 * z2;
        final double z8 = z4 * z4;

        // Eventual return value
        double ret = 0;

        // Compute K_0(z)
        double sum = 0;
        double increment = 0;
        double kTerm = 0;
        double z2Term = PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = 2 * k - 1;
            increment = FastMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        kTerm = 0;
        double kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (PI_SQUARED * kTerm2 - z2) * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = FastMath.sqrt(PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = PI_SQUARED * PI_SQUARED;
        sum = 0;
        kTerm = 0;
        kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment =  (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double sum2 = 0;
        kTerm2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            increment = PI_SQUARED * kTerm2 * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * PI_SQUARED;
        sum = 0;
        double kTerm4 = 0;
        double kTerm6 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            kTerm4 = kTerm2 * kTerm2;
            kTerm6 = kTerm4 * kTerm2;
            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                    PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            kTerm4 = kTerm2 * kTerm2;
            increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
                    FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                sum2 / (108 * z6));

    }

    /***
     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws NumberIsTooLargeException if fractional part is greater than 1
     * @throws FractionConversionException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 <= h < 1\).
     */
    private FieldMatrix<BigFraction> createExactH(double d, int n)
        throws NumberIsTooLargeException, FractionConversionException {

        final int k = (int) Math.ceil(n * d);
        final int m = 2 * k - 1;
        final double hDouble = k - n * d;
        if (hDouble >= 1) {
            throw new NumberIsTooLargeException(hDouble, 1.0, false);
        }
        BigFraction h = null;
        try {
            h = new BigFraction(hDouble, 1.0e-20, 10000);
        } catch (final FractionConversionException e1) {
            try {
                h = new BigFraction(hDouble, 1.0e-10, 10000);
            } catch (final FractionConversionException e2) {
                h = new BigFraction(hDouble, 1.0e-5, 10000);
            }
        }
        final BigFraction[][] Hdata = new BigFraction[m][m];

        /*
         * Start by filling everything with either 0 or 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    Hdata[i][j] = BigFraction.ZERO;
                } else {
                    Hdata[i][j] = BigFraction.ONE;
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final BigFraction[] hPowers = new BigFraction[m];
        hPowers[0] = h;
        for (int i = 1; i < m; ++i) {
            hPowers[i] = h.multiply(hPowers[i - 1]);
        }

        /*
         * The first column and the last row have special values (each other reversed).
         */
        for (int i = 0; i < m; ++i) {
            Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
            Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
         */
        if (h.compareTo(BigFraction.ONE_HALF) > 0) {
            Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if
         * i - j + 1 >= 0, else 0. The 1's and 0's are already in place, so only the entries holding
         * a 1 need to be divided by (i - j + 1)!. There is no need to calculate (i - j + 1)! and
         * then divide - dividing in small steps avoids overflow. Note that i - j + 1 > 0 <=>
         * i + 1 > j, so the inner index j only needs to run up to i rather than all the way to m.
         * Also note that g starts at 2 because dividing by 1 is unnecessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        Hdata[i][j] = Hdata[i][j].divide(g);
                    }
                }
            }
        }
        return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
    }

    /***
     * Creates {@code H} of size {@code m x m} as described in [1] (see above)
     * using double-precision.
     *
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws NumberIsTooLargeException if fractional part is greater than 1
     */
    private RealMatrix createRoundedH(double d, int n)
        throws NumberIsTooLargeException {

        final int k = (int) Math.ceil(n * d);
        final int m = 2 * k - 1;
        final double h = k - n * d;
        if (h >= 1) {
            throw new NumberIsTooLargeException(h, 1.0, false);
        }
        final double[][] Hdata = new double[m][m];

        /*
         * Start by filling everything with either 0 or 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    Hdata[i][j] = 0;
                } else {
                    Hdata[i][j] = 1;
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final double[] hPowers = new double[m];
        hPowers[0] = h;
        for (int i = 1; i < m; ++i) {
            hPowers[i] = h * hPowers[i - 1];
        }

        /*
         * The first column and the last row have special values (each other reversed).
         */
        for (int i = 0; i < m; ++i) {
            Hdata[i][0] = Hdata[i][0] - hPowers[i];
            Hdata[m - 1][i] -= hPowers[m - i - 1];
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
         */
        if (Double.compare(h, 0.5) > 0) {
            Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if
         * i - j + 1 >= 0, else 0. The 1's and 0's are already in place, so only the entries holding
         * a 1 need to be divided by (i - j + 1)!. There is no need to calculate (i - j + 1)! and
         * then divide - dividing in small steps avoids overflow. Note that i - j + 1 > 0 <=>
         * i + 1 > j, so the inner index j only needs to run up to i rather than all the way to m.
         * Also note that g starts at 2 because dividing by 1 is unnecessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        Hdata[i][j] /= g;
                    }
                }
            }
        }
        return MatrixUtils.createRealMatrix(Hdata);
    }

    /**
     * Verifies that {@code array} has length at least 2.
     *
     * @param array array to test
     * @throws NullArgumentException if array is null
     * @throws InsufficientDataException if array is too short
     */
    private void checkArray(double[] array) {
        if (array == null) {
            throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
        }
        if (array.length < 2) {
            throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
                                                2);
        }
    }

    /**
     * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
     * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
     * have been computed. If the sum does not converge before {@code maxIterations} iterations a
     * {@link TooManyIterationsException} is thrown.
     *
     * @param t argument
     * @param tolerance Cauchy criterion for partial sums
     * @param maxIterations maximum number of partial sums to compute
     * @return Kolmogorov sum evaluated at t
     * @throws TooManyIterationsException if the series does not converge
     */
    public double ksSum(double t, double tolerance, int maxIterations) {
        if (t == 0.0) {
            return 0.0;
        }

        // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
        // from class javadoc should be used.

        final double x = -2 * t * t;
        int sign = -1;
        long i = 1;
        double partialSum = 0.5d;
        double delta = 1;
        while (delta > tolerance && i < maxIterations) {
            delta = FastMath.exp(x * i * i);
            partialSum += sign * delta;
            sign *= -1;
            i++;
        }
        if (i == maxIterations) {
            throw new TooManyIterationsException(maxIterations);
        }
        return partialSum * 2;
    }

    /**
     * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
     * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * The returned probability is exact, obtained by enumerating all partitions of {@code m + n}
     * into {@code m} and {@code n} sets, computing \(D_{n,m}\) for each partition and counting the
     * number of partitions that yield \(D_{n,m}\) values exceeding (resp. greater than or equal to)
     * {@code d}.
     * </p>
     * <p>
     * <strong>USAGE NOTE</strong>: Since this method enumerates all combinations in \({m+n} \choose
     * {n}\), it is very slow if called for large {@code m, n}. For this reason,
     * {@link #kolmogorovSmirnovTest(double[], double[])} uses this only for {@code m * n < }
     * {@value #SMALL_SAMPLE_PRODUCT}.
     * </p>
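     * <p>
     * For small samples this method backs {@link #kolmogorovSmirnovTest(double[], double[], boolean)},
     * so the following two calls agree when {@code x.length * y.length < }
     * {@value #SMALL_SAMPLE_PRODUCT} (a sketch; {@code x} and {@code y} are caller-supplied samples):
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * double p1 = test.kolmogorovSmirnovTest(x, y, true);
     * double p2 = test.exactP(test.kolmogorovSmirnovStatistic(x, y), x.length, y.length, true);
     * }</pre>
     * </p>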
     *
     * @param d D-statistic value
     * @param n first sample size
     * @param m second sample size
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
     *         greater than (resp. greater than or equal to) {@code d}
     */
    public double exactP(double d, int n, int m, boolean strict) {
        Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
        long tail = 0;
        final double[] nSet = new double[n];
        final double[] mSet = new double[m];
        final double tol = 1e-12;  // d-values within tol of one another are considered equal
        while (combinationsIterator.hasNext()) {
            // Generate an n-set
            final int[] nSetI = combinationsIterator.next();
            // Copy the n-set to nSet and its complement to mSet
            int j = 0;
            int k = 0;
            for (int i = 0; i < n + m; i++) {
                if (j < n && nSetI[j] == i) {
                    nSet[j++] = i;
                } else {
                    mSet[k++] = i;
                }
            }
            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
            final int order = Precision.compareTo(curD, d, tol);
            if (order > 0 || (order == 0 && !strict)) {
                tail++;
            }
        }
        return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
    }

    /**
     * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
     * is the 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
     * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
     * details on how convergence of the sum is determined. This implementation passes {@code ksSum}
     * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
     * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
     * </p>
     *
     * @param d D-statistic value
     * @param n first sample size
     * @param m second sample size
     * @return approximate probability that a randomly selected m-n partition of m + n generates
     *         \(D_{n,m}\) greater than {@code d}
     */
    public double approximateP(double d, int n, int m) {
        final double dm = m;
        final double dn = n;
        return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
    }

    /**
     * Fills a boolean array randomly with a fixed number of {@code true} values.
     * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
     * By processing the {@code true} values first, followed by the remaining {@code false} values,
     * fewer random numbers need to be generated. The method is optimized for the case
     * that the number of {@code true} values is larger than or equal to the number of
     * {@code false} values.
     *
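     * <p>
     * For example (a sketch):
     * <pre>{@code
     * boolean[] b = new boolean[10];
     * fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, 7, rng);
     * // b now holds exactly 7 true entries and 3 false entries, in random positions
     * }</pre>
     * </p>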
     * @param b boolean array
     * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
     * @param rng random data generator
     */
    static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) {
        Arrays.fill(b, true);
        for (int k = numberOfTrueValues; k < b.length; k++) {
            final int r = rng.nextInt(k + 1);
            b[(b[r]) ? r : k] = false;
        }
    }

    /**
     * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the
     * 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * The simulation generates {@code iterations} random partitions of {@code m + n} into an
     * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
     * the proportion of values that are greater than {@code d}, or greater than or equal to
     * {@code d} if {@code strict} is {@code false}.
     * </p>
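     * <p>
     * For example (a sketch; {@code x} and {@code y} are caller-supplied samples), using more
     * iterations than the default {@value #MONTE_CARLO_ITERATIONS} for a tighter estimate:
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * double d = test.kolmogorovSmirnovStatistic(x, y);
     * double p = test.monteCarloP(d, x.length, y.length, true, 10000000);
     * }</pre>
     * </p>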
     *
     * @param d D-statistic value
     * @param n first sample size
     * @param m second sample size
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     * @param iterations number of random partitions to generate
     * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
     *         greater than (resp. greater than or equal to) {@code d}
     */
    public double monteCarloP(final double d, final int n, final int m, final boolean strict,
                              final int iterations) {
        // ensure that nn is always the max of (n, m) to require fewer random numbers
        final int nn = FastMath.max(n, m);
        final int mm = FastMath.min(n, m);
        final int sum = nn + mm;
        final double tol = 1e-12;  // d-values within tol of one another are considered equal

        int tail = 0;
        final boolean b[] = new boolean[sum];
        for (int i = 0; i < iterations; i++) {
            fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
            int rankN = b[0] ? 1 : 0;
            int rankM = b[0] ? 0 : 1;
            boolean previous = b[0];
            for(int j = 1; j < b.length; ++j) {
                if (b[j] != previous) {
                    final double cdf_n = rankN / (double) nn;
                    final double cdf_m = rankM / (double) mm;
                    final double curD = FastMath.abs(cdf_n - cdf_m);
                    final int order = Precision.compareTo(curD, d, tol);
                    if (order > 0 || (order == 0 && !strict)) {
                        tail++;
                        break;
                    }
                }
                previous = b[j];
                if (b[j]) {
                    rankN++;
                } else {
                    rankM++;
                }
            }
        }
        return (double) tail / iterations;
    }
}