001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.stat.inference;
019
020import java.math.BigDecimal;
021import java.util.Arrays;
022import java.util.Iterator;
023
024import org.apache.commons.math3.distribution.RealDistribution;
025import org.apache.commons.math3.exception.InsufficientDataException;
026import org.apache.commons.math3.exception.MathArithmeticException;
027import org.apache.commons.math3.exception.NullArgumentException;
028import org.apache.commons.math3.exception.NumberIsTooLargeException;
029import org.apache.commons.math3.exception.OutOfRangeException;
030import org.apache.commons.math3.exception.TooManyIterationsException;
031import org.apache.commons.math3.exception.util.LocalizedFormats;
032import org.apache.commons.math3.fraction.BigFraction;
033import org.apache.commons.math3.fraction.BigFractionField;
034import org.apache.commons.math3.fraction.FractionConversionException;
035import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
036import org.apache.commons.math3.linear.FieldMatrix;
037import org.apache.commons.math3.linear.MatrixUtils;
038import org.apache.commons.math3.linear.RealMatrix;
039import org.apache.commons.math3.random.RandomGenerator;
040import org.apache.commons.math3.random.Well19937c;
041import org.apache.commons.math3.util.CombinatoricsUtils;
042import org.apache.commons.math3.util.FastMath;
043import org.apache.commons.math3.util.MathArrays;
044
045import static org.apache.commons.math3.util.MathUtils.PI_SQUARED;
046import static org.apache.commons.math3.util.FastMath.PI;
047
048/**
049 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
050 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
051 * <p>
052 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
053 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
054 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
055 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
056 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
057 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
058 * given in [2].
059 * </p>
060 * <p>
061 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
062 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
063 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
064 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
065 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
066 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
067 * follows:
068 * <ul>
069 * <li>For very small samples (where the product of the sample sizes is less than
070 * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value for the
071 * 2-sample test.</li>
072 * <li>For mid-size samples (product of sample sizes greater than or equal to
073 * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
074 * simulation is used to compute the p-value. The simulation randomly generates partitions of \(m +
075 * n\) into an \(m\)-set and an \(n\)-set and reports the proportion that give \(D\) values
076 * exceeding the observed value.</li>
077 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
078 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
079 * the approximation.</li>
080 * </ul>
081 * </p>
082 * <p>
083 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
084 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
085 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
086 * {@code strict} parameter. This parameter is ignored for large samples.
087 * </p>
088 * <p>
089 * The methods used by the 2-sample default implementation are also exposed directly:
090 * <ul>
091 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
092 * <li>{@link #monteCarloP(double, int, int, boolean, int)} computes 2-sample p-values by Monte
093 * Carlo simulation</li>
094 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean}
095 * arguments in the first two methods allow the probability used to estimate the p-value to be
096 * expressed using strict or non-strict inequality. See
097 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
098 * </ul>
099 * </p>
100 * <p>
101 * References:
102 * <ul>
103 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
104 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
105 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
106 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
107 * </ul>
108 * <br/>
109 * Note that [1] contains an error in computing h, refer to <a
110 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
111 * </p>
112 *
113 * @since 3.3
114 */
115public class KolmogorovSmirnovTest {
116
117    /**
118     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
119     */
120    protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
121
122    /** Convergence criterion for {@link #ksSum(double, double, int)} */
123    protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;
124
125    /** Convergence criterion for the sums in #pelzGood(double, double, int)} */
126    protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;
127
128    /** When product of sample sizes is less than this value, 2-sample K-S test is exact */
129    protected static final int SMALL_SAMPLE_PRODUCT = 200;
130
131    /**
132     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
133     * distribution for strict inequality p-value.
134     */
135    protected static final int LARGE_SAMPLE_PRODUCT = 10000;
136
137    /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)} */
138    protected static final int MONTE_CARLO_ITERATIONS = 1000000;
139
140    /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
141    private final RandomGenerator rng;
142
143    /**
144     * Construct a KolmogorovSmirnovTest instance with a default random data generator.
145     */
146    public KolmogorovSmirnovTest() {
147        rng = new Well19937c();
148    }
149
150    /**
151     * Construct a KolmogorovSmirnovTest with the provided random data generator.
152     *
153     * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
154     */
155    public KolmogorovSmirnovTest(RandomGenerator rng) {
156        this.rng = rng;
157    }
158
159    /**
160     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
161     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
162     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
163     * {@code exact} is true, the distribution used to compute the p-value is computed using
164     * extended precision. See {@link #cdfExact(double, int)}.
165     *
166     * @param distribution reference distribution
167     * @param data sample being being evaluated
168     * @param exact whether or not to force exact computation of the p-value
169     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
170     *         {@code distribution}
171     * @throws InsufficientDataException if {@code data} does not have length at least 2
172     * @throws NullArgumentException if {@code data} is null
173     */
174    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
175        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
176    }
177
178    /**
179     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
180     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
181     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
182     * each of the values in {@code data}.
183     *
184     * @param distribution reference distribution
185     * @param data sample being evaluated
186     * @return Kolmogorov-Smirnov statistic \(D_n\)
187     * @throws InsufficientDataException if {@code data} does not have length at least 2
188     * @throws NullArgumentException if {@code data} is null
189     */
190    public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
191        checkArray(data);
192        final int n = data.length;
193        final double nd = n;
194        final double[] dataCopy = new double[n];
195        System.arraycopy(data, 0, dataCopy, 0, n);
196        Arrays.sort(dataCopy);
197        double d = 0d;
198        for (int i = 1; i <= n; i++) {
199            final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
200            final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
201            if (currD > d) {
202                d = currD;
203            }
204        }
205        return d;
206    }
207
208    /**
209     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
210     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
211     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
212     * probability distribution. Specifically, what is returned is an estimate of the probability
213     * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
214     * selected partition of the combined sample into subsamples of sizes {@code x.length} and
215     * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
216     * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
217     * <ul>
218     * <li>For very small samples (where the product of the sample sizes is less than
219     * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
220     * is accomplished by enumerating all partitions of the combined sample into two subsamples of
221     * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
222     * proportion of partitions that give \(D\) values exceeding the observed value.</li>
223     * <li>For mid-size samples (product of sample sizes greater than or equal to
224     * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
225     * simulation is used to compute the p-value. The simulation randomly generates partitions and
226     * reports the proportion that give \(D\) values exceeding the observed value.</li>
227     * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
228     * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
229     * for details on the approximation.</li>
230     * </ul>
231     *
232     * @param x first sample dataset
233     * @param y second sample dataset
234     * @param strict whether or not the probability to compute is expressed as a strict inequality
235     *        (ignored for large samples)
236     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
237     *         samples from the same distribution
238     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
239     *         least 2
240     * @throws NullArgumentException if either {@code x} or {@code y} is null
241     */
242    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
243        final long lengthProduct = (long) x.length * y.length;
244        if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
245            return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
246        }
247        if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
248            return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS);
249        }
250        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
251    }
252
253    /**
254     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
255     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
256     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
257     * probability distribution. Assumes the strict form of the inequality used to compute the
258     * p-value. See {@link #kolmogorovSmirnovTest(RealDistribution, double[], boolean)}.
259     *
260     * @param x first sample dataset
261     * @param y second sample dataset
262     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
263     *         samples from the same distribution
264     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
265     *         least 2
266     * @throws NullArgumentException if either {@code x} or {@code y} is null
267     */
268    public double kolmogorovSmirnovTest(double[] x, double[] y) {
269        return kolmogorovSmirnovTest(x, y, true);
270    }
271
272    /**
273     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
274     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
275     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
276     * is the empirical distribution of the {@code y} values.
277     *
278     * @param x first sample
279     * @param y second sample
280     * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
281     *         {@code y} represent samples from the same underlying distribution
282     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
283     *         least 2
284     * @throws NullArgumentException if either {@code x} or {@code y} is null
285     */
286    public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
287        checkArray(x);
288        checkArray(y);
289        // Copy and sort the sample arrays
290        final double[] sx = MathArrays.copyOf(x);
291        final double[] sy = MathArrays.copyOf(y);
292        Arrays.sort(sx);
293        Arrays.sort(sy);
294        final int n = sx.length;
295        final int m = sy.length;
296
297        // Find the max difference between cdf_x and cdf_y
298        double supD = 0d;
299        // First walk x points
300        for (int i = 0; i < n; i++) {
301            final double x_i = sx[i];
302            // ties can be safely ignored
303            if (i > 0 && x_i == sx[i-1]) {
304                continue;
305            }
306            final double cdf_x = edf(x_i, sx);
307            final double cdf_y = edf(x_i, sy);
308            final double curD = FastMath.abs(cdf_x - cdf_y);
309            if (curD > supD) {
310                supD = curD;
311            }
312        }
313        // Now look at y
314        for (int i = 0; i < m; i++) {
315            final double y_i = sy[i];
316            // ties can be safely ignored
317            if (i > 0 && y_i == sy[i-1]) {
318                continue;
319            }
320            final double cdf_x = edf(y_i, sx);
321            final double cdf_y = edf(y_i, sy);
322            final double curD = FastMath.abs(cdf_x - cdf_y);
323            if (curD > supD) {
324                supD = curD;
325            }
326        }
327        return supD;
328    }
329
330    /**
331     * Computes the empirical distribution function.
332     *
333     * @param x the given x
334     * @param samples the observations
335     * @return the empirical distribution function \(F_n(x)\)
336     */
337    private double edf(final double x, final double[] samples) {
338        final int n = samples.length;
339        int index = Arrays.binarySearch(samples, x);
340        if (index >= 0) {
341            while(index < (n - 1) && samples[index+1] == x) {
342                ++index;
343            }
344        }
345        return index >= 0 ? (index + 1d) / n : (-index - 1d) / n;
346    }
347
348    /**
349     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
350     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
351     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
352     *
353     * @param distribution reference distribution
354     * @param data sample being being evaluated
355     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
356     *         {@code distribution}
357     * @throws InsufficientDataException if {@code data} does not have length at least 2
358     * @throws NullArgumentException if {@code data} is null
359     */
360    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
361        return kolmogorovSmirnovTest(distribution, data, false);
362    }
363
364    /**
365     * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
366     * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
367     *
368     * @param distribution reference distribution
369     * @param data sample being being evaluated
370     * @param alpha significance level of the test
371     * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
372     *         can be rejected with confidence 1 - {@code alpha}
373     * @throws InsufficientDataException if {@code data} does not have length at least 2
374     * @throws NullArgumentException if {@code data} is null
375     */
376    public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
377        if ((alpha <= 0) || (alpha > 0.5)) {
378            throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
379        }
380        return kolmogorovSmirnovTest(distribution, data) < alpha;
381    }
382
383    /**
384     * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme
385     * values given in [2] (see above). The result is not exact as with
386     * {@link #cdfExact(double, int)} because calculations are based on
387     * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
388     *
389     * @param d statistic
390     * @param n sample size
391     * @return \(P(D_n < d)\)
392     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
393     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
394     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
395     */
396    public double cdf(double d, int n)
397        throws MathArithmeticException {
398        return cdf(d, n, false);
399    }
400
401    /**
402     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
403     * used everywhere at the expense of very slow execution time. Almost never choose this in real
404     * applications unless you are very sure; this is almost solely for verification purposes.
405     * Normally, you would choose {@link #cdf(double, int)}. See the class
406     * javadoc for definitions and algorithm description.
407     *
408     * @param d statistic
409     * @param n sample size
410     * @return \(P(D_n < d)\)
411     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
412     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
413     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
414     */
415    public double cdfExact(double d, int n)
416        throws MathArithmeticException {
417        return cdf(d, n, true);
418    }
419
420    /**
421     * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
422     * values given in [2] (see above).
423     *
424     * @param d statistic
425     * @param n sample size
426     * @param exact whether the probability should be calculated exact using
427     *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
428     *        very slow execution time, or if {@code double} should be used convenient places to
429     *        gain speed. Almost never choose {@code true} in real applications unless you are very
430     *        sure; {@code true} is almost solely for verification purposes.
431     * @return \(P(D_n < d)\)
432     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
433     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
434     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
435     */
436    public double cdf(double d, int n, boolean exact)
437        throws MathArithmeticException {
438
439        final double ninv = 1 / ((double) n);
440        final double ninvhalf = 0.5 * ninv;
441
442        if (d <= ninvhalf) {
443            return 0;
444        } else if (ninvhalf < d && d <= ninv) {
445            double res = 1;
446            final double f = 2 * d - ninv;
447            // n! f^n = n*f * (n-1)*f * ... * 1*x
448            for (int i = 1; i <= n; ++i) {
449                res *= i * f;
450            }
451            return res;
452        } else if (1 - ninv <= d && d < 1) {
453            return 1 - 2 * Math.pow(1 - d, n);
454        } else if (1 <= d) {
455            return 1;
456        }
457        if (exact) {
458            return exactK(d, n);
459        }
460        if (n <= 140) {
461            return roundedK(d, n);
462        }
463        return pelzGood(d, n);
464    }
465
466    /**
467     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
468     * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
469     * above).
470     *
471     * @param d statistic
472     * @param n sample size
473     * @return the two-sided probability of \(P(D_n < d)\)
474     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
475     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
476     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
477     */
478    private double exactK(double d, int n)
479        throws MathArithmeticException {
480
481        final int k = (int) Math.ceil(n * d);
482
483        final FieldMatrix<BigFraction> H = this.createExactH(d, n);
484        final FieldMatrix<BigFraction> Hpower = H.power(n);
485
486        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
487
488        for (int i = 1; i <= n; ++i) {
489            pFrac = pFrac.multiply(i).divide(n);
490        }
491
492        /*
493         * BigFraction.doubleValue converts numerator to double and the denominator to double and
494         * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
495         * digits):
496         */
497        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
498    }
499
500    /**
501     * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
502     *
503     * @param d statistic
504     * @param n sample size
505     * @return \(P(D_n < d)\)
506     */
507    private double roundedK(double d, int n) {
508
509        final int k = (int) Math.ceil(n * d);
510        final RealMatrix H = this.createRoundedH(d, n);
511        final RealMatrix Hpower = H.power(n);
512
513        double pFrac = Hpower.getEntry(k - 1, k - 1);
514        for (int i = 1; i <= n; ++i) {
515            pFrac *= (double) i / (double) n;
516        }
517
518        return pFrac;
519    }
520
521    /**
522     * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
523     *
524     * @param d value of d-statistic (x in [2])
525     * @param n sample size
526     * @return \(P(D_n < d)\)
527     * @since 3.4
528     */
529    public double pelzGood(double d, int n) {
530
531        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
532        final double sqrtN = FastMath.sqrt(n);
533        final double z = d * sqrtN;
534        final double z2 = d * d * n;
535        final double z4 = z2 * z2;
536        final double z6 = z4 * z2;
537        final double z8 = z4 * z4;
538
539        // Eventual return value
540        double ret = 0;
541
542        // Compute K_0(z)
543        double sum = 0;
544        double increment = 0;
545        double kTerm = 0;
546        double z2Term = PI_SQUARED / (8 * z2);
547        int k = 1;
548        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
549            kTerm = 2 * k - 1;
550            increment = FastMath.exp(-z2Term * kTerm * kTerm);
551            sum += increment;
552            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
553                break;
554            }
555        }
556        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
557            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
558        }
559        ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;
560
561        // K_1(z)
562        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
563        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
564        final double twoZ2 = 2 * z2;
565        sum = 0;
566        kTerm = 0;
567        double kTerm2 = 0;
568        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
569            kTerm = k + 0.5;
570            kTerm2 = kTerm * kTerm;
571            increment = (PI_SQUARED * kTerm2 - z2) * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
572            sum += increment;
573            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
574                break;
575            }
576        }
577        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
578            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
579        }
580        final double sqrtHalfPi = FastMath.sqrt(PI / 2);
581        // Instead of doubling sum, divide by 3 instead of 6
582        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
583
584        // K_2(z)
585        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
586        final double z4Term = 2 * z4;
587        final double z6Term = 6 * z6;
588        z2Term = 5 * z2;
589        final double pi4 = PI_SQUARED * PI_SQUARED;
590        sum = 0;
591        kTerm = 0;
592        kTerm2 = 0;
593        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
594            kTerm = k + 0.5;
595            kTerm2 = kTerm * kTerm;
596            increment =  (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
597                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
598            sum += increment;
599            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
600                break;
601            }
602        }
603        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
604            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
605        }
606        double sum2 = 0;
607        kTerm2 = 0;
608        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
609            kTerm2 = k * k;
610            increment = PI_SQUARED * kTerm2 * FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
611            sum2 += increment;
612            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
613                break;
614            }
615        }
616        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
617            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
618        }
619        // Again, adjust coefficients instead of doubling sum, sum2
620        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
621
622        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
623        // Multiply coefficient denominators by 2, so omit doubling sums.
624        final double pi6 = pi4 * PI_SQUARED;
625        sum = 0;
626        double kTerm4 = 0;
627        double kTerm6 = 0;
628        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
629            kTerm = k + 0.5;
630            kTerm2 = kTerm * kTerm;
631            kTerm4 = kTerm2 * kTerm2;
632            kTerm6 = kTerm4 * kTerm2;
633            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
634                    PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
635                    FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
636            sum += increment;
637            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
638                break;
639            }
640        }
641        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
642            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
643        }
644        sum2 = 0;
645        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
646            kTerm2 = k * k;
647            kTerm4 = kTerm2 * kTerm2;
648            increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
649                    FastMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
650            sum2 += increment;
651            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
652                break;
653            }
654        }
655        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
656            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
657        }
658        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
659                + sum2 / (108 * z6));
660
661    }
662
663    /***
664     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
665     *
666     * @param d statistic
667     * @param n sample size
668     * @return H matrix
669     * @throws NumberIsTooLargeException if fractional part is greater than 1
670     * @throws FractionConversionException if algorithm fails to convert {@code h} to a
671     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
672     *         - h) / m\) for integer {@code k, m} and \(0 <= h < 1\).
673     */
674    private FieldMatrix<BigFraction> createExactH(double d, int n)
675        throws NumberIsTooLargeException, FractionConversionException {
676
677        final int k = (int) Math.ceil(n * d);
678        final int m = 2 * k - 1;
679        final double hDouble = k - n * d;
680        if (hDouble >= 1) {
681            throw new NumberIsTooLargeException(hDouble, 1.0, false);
682        }
683        BigFraction h = null;
684        try {
685            h = new BigFraction(hDouble, 1.0e-20, 10000);
686        } catch (final FractionConversionException e1) {
687            try {
688                h = new BigFraction(hDouble, 1.0e-10, 10000);
689            } catch (final FractionConversionException e2) {
690                h = new BigFraction(hDouble, 1.0e-5, 10000);
691            }
692        }
693        final BigFraction[][] Hdata = new BigFraction[m][m];
694
695        /*
696         * Start by filling everything with either 0 or 1.
697         */
698        for (int i = 0; i < m; ++i) {
699            for (int j = 0; j < m; ++j) {
700                if (i - j + 1 < 0) {
701                    Hdata[i][j] = BigFraction.ZERO;
702                } else {
703                    Hdata[i][j] = BigFraction.ONE;
704                }
705            }
706        }
707
708        /*
709         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
710         * hPowers[m-1] = h^m
711         */
712        final BigFraction[] hPowers = new BigFraction[m];
713        hPowers[0] = h;
714        for (int i = 1; i < m; ++i) {
715            hPowers[i] = h.multiply(hPowers[i - 1]);
716        }
717
718        /*
719         * First column and last row has special values (each other reversed).
720         */
721        for (int i = 0; i < m; ++i) {
722            Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
723            Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
724        }
725
726        /*
727         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
728         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
729         */
730        if (h.compareTo(BigFraction.ONE_HALF) == 1) {
731            Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
732        }
733
734        /*
735         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
736         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
737         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
738         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
739         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
740         * really necessary.
741         */
742        for (int i = 0; i < m; ++i) {
743            for (int j = 0; j < i + 1; ++j) {
744                if (i - j + 1 > 0) {
745                    for (int g = 2; g <= i - j + 1; ++g) {
746                        Hdata[i][j] = Hdata[i][j].divide(g);
747                    }
748                }
749            }
750        }
751        return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
752    }
753
754    /***
755     * Creates {@code H} of size {@code m x m} as described in [1] (see above)
756     * using double-precision.
757     *
758     * @param d statistic
759     * @param n sample size
760     * @return H matrix
761     * @throws NumberIsTooLargeException if fractional part is greater than 1
762     */
763    private RealMatrix createRoundedH(double d, int n)
764        throws NumberIsTooLargeException {
765
766        final int k = (int) Math.ceil(n * d);
767        final int m = 2 * k - 1;
768        final double h = k - n * d;
769        if (h >= 1) {
770            throw new NumberIsTooLargeException(h, 1.0, false);
771        }
772        final double[][] Hdata = new double[m][m];
773
774        /*
775         * Start by filling everything with either 0 or 1.
776         */
777        for (int i = 0; i < m; ++i) {
778            for (int j = 0; j < m; ++j) {
779                if (i - j + 1 < 0) {
780                    Hdata[i][j] = 0;
781                } else {
782                    Hdata[i][j] = 1;
783                }
784            }
785        }
786
787        /*
788         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
789         * hPowers[m-1] = h^m
790         */
791        final double[] hPowers = new double[m];
792        hPowers[0] = h;
793        for (int i = 1; i < m; ++i) {
794            hPowers[i] = h * hPowers[i - 1];
795        }
796
797        /*
798         * First column and last row has special values (each other reversed).
799         */
800        for (int i = 0; i < m; ++i) {
801            Hdata[i][0] = Hdata[i][0] - hPowers[i];
802            Hdata[m - 1][i] -= hPowers[m - i - 1];
803        }
804
805        /*
806         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
807         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
808         */
809        if (Double.compare(h, 0.5) > 0) {
810            Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
811        }
812
813        /*
814         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
815         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
816         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
817         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
818         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
819         * really necessary.
820         */
821        for (int i = 0; i < m; ++i) {
822            for (int j = 0; j < i + 1; ++j) {
823                if (i - j + 1 > 0) {
824                    for (int g = 2; g <= i - j + 1; ++g) {
825                        Hdata[i][j] /= g;
826                    }
827                }
828            }
829        }
830        return MatrixUtils.createRealMatrix(Hdata);
831    }
832
833    /**
834     * Verifies that {@code array} has length at least 2.
835     *
836     * @param array array to test
837     * @throws NullArgumentException if array is null
838     * @throws InsufficientDataException if array is too short
839     */
840    private void checkArray(double[] array) {
841        if (array == null) {
842            throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
843        }
844        if (array.length < 2) {
845            throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
846                                                2);
847        }
848    }
849
850    /**
851     * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
852     * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
853     * have been computed. If the sum does not converge before {@code maxIterations} iterations a
854     * {@link TooManyIterationsException} is thrown.
855     *
856     * @param t argument
857     * @param tolerance Cauchy criterion for partial sums
858     * @param maxIterations maximum number of partial sums to compute
859     * @return Kolmogorov sum evaluated at t
860     * @throws TooManyIterationsException if the series does not converge
861     */
862    public double ksSum(double t, double tolerance, int maxIterations) {
863        if (t == 0.0) {
864            return 1.0;
865        }
866
867        // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
868        // from class javadoc should be used.
869
870        final double x = -2 * t * t;
871        int sign = -1;
872        long i = 1;
873        double partialSum = 0.5d;
874        double delta = 1;
875        while (delta > tolerance && i < maxIterations) {
876            delta = FastMath.exp(x * i * i);
877            partialSum += sign * delta;
878            sign *= -1;
879            i++;
880        }
881        if (i == maxIterations) {
882            throw new TooManyIterationsException(maxIterations);
883        }
884        return partialSum * 2;
885    }
886
887    /**
888     * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
889     * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
890     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
891     * <p>
892     * The returned probability is exact, obtained by enumerating all partitions of {@code m + n}
893     * into {@code m} and {@code n} sets, computing \(D_{n,m}\) for each partition and counting the
894     * number of partitions that yield \(D_{n,m}\) values exceeding (resp. greater than or equal to)
895     * {@code d}.
896     * </p>
897     * <p>
898     * <strong>USAGE NOTE</strong>: Since this method enumerates all combinations in \({m+n} \choose
899     * {n}\), it is very slow if called for large {@code m, n}. For this reason,
900     * {@link #kolmogorovSmirnovTest(double[], double[])} uses this only for {@code m * n < }
901     * {@value #SMALL_SAMPLE_PRODUCT}.
902     * </p>
903     *
904     * @param d D-statistic value
905     * @param n first sample size
906     * @param m second sample size
907     * @param strict whether or not the probability to compute is expressed as a strict inequality
908     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
909     *         greater than (resp. greater than or equal to) {@code d}
910     */
911    public double exactP(double d, int n, int m, boolean strict) {
912        Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
913        long tail = 0;
914        final double[] nSet = new double[n];
915        final double[] mSet = new double[m];
916        while (combinationsIterator.hasNext()) {
917            // Generate an n-set
918            final int[] nSetI = combinationsIterator.next();
919            // Copy the n-set to nSet and its complement to mSet
920            int j = 0;
921            int k = 0;
922            for (int i = 0; i < n + m; i++) {
923                if (j < n && nSetI[j] == i) {
924                    nSet[j++] = i;
925                } else {
926                    mSet[k++] = i;
927                }
928            }
929            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
930            if (curD > d) {
931                tail++;
932            } else if (curD == d && !strict) {
933                tail++;
934            }
935        }
936        return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
937    }
938
939    /**
940     * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
941     * is the 2-sample Kolmogorov-Smirnov statistic. See
942     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
943     * <p>
944     * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
945     * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
946     * details on how convergence of the sum is determined. This implementation passes {@code ksSum}
947     * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
948     * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
949     * </p>
950     *
951     * @param d D-statistic value
952     * @param n first sample size
953     * @param m second sample size
954     * @return approximate probability that a randomly selected m-n partition of m + n generates
955     *         \(D_{n,m}\) greater than {@code d}
956     */
957    public double approximateP(double d, int n, int m) {
958        final double dm = m;
959        final double dn = n;
960        return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
961    }
962
963    /**
964     * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the
965     * 2-sample Kolmogorov-Smirnov statistic. See
966     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
967     * <p>
968     * The simulation generates {@code iterations} random partitions of {@code m + n} into an
969     * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
970     * the proportion of values that are greater than {@code d}, or greater than or equal to
971     * {@code d} if {@code strict} is {@code false}.
972     * </p>
973     *
974     * @param d D-statistic value
975     * @param n first sample size
976     * @param m second sample size
977     * @param iterations number of random partitions to generate
978     * @param strict whether or not the probability to compute is expressed as a strict inequality
979     * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
980     *         greater than (resp. greater than or equal to) {@code d}
981     */
982    public double monteCarloP(double d, int n, int m, boolean strict, int iterations) {
983        final int[] nPlusMSet = MathArrays.natural(m + n);
984        final double[] nSet = new double[n];
985        final double[] mSet = new double[m];
986        int tail = 0;
987        for (int i = 0; i < iterations; i++) {
988            copyPartition(nSet, mSet, nPlusMSet, n, m);
989            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
990            if (curD > d) {
991                tail++;
992            } else if (curD == d && !strict) {
993                tail++;
994            }
995            MathArrays.shuffle(nPlusMSet, rng);
996            Arrays.sort(nPlusMSet, 0, n);
997        }
998        return (double) tail / iterations;
999    }
1000
1001    /**
1002     * Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement
1003     * relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and
1004     * {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have
1005     * {@code nSet = [1,4,5], mSet = [0,2,3]}.
1006     * <p>
1007     * <strong>Precondition:</strong> The first {@code n} elements of {@code nSetI} must be sorted
1008     * in ascending order.
1009     * </p>
1010     *
1011     * @param nSet array to fill with the first {@code n} elements of {@code nSetI}
1012     * @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative
1013     *        to {@code m + n}
1014     * @param nSetI array whose first {@code n} elements specify the members of {@code nSet}
1015     * @param n number of elements in the first output array
1016     * @param m number of elements in the second output array
1017     */
1018    private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) {
1019        int j = 0;
1020        int k = 0;
1021        for (int i = 0; i < n + m; i++) {
1022            if (j < n && nSetI[j] == i) {
1023                nSet[j++] = i;
1024            } else {
1025                mSet[k++] = i;
1026            }
1027        }
1028    }
1029}