/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.inference;

import java.math.BigDecimal;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.exception.InsufficientDataException;
import org.apache.commons.math3.exception.MathArithmeticException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.NumberIsTooLargeException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.TooManyIterationsException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.fraction.BigFraction;
import org.apache.commons.math3.fraction.BigFractionField;
import org.apache.commons.math3.fraction.FractionConversionException;
import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.FieldMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.MathArrays;

/**
 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
 * <p>
 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
 * given in [2].
 * </p>
 * <p>
 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])}, works as
 * follows:
 * <ul>
 * <li>For very small samples (where the product of the sample sizes is less than
 * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value for the
 * 2-sample test.</li>
 * <li>For mid-size samples (product of sample sizes greater than or equal to
 * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
 * simulation is used to compute the p-value. The simulation randomly generates partitions of \(m +
 * n\) into an \(m\)-set and an \(n\)-set and reports the proportion that give \(D\) values
 * exceeding the observed value.</li>
 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
 * the approximation.</li>
 * </ul>
 * </p>
 * <p>
 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
 * {@code strict} parameter. This parameter is ignored for large samples.
 * </p>
 * <p>
 * The methods used by the 2-sample default implementation are also exposed directly:
 * <ul>
 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
 * <li>{@link #monteCarloP(double, int, int, boolean, int)} computes 2-sample p-values by Monte
 * Carlo simulation</li>
 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution</li>
 * </ul>
 * The {@code boolean} arguments in the first two methods allow the probability used to estimate
 * the p-value to be expressed using strict or non-strict inequality. See
 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
 * </p>
 * <p>
 * References:
 * <ul>
 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
 * </ul>
 * <br/>
 * Note that [1] contains an error in computing h; refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 * </p>
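 * <p>
 * A minimal one-sample usage sketch (using {@link org.apache.commons.math3.distribution.NormalDistribution}
 * as the reference distribution; the sample values are illustrative only):
 * </p>
 * <pre>{@code
 * double[] sample = {0.12, -0.35, 1.20, 0.83, -1.05, 0.44, -0.21, 0.96};
 * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
 * double d = test.kolmogorovSmirnovStatistic(new NormalDistribution(0, 1), sample);
 * double p = test.kolmogorovSmirnovTest(new NormalDistribution(0, 1), sample);
 * }</pre>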
 *
 * @since 3.3
 * @version $Id: KolmogorovSmirnovTest.html 908881 2014-05-15 07:10:28Z luc $
 */
public class KolmogorovSmirnovTest {

    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
     */
    protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;

    /** Convergence criterion for {@link #ksSum(double, double, int)} */
    protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;

    /** When product of sample sizes is less than this value, 2-sample K-S test is exact */
    protected static final int SMALL_SAMPLE_PRODUCT = 200;

    /**
     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
     * distribution for strict inequality p-value.
     */
    protected static final int LARGE_SAMPLE_PRODUCT = 10000;

    /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)} */
    protected static final int MONTE_CARLO_ITERATIONS = 1000000;

    /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
    private final RandomGenerator rng;

    /**
     * Construct a KolmogorovSmirnovTest instance with a default random data generator.
     */
    public KolmogorovSmirnovTest() {
        rng = new Well19937c();
    }

    /**
     * Construct a KolmogorovSmirnovTest with the provided random data generator.
     *
     * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
     */
    public KolmogorovSmirnovTest(RandomGenerator rng) {
        this.rng = rng;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
     * {@code exact} is true, the distribution used to compute the p-value is computed using
     * extended precision. See {@link #cdfExact(double, int)}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param exact whether or not to force exact computation of the p-value
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
    }

    /**
     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
     * each of the values in {@code data}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @return Kolmogorov-Smirnov statistic \(D_n\)
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
        checkArray(data);
        final int n = data.length;
        final double nd = n;
        final double[] dataCopy = new double[n];
        System.arraycopy(data, 0, dataCopy, 0, n);
        Arrays.sort(dataCopy);
        double d = 0d;
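        // At the i-th order statistic, the empirical cdf jumps from (i - 1) / n to i / n, so the
        // supremum of |F_n(x) - F(x)| is attained at one of the two differences computed below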
        for (int i = 1; i <= n; i++) {
            final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
            final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
            if (currD > d) {
                d = currD;
            }
        }
        return d;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Specifically, what is returned is an estimate of the probability
     * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
     * selected partition of the combined sample into subsamples of sizes {@code x.length} and
     * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
     * large as (if {@code strict} is {@code false}) {@code kolmogorovSmirnovStatistic(x, y)}.
     * <ul>
     * <li>For very small samples (where the product of the sample sizes is less than
     * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
     * is accomplished by enumerating all partitions of the combined sample into two subsamples of
     * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
     * proportion of partitions that give \(D\) values exceeding the observed value.</li>
     * <li>For mid-size samples (product of sample sizes greater than or equal to
     * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
     * simulation is used to compute the p-value. The simulation randomly generates partitions and
     * reports the proportion that give \(D\) values exceeding the observed value.</li>
     * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
     * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
     * for details on the approximation.</li>
     * </ul>
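     * <p>
     * A minimal two-sample usage sketch (the sample arrays are illustrative only):
     * </p>
     * <pre>{@code
     * double[] x = {1.5, 2.1, 0.7, 3.3, 2.8, 1.1};
     * double[] y = {2.0, 0.9, 3.1, 1.8, 2.6, 0.5};
     * double p = new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y, false);
     * }</pre>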
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     *        (ignored for large samples)
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
        if (x.length * y.length < SMALL_SAMPLE_PRODUCT) {
            return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
        }
        if (x.length * y.length < LARGE_SAMPLE_PRODUCT) {
            return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS);
        }
        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Assumes the strict form of the inequality used to compute the
     * p-value. See {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y) {
        return kolmogorovSmirnovTest(x, y, true);
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values.
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = MathArrays.copyOf(x);
        final double[] sy = MathArrays.copyOf(y);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        // Find the max difference between cdf_x and cdf_y
        double supD = 0d;
        // First walk x points
        for (int i = 0; i < n; i++) {
            final double cdf_x = (i + 1d) / n;
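            // binarySearch returns the index of sx[i] in sy if present; otherwise it returns
            // -(insertionPoint) - 1, so (-yIndex - 1) counts the y values strictly below sx[i]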
            final int yIndex = Arrays.binarySearch(sy, sx[i]);
            final double cdf_y = yIndex >= 0 ? (yIndex + 1d) / m : (-yIndex - 1d) / m;
            final double curD = FastMath.abs(cdf_x - cdf_y);
            if (curD > supD) {
                supD = curD;
            }
        }
        // Now look at y
        for (int i = 0; i < m; i++) {
            final double cdf_y = (i + 1d) / m;
            final int xIndex = Arrays.binarySearch(sx, sy[i]);
            final double cdf_x = xIndex >= 0 ? (xIndex + 1d) / n : (-xIndex - 1d) / n;
            final double curD = FastMath.abs(cdf_x - cdf_y);
            if (curD > supD) {
                supD = curD;
            }
        }
        return supD;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
        return kolmogorovSmirnovTest(distribution, data, false);
    }

    /**
     * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
     * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
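     * <p>
     * For example, {@code kolmogorovSmirnovTest(dist, data, 0.05)} returns {@code true} exactly
     * when the p-value returned by {@link #kolmogorovSmirnovTest(RealDistribution, double[])} for
     * {@code dist} and {@code data} is strictly less than 0.05.
     * </p>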
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param alpha significance level of the test
     * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
     *         can be rejected with confidence 1 - {@code alpha}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
        }
        return kolmogorovSmirnovTest(distribution, data) < alpha;
    }

    /**
     * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above). Unlike {@link #cdfExact(double, int)}, the result is not
     * exact because calculations are based on
     * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
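     * <p>
     * For example, {@code new KolmogorovSmirnovTest().cdf(0.5, 10)} evaluates \(P(D_{10} < 0.5)\)
     * in double precision, while {@link #cdfExact(double, int)} performs the same computation
     * using {@link org.apache.commons.math3.fraction.BigFraction} arithmetic (the argument values
     * here are illustrative only).
     * </p>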
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
     */
    public double cdf(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, false);
    }

    /**
     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
     * used everywhere at the expense of very slow execution time. Almost never choose this in real
     * applications unless you are very sure; this is almost solely for verification purposes.
     * Normally, you would choose {@link #cdf(double, int)}. See the class
     * javadoc for definitions and algorithm description.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
     */
    public double cdfExact(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, true);
    }

    /**
     * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @param exact whether the probability should be calculated exactly using
     *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
     *        very slow execution time, or if {@code double} should be used in convenient places to
     *        gain speed. Almost never choose {@code true} in real applications unless you are very
     *        sure; {@code true} is almost solely for verification purposes.
     * @return \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
     */
    public double cdf(double d, int n, boolean exact)
        throws MathArithmeticException {

        final double ninv = 1 / ((double) n);
        final double ninvhalf = 0.5 * ninv;

        if (d <= ninvhalf) {
            return 0;
        } else if (ninvhalf < d && d <= ninv) {
            double res = 1;
            final double f = 2 * d - ninv;
            // n! f^n = n*f * (n-1)*f * ... * 1*f
            for (int i = 1; i <= n; ++i) {
                res *= i * f;
            }
            return res;
        } else if (1 - ninv <= d && d < 1) {
            return 1 - 2 * Math.pow(1 - d, n);
        } else if (1 <= d) {
            return 1;
        }
        return exact ? exactK(d, n) : roundedK(d, n);
    }

    /**
     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
     * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
     * above).
     *
     * @param d statistic
     * @param n sample size
     * @return the two-sided probability of \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
     */
    private double exactK(double d, int n)
        throws MathArithmeticException {

        final int k = (int) Math.ceil(n * d);

        final FieldMatrix<BigFraction> H = this.createH(d, n);
        final FieldMatrix<BigFraction> Hpower = H.power(n);

        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);

        for (int i = 1; i <= n; ++i) {
            pFrac = pFrac.multiply(i).divide(n);
        }

        /*
         * BigFraction.doubleValue converts numerator to double and the denominator to double and
         * divides afterwards. That gives NaN quite easily. This does not (scale is the number of
         * digits):
         */
        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
    }

    /**
     * Calculates {@code P(D_n < d)} using the method described in [1] and doubles (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return the two-sided probability of \(P(D_n < d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
     */
    private double roundedK(double d, int n)
        throws MathArithmeticException {

        final int k = (int) Math.ceil(n * d);
        final FieldMatrix<BigFraction> HBigFraction = this.createH(d, n);
        final int m = HBigFraction.getRowDimension();

        /*
         * Here the rounding part comes into play: use RealMatrix instead of
         * FieldMatrix<BigFraction>
         */
        final RealMatrix H = new Array2DRowRealMatrix(m, m);
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue());
            }
        }
        final RealMatrix Hpower = H.power(n);
        double pFrac = Hpower.getEntry(k - 1, k - 1);
        for (int i = 1; i <= n; ++i) {
            pFrac *= (double) i / (double) n;
        }
        return pFrac;
    }

    /**
     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
     *
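     * <p>
     * Following [1] and the construction below, with \(k = \lceil nd \rceil\), \(m = 2k - 1\) and
     * \(h = k - nd\), the entry in row \(i\), column \(j\) (0-based) is \(0\) when \(i - j + 1 < 0\)
     * and \(1 / (i - j + 1)!\) otherwise, with the first column and the last row adjusted by powers
     * of \(h\) as implemented in the method body. The calling methods then evaluate
     * \(P(D_n < d)\) as the \((k, k)\) entry (1-based) of \(H^n\) scaled by \(n! / n^n\).
     * </p>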
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws NumberIsTooLargeException if fractional part is greater than 1
     * @throws FractionConversionException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
     */
    private FieldMatrix<BigFraction> createH(double d, int n)
        throws NumberIsTooLargeException, FractionConversionException {

        final int k = (int) Math.ceil(n * d);
        final int m = 2 * k - 1;
        final double hDouble = k - n * d;
        if (hDouble >= 1) {
            throw new NumberIsTooLargeException(hDouble, 1.0, false);
        }
        BigFraction h = null;
        try {
            h = new BigFraction(hDouble, 1.0e-20, 10000);
        } catch (final FractionConversionException e1) {
            try {
                h = new BigFraction(hDouble, 1.0e-10, 10000);
            } catch (final FractionConversionException e2) {
                h = new BigFraction(hDouble, 1.0e-5, 10000);
            }
        }
        final BigFraction[][] Hdata = new BigFraction[m][m];

        /*
         * Start by filling everything with either 0 or 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    Hdata[i][j] = BigFraction.ZERO;
                } else {
                    Hdata[i][j] = BigFraction.ONE;
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final BigFraction[] hPowers = new BigFraction[m];
        hPowers[0] = h;
        for (int i = 1; i < m; ++i) {
            hPowers[i] = h.multiply(hPowers[i - 1]);
        }

        /*
         * The first column and the last row have special values (each is the reverse of the other).
         */
        for (int i = 0; i < m; ++i) {
            Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
            Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
         */
        if (h.compareTo(BigFraction.ONE_HALF) > 0) {
            Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if
         * i - j + 1 >= 0, else 0. The 1's and 0's are already in place, so all that remains is to
         * divide the elements holding 1's by (i - j + 1)!. There is no need to compute
         * (i - j + 1)! and then divide - dividing by each factor in turn avoids overflow. Note
         * that i - j + 1 > 0 <=> i + 1 > j, so the inner loop over j only needs to run up to i
         * instead of all the way to m. Also, g starts at 2 because dividing by 1 is unnecessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        Hdata[i][j] = Hdata[i][j].divide(g);
                    }
                }
            }
        }
        return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
    }

    /**
     * Verifies that {@code array} has length at least 2.
     *
     * @param array array to test
     * @throws NullArgumentException if array is null
     * @throws InsufficientDataException if array is too short
     */
    private void checkArray(double[] array) {
        if (array == null) {
            throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
        }
        if (array.length < 2) {
            throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
                                                2);
        }
    }

    /**
     * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
     * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
     * have been computed. If the sum does not converge before {@code maxIterations} iterations a
     * {@link TooManyIterationsException} is thrown.
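     * <p>
     * For example, {@link #approximateP(double, int, int)} calls this method with
     * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
     * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
     * </p>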
     *
     * @param t argument
     * @param tolerance Cauchy criterion for partial sums
     * @param maxIterations maximum number of partial sums to compute
     * @return Kolmogorov sum evaluated at t
     * @throws TooManyIterationsException if the series does not converge
     */
    public double ksSum(double t, double tolerance, int maxIterations) {
        // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
        // from class javadoc should be used.
        final double x = -2 * t * t;
        int sign = -1;
        long i = 1;
        double partialSum = 0.5d;
        double delta = 1;
        while (delta > tolerance && i < maxIterations) {
            delta = FastMath.exp(x * i * i);
            partialSum += sign * delta;
            sign *= -1;
            i++;
        }
        if (i == maxIterations) {
            throw new TooManyIterationsException(maxIterations);
        }
        return partialSum * 2;
    }

    /**
     * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
     * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * The returned probability is exact, obtained by enumerating all partitions of {@code m + n}
     * into {@code m} and {@code n} sets, computing \(D_{n,m}\) for each partition and counting the
     * number of partitions that yield \(D_{n,m}\) values exceeding (resp. greater than or equal to)
     * {@code d}.
     * </p>
     * <p>
     * <strong>USAGE NOTE</strong>: Since this method enumerates all combinations in \({m+n} \choose
     * {n}\), it is very slow if called for large {@code m, n}. For this reason,
     * {@link #kolmogorovSmirnovTest(double[], double[])} uses this only for {@code m * n < }
     * {@value #SMALL_SAMPLE_PRODUCT}.
     * </p>
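     * <p>
     * For example, with {@code n = m = 5}, {@code exactP(d, 5, 5, true)} enumerates all
     * \({10} \choose {5}\) = 252 partitions of the 10 combined observations and returns the
     * fraction whose \(D_{5,5}\) value strictly exceeds {@code d}.
     * </p>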
     *
     * @param d D-statistic value
     * @param n first sample size
     * @param m second sample size
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
     *         greater than (resp. greater than or equal to) {@code d}
     */
    public double exactP(double d, int n, int m, boolean strict) {
        Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
        long tail = 0;
        final double[] nSet = new double[n];
        final double[] mSet = new double[m];
        while (combinationsIterator.hasNext()) {
            // Generate an n-set
            final int[] nSetI = combinationsIterator.next();
            // Copy the n-set to nSet and its complement to mSet
            int j = 0;
            int k = 0;
            for (int i = 0; i < n + m; i++) {
                if (j < n && nSetI[j] == i) {
                    nSet[j++] = i;
                } else {
                    mSet[k++] = i;
                }
            }
            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
            if (curD > d) {
                tail++;
            } else if (curD == d && !strict) {
                tail++;
            }
        }
        return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
    }

    /**
     * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
     * is the 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
     * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
     * details on how convergence of the sum is determined. This implementation passes {@code ksSum}
     * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
     * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
     * </p>
     *
     * @param d D-statistic value
     * @param n first sample size
     * @param m second sample size
     * @return approximate probability that a randomly selected m-n partition of m + n generates
     *         \(D_{n,m}\) greater than {@code d}
     */
    public double approximateP(double d, int n, int m) {
        final double dm = m;
        final double dn = n;
        return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
    }

    /**
     * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the
     * 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * The simulation generates {@code iterations} random partitions of {@code m + n} into an
     * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
     * the proportion of values that are greater than {@code d}, or greater than or equal to
     * {@code d} if {@code strict} is {@code false}.
     * </p>
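     * <p>
     * For reference, {@link #kolmogorovSmirnovTest(double[], double[], boolean)} calls this method
     * with {@value #MONTE_CARLO_ITERATIONS} iterations for mid-size samples.
     * </p>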
     *
     * @param d D-statistic value
     * @param n first sample size
     * @param m second sample size
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     * @param iterations number of random partitions to generate
     * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
     *         greater than (resp. greater than or equal to) {@code d}
     */
    public double monteCarloP(double d, int n, int m, boolean strict, int iterations) {
        final int[] nPlusMSet = MathArrays.natural(m + n);
        final double[] nSet = new double[n];
        final double[] mSet = new double[m];
        int tail = 0;
        for (int i = 0; i < iterations; i++) {
            // Generate a random n-m partition: shuffle the combined index set and sort the first
            // n entries so that copyPartition's precondition (ascending order) is satisfied
            MathArrays.shuffle(nPlusMSet, rng);
            Arrays.sort(nPlusMSet, 0, n);
            copyPartition(nSet, mSet, nPlusMSet, n, m);
            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
            if (curD > d) {
                tail++;
            } else if (curD == d && !strict) {
                tail++;
            }
        }
        return (double) tail / iterations;
    }

    /**
     * Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement
     * relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and
     * {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have
     * {@code nSet = [1,4,5], mSet = [0,2,3]}.
     * <p>
     * <strong>Precondition:</strong> The first {@code n} elements of {@code nSetI} must be sorted
     * in ascending order.
     * </p>
     *
     * @param nSet array to fill with the first {@code n} elements of {@code nSetI}
     * @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative
     *        to {@code m + n}
     * @param nSetI array whose first {@code n} elements specify the members of {@code nSet}
     * @param n number of elements in the first output array
     * @param m number of elements in the second output array
     */
    private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) {
        int j = 0;
        int k = 0;
        for (int i = 0; i < n + m; i++) {
            if (j < n && nSetI[j] == i) {
                nSet[j++] = i;
            } else {
                mSet[k++] = i;
            }
        }
    }
}