KolmogorovSmirnovTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.math4.legacy.stat.inference;

  18. import java.math.RoundingMode;
  19. import java.util.Arrays;

  20. import org.apache.commons.rng.simple.RandomSource;
  21. import org.apache.commons.rng.UniformRandomProvider;
  22. import org.apache.commons.statistics.distribution.ContinuousDistribution;
  23. import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
  24. import org.apache.commons.numbers.fraction.BigFraction;
  25. import org.apache.commons.numbers.field.BigFractionField;
  26. import org.apache.commons.math4.legacy.distribution.EnumeratedRealDistribution;
  27. import org.apache.commons.math4.legacy.distribution.AbstractRealDistribution;
  28. import org.apache.commons.math4.legacy.exception.InsufficientDataException;
  29. import org.apache.commons.math4.legacy.exception.MathArithmeticException;
  30. import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
  31. import org.apache.commons.math4.legacy.exception.NullArgumentException;
  32. import org.apache.commons.math4.legacy.exception.NumberIsTooLargeException;
  33. import org.apache.commons.math4.legacy.exception.OutOfRangeException;
  34. import org.apache.commons.math4.legacy.exception.TooManyIterationsException;
  35. import org.apache.commons.math4.legacy.exception.NotANumberException;
  36. import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
  37. import org.apache.commons.math4.legacy.linear.MatrixUtils;
  38. import org.apache.commons.math4.legacy.linear.RealMatrix;
  39. import org.apache.commons.math4.core.jdkmath.JdkMath;
  40. import org.apache.commons.math4.legacy.core.MathArrays;
  41. import org.apache.commons.math4.legacy.field.linalg.FieldDenseMatrix;

  42. /**
  43.  * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
  44.  * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
  45.  * <p>
  46.  * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
  47.  * sample data points from the distribution expected under the null hypothesis. For one-sample tests
  48.  * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
  49.  * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
  50.  * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
  51.  * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
  52.  * given in [2].
  53.  * </p>
  54.  * <p>
  55.  * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
  56.  * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
  57.  * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
  58.  * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
  59.  * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
  60.  * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
  61.  * follows:
  62.  * <ul>
  63.  * <li>When the product of the sample sizes is less than 10000, the method presented in [4]
  64.  * is used to compute the exact p-value for the 2-sample test.</li>
  65.  * <li>When the product of the sample sizes is larger, the asymptotic
  66.  * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
  67.  * the approximation.</li>
  68.  * </ul><p>
  69.  * For small samples (former case), if the data contains ties, random jitter is added
  70.  * to the sample data to break ties before applying the algorithm above. Alternatively,
  71.  * the {@link #bootstrap(double[],double[],int,boolean,UniformRandomProvider)}
  72.  * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
  73.  * in the R Matching package [3], can be used if ties are known to be present in the data.
  74.  * </p>
  75.  * <p>
  76.  * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
  77.  * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} \ge d \)
  78.  * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
  79.  * {@code strict} parameter. This parameter is ignored for large samples.
  80.  * </p>
  81.  * <p>
  82.  * The methods used by the 2-sample default implementation are also exposed directly:
  83.  * <ul>
  84.  * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
  85.  * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean}
  86.  * arguments in the first two methods allow the probability used to estimate the p-value to be
  87.  * expressed using strict or non-strict inequality. See
  88.  * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
  89.  * </ul>
  90.  * <p>
  91.  * References:
  92.  * <ul>
  93.  * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
  94.  * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
  95.  * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
  96.  * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
  97.  * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
  98.  * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
  99.  * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
  100.  * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
  101.  * Chapter 5, 3rd Ed. Academic Press.</li>
  102.  * </ul>
  103.  * <br>
  104.  * Note that [1] contains an error in computing h, refer to <a
  105.  * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
  106.  *
  107.  * @since 3.3
  108.  */
public class KolmogorovSmirnovTest {
    /** Cached value of pi^2, used repeatedly by the Pelz-Good approximation. */
    private static final double PI_SQUARED = JdkMath.PI * JdkMath.PI;
    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}.
     * Exceeding it raises a {@link TooManyIterationsException}.
     */
    private static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
    /** Convergence criterion for {@link #ksSum(double, double, int)}. */
    private static final double KS_SUM_CAUCHY_CRITERION = 1e-20;
    /** Convergence criterion for the sums in {@link #pelzGood(double, int)}. */
    private static final double PG_SUM_RELATIVE_ERROR = 1e-10;
    /** The fraction 1/2, kept as a constant to avoid repeated BigFraction construction. */
    private static final BigFraction ONE_HALF = BigFraction.of(1, 2);

    /**
     * When the product of the sample sizes meets or exceeds this value, the 2-sample K-S test
     * uses the asymptotic distribution to compute the p-value; below it the exact
     * distribution is used (see {@link #kolmogorovSmirnovTest(double[], double[], boolean)}).
     */
    private static final int LARGE_SAMPLE_PRODUCT = 10000;

  127.     /**
  128.      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
  129.      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
  130.      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
  131.      * {@code exact} is true, the distribution used to compute the p-value is computed using
  132.      * extended precision. See {@link #cdfExact(double, int)}.
  133.      *
  134.      * @param distribution reference distribution
  135.      * @param data sample being being evaluated
  136.      * @param exact whether or not to force exact computation of the p-value
  137.      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
  138.      *         {@code distribution}
  139.      * @throws InsufficientDataException if {@code data} does not have length at least 2
  140.      * @throws NullArgumentException if {@code data} is null
  141.      */
  142.     public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, boolean exact) {
  143.         return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
  144.     }

  145.     /**
  146.      * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
  147.      * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
  148.      * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
  149.      * each of the values in {@code data}.
  150.      *
  151.      * @param distribution reference distribution
  152.      * @param data sample being evaluated
  153.      * @return Kolmogorov-Smirnov statistic \(D_n\)
  154.      * @throws InsufficientDataException if {@code data} does not have length at least 2
  155.      * @throws NullArgumentException if {@code data} is null
  156.      */
  157.     public double kolmogorovSmirnovStatistic(ContinuousDistribution distribution, double[] data) {
  158.         checkArray(data);
  159.         final int n = data.length;
  160.         final double nd = n;
  161.         final double[] dataCopy = new double[n];
  162.         System.arraycopy(data, 0, dataCopy, 0, n);
  163.         Arrays.sort(dataCopy);
  164.         double d = 0d;
  165.         for (int i = 1; i <= n; i++) {
  166.             final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
  167.             final double currD = JdkMath.max(yi - (i - 1) / nd, i / nd - yi);
  168.             if (currD > d) {
  169.                 d = currD;
  170.             }
  171.         }
  172.         return d;
  173.     }

  174.     /**
  175.      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
  176.      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
  177.      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
  178.      * probability distribution. Specifically, what is returned is an estimate of the probability
  179.      * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
  180.      * selected partition of the combined sample into subsamples of sizes {@code x.length} and
  181.      * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
  182.      * large as (if {@code strict} is {@code false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
  183.      *
  184.      * @param x first sample dataset.
  185.      * @param y second sample dataset.
  186.      * @param strict whether or not the probability to compute is expressed as
  187.      * a strict inequality (ignored for large samples).
  188.      * @return p-value associated with the null hypothesis that {@code x} and
  189.      * {@code y} represent samples from the same distribution.
  190.      * @throws InsufficientDataException if either {@code x} or {@code y} does
  191.      * not have length at least 2.
  192.      * @throws NullArgumentException if either {@code x} or {@code y} is null.
  193.      * @throws NotANumberException if the input arrays contain NaN values.
  194.      *
  195.      * @see #bootstrap(double[],double[],int,boolean,UniformRandomProvider)
  196.      */
  197.     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
  198.         final long lengthProduct = (long) x.length * y.length;
  199.         double[] xa = null;
  200.         double[] ya = null;
  201.         if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
  202.             xa = Arrays.copyOf(x, x.length);
  203.             ya = Arrays.copyOf(y, y.length);
  204.             fixTies(xa, ya);
  205.         } else {
  206.             xa = x;
  207.             ya = y;
  208.         }
  209.         if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
  210.             return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
  211.         }
  212.         return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
  213.     }

  214.     /**
  215.      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
  216.      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
  217.      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
  218.      * probability distribution. Assumes the strict form of the inequality used to compute the
  219.      * p-value. See {@link #kolmogorovSmirnovTest(ContinuousDistribution, double[], boolean)}.
  220.      *
  221.      * @param x first sample dataset
  222.      * @param y second sample dataset
  223.      * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
  224.      *         samples from the same distribution
  225.      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
  226.      *         least 2
  227.      * @throws NullArgumentException if either {@code x} or {@code y} is null
  228.      */
  229.     public double kolmogorovSmirnovTest(double[] x, double[] y) {
  230.         return kolmogorovSmirnovTest(x, y, true);
  231.     }

  232.     /**
  233.      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
  234.      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
  235.      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
  236.      * is the empirical distribution of the {@code y} values.
  237.      *
  238.      * @param x first sample
  239.      * @param y second sample
  240.      * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
  241.      *         {@code y} represent samples from the same underlying distribution
  242.      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
  243.      *         least 2
  244.      * @throws NullArgumentException if either {@code x} or {@code y} is null
  245.      */
  246.     public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
  247.         return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
  248.     }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
     * as long value.
     *
     * <p>Scaling by \(n m\) keeps the computation in exact integer arithmetic: each step of
     * the empirical cdf of {@code x} contributes \(m\) and each step of {@code y}
     * contributes \(n\), so no floating-point rounding can affect the supremum.</p>
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = Arrays.copyOf(x, x.length);
        final double[] sy = Arrays.copyOf(y, y.length);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        // rankX/rankY count how many elements of each sorted sample have been consumed;
        // curD tracks m*(#x values <= z) - n*(#y values <= z), i.e. n*m*(F_n(z) - F_m(z)).
        int rankX = 0;
        int rankY = 0;
        long curD = 0L;

        // Find the max difference between cdf_x and cdf_y
        long supD = 0L;
        do {
            // z is the smallest value not yet processed from either sample
            // (Double.compare gives a total order, handling -0.0 consistently).
            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
            // Consume every occurrence of z in each sample so ties advance both
            // cdfs before the difference is examined.
            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                rankX += 1;
                curD += m;
            }
            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                rankY += 1;
                curD -= n;
            }
            // Track the supremum of |curD|.
            if (curD > supD) {
                supD = curD;
            } else if (-curD > supD) {
                supD = -curD;
            }
        } while(rankX < n && rankY < m);
        return supD;
    }

  297.     /**
  298.      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
  299.      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
  300.      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
  301.      *
  302.      * @param distribution reference distribution
  303.      * @param data sample being being evaluated
  304.      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
  305.      *         {@code distribution}
  306.      * @throws InsufficientDataException if {@code data} does not have length at least 2
  307.      * @throws NullArgumentException if {@code data} is null
  308.      */
  309.     public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data) {
  310.         return kolmogorovSmirnovTest(distribution, data, false);
  311.     }

  312.     /**
  313.      * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
  314.      * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
  315.      *
  316.      * @param distribution reference distribution
  317.      * @param data sample being being evaluated
  318.      * @param alpha significance level of the test
  319.      * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
  320.      *         can be rejected with confidence 1 - {@code alpha}
  321.      * @throws InsufficientDataException if {@code data} does not have length at least 2
  322.      * @throws NullArgumentException if {@code data} is null
  323.      */
  324.     public boolean kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, double alpha) {
  325.         if (alpha <= 0 || alpha > 0.5) {
  326.             throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
  327.         }
  328.         return kolmogorovSmirnovTest(distribution, data) < alpha;
  329.     }

  330.     /**
  331.      * Estimates the <i>p-value</i> of a two-sample
  332.      * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov test</a>
  333.      * evaluating the null hypothesis that {@code x} and {@code y} are samples
  334.      * drawn from the same probability distribution.
  335.      * This method estimates the p-value by repeatedly sampling sets of size
  336.      * {@code x.length} and {@code y.length} from the empirical distribution
  337.      * of the combined sample.
  338.      * When {@code strict} is true, this is equivalent to the algorithm implemented
  339.      * in the R function {@code ks.boot}, described in <pre>
  340.      * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
  341.      * Software with Automated Balance Optimization: The Matching package for R.'
  342.      * Journal of Statistical Software, 42(7): 1-52.
  343.      * </pre>
  344.      *
  345.      * @param x First sample.
  346.      * @param y Second sample.
  347.      * @param iterations Number of bootstrap resampling iterations.
  348.      * @param strict Whether or not the null hypothesis is expressed as a strict inequality.
  349.      * @param rng RNG for creating the sampling sets.
  350.      * @return the estimated p-value.
  351.      */
  352.     public double bootstrap(double[] x,
  353.                             double[] y,
  354.                             int iterations,
  355.                             boolean strict,
  356.                             UniformRandomProvider rng) {
  357.         final int xLength = x.length;
  358.         final int yLength = y.length;
  359.         final double[] combined = new double[xLength + yLength];
  360.         System.arraycopy(x, 0, combined, 0, xLength);
  361.         System.arraycopy(y, 0, combined, xLength, yLength);
  362.         final ContinuousDistribution.Sampler sampler = new EnumeratedRealDistribution(combined).createSampler(rng);
  363.         final long d = integralKolmogorovSmirnovStatistic(x, y);
  364.         int greaterCount = 0;
  365.         int equalCount = 0;
  366.         double[] curX;
  367.         double[] curY;
  368.         long curD;
  369.         for (int i = 0; i < iterations; i++) {
  370.             curX = AbstractRealDistribution.sample(xLength, sampler);
  371.             curY = AbstractRealDistribution.sample(yLength, sampler);
  372.             curD = integralKolmogorovSmirnovStatistic(curX, curY);
  373.             if (curD > d) {
  374.                 greaterCount++;
  375.             } else if (curD == d) {
  376.                 equalCount++;
  377.             }
  378.         }
  379.         return strict ? greaterCount / (double) iterations :
  380.             (greaterCount + equalCount) / (double) iterations;
  381.     }

  382.     /**
  383.      * Calculates \(P(D_n &lt; d)\) using the method described in [1] with quick decisions for extreme
  384.      * values given in [2] (see above). The result is not exact as with
  385.      * {@link #cdfExact(double, int)} because calculations are based on
  386.      * {@code double} rather than {@link BigFraction}.
  387.      *
  388.      * @param d statistic
  389.      * @param n sample size
  390.      * @return \(P(D_n &lt; d)\)
  391.      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
  392.      * {@link BigFraction} in expressing {@code d} as
  393.      * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
  394.      */
  395.     public double cdf(double d, int n) {
  396.         return cdf(d, n, false);
  397.     }

  398.     /**
  399.      * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
  400.      * used everywhere at the expense of very slow execution time. Almost never choose this in real
  401.      * applications unless you are very sure; this is almost solely for verification purposes.
  402.      * Normally, you would choose {@link #cdf(double, int)}. See the class
  403.      * javadoc for definitions and algorithm description.
  404.      *
  405.      * @param d statistic
  406.      * @param n sample size
  407.      * @return \(P(D_n &lt; d)\)
  408.      * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
  409.      * {@link BigFraction} in expressing {@code d} as
  410.      * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
  411.      */
  412.     public double cdfExact(double d, int n) {
  413.         return cdf(d, n, true);
  414.     }

  415.     /**
  416.      * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
  417.      * values given in [2] (see above).
  418.      *
  419.      * @param d statistic
  420.      * @param n sample size
  421.      * @param exact whether the probability should be calculated exact using
  422.      *        {@link BigFraction} everywhere at the expense of
  423.      *        very slow execution time, or if {@code double} should be used convenient places to
  424.      *        gain speed. Almost never choose {@code true} in real applications unless you are very
  425.      *        sure; {@code true} is almost solely for verification purposes.
  426.      * @return \(P(D_n &lt; d)\)
  427.      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
  428.      * {@link BigFraction} in expressing {@code d} as
  429.      * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
  430.      */
  431.     public double cdf(double d, int n, boolean exact) {
  432.         final double ninv = 1 / ((double) n);
  433.         final double ninvhalf = 0.5 * ninv;

  434.         if (d <= ninvhalf) {
  435.             return 0;
  436.         } else if (ninvhalf < d && d <= ninv) {
  437.             double res = 1;
  438.             final double f = 2 * d - ninv;
  439.             // n! f^n = n*f * (n-1)*f * ... * 1*x
  440.             for (int i = 1; i <= n; ++i) {
  441.                 res *= i * f;
  442.             }
  443.             return res;
  444.         } else if (1 - ninv <= d && d < 1) {
  445.             return 1 - 2 * Math.pow(1 - d, n);
  446.         } else if (1 <= d) {
  447.             return 1;
  448.         }
  449.         if (exact) {
  450.             return exactK(d, n);
  451.         }
  452.         if (n <= 140) {
  453.             return roundedK(d, n);
  454.         }
  455.         return pelzGood(d, n);
  456.     }

  457.     /**
  458.      * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
  459.      * in class javadoc above) and {@link BigFraction} (see above).
  460.      *
  461.      * @param d statistic
  462.      * @param n sample size
  463.      * @return the two-sided probability of \(P(D_n &lt; d)\)
  464.      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
  465.      * {@link BigFraction}.
  466.      */
  467.     private double exactK(double d, int n) {
  468.         final int k = (int) Math.ceil(n * d);

  469.         final FieldDenseMatrix<BigFraction> h;
  470.         try {
  471.             h = createExactH(d, n);
  472.         } catch (ArithmeticException e) {
  473.             throw new MathArithmeticException(LocalizedFormats.FRACTION);
  474.         }

  475.         final FieldDenseMatrix<BigFraction> hPower = h.pow(n);

  476.         BigFraction pFrac = hPower.get(k - 1, k - 1);

  477.         for (int i = 1; i <= n; ++i) {
  478.             pFrac = pFrac.multiply(i).divide(n);
  479.         }

  480.         /*
  481.          * BigFraction.doubleValue converts numerator to double and the denominator to double and
  482.          * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
  483.          * digits):
  484.          */
  485.         return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue();
  486.     }

  487.     /**
  488.      * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
  489.      *
  490.      * @param d statistic
  491.      * @param n sample size
  492.      * @return \(P(D_n &lt; d)\)
  493.      */
  494.     private double roundedK(double d, int n) {

  495.         final int k = (int) Math.ceil(n * d);
  496.         final RealMatrix h = this.createRoundedH(d, n);
  497.         final RealMatrix hPower = h.power(n);

  498.         double pFrac = hPower.getEntry(k - 1, k - 1);
  499.         for (int i = 1; i <= n; ++i) {
  500.             pFrac *= (double) i / (double) n;
  501.         }

  502.         return pFrac;
  503.     }

  504.     /**
  505.      * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
  506.      *
  507.      * @param d value of d-statistic (x in [2])
  508.      * @param n sample size
  509.      * @return \(P(D_n &lt; d)\)
  510.      * @since 3.4
  511.      */
  512.     public double pelzGood(double d, int n) {
  513.         // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
  514.         final double sqrtN = JdkMath.sqrt(n);
  515.         final double z = d * sqrtN;
  516.         final double z2 = d * d * n;
  517.         final double z4 = z2 * z2;
  518.         final double z6 = z4 * z2;
  519.         final double z8 = z4 * z4;

  520.         // Eventual return value
  521.         double ret = 0;

  522.         // Compute K_0(z)
  523.         double sum = 0;
  524.         double increment = 0;
  525.         double kTerm = 0;
  526.         double z2Term = PI_SQUARED / (8 * z2);
  527.         int k = 1;
  528.         for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
  529.             kTerm = 2 * k - 1;
  530.             increment = JdkMath.exp(-z2Term * kTerm * kTerm);
  531.             sum += increment;
  532.             if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
  533.                 break;
  534.             }
  535.         }
  536.         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
  537.             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
  538.         }
  539.         ret = sum * JdkMath.sqrt(2 * JdkMath.PI) / z;

  540.         // K_1(z)
  541.         // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
  542.         // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
  543.         final double twoZ2 = 2 * z2;
  544.         sum = 0;
  545.         kTerm = 0;
  546.         double kTerm2 = 0;
  547.         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
  548.             kTerm = k + 0.5;
  549.             kTerm2 = kTerm * kTerm;
  550.             increment = (PI_SQUARED * kTerm2 - z2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
  551.             sum += increment;
  552.             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
  553.                 break;
  554.             }
  555.         }
  556.         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
  557.             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
  558.         }
  559.         final double sqrtHalfPi = JdkMath.sqrt(JdkMath.PI / 2);
  560.         // Instead of doubling sum, divide by 3 instead of 6
  561.         ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

  562.         // K_2(z)
  563.         // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
  564.         final double z4Term = 2 * z4;
  565.         final double z6Term = 6 * z6;
  566.         z2Term = 5 * z2;
  567.         final double pi4 = PI_SQUARED * PI_SQUARED;
  568.         sum = 0;
  569.         kTerm = 0;
  570.         kTerm2 = 0;
  571.         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
  572.             kTerm = k + 0.5;
  573.             kTerm2 = kTerm * kTerm;
  574.             increment =  (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
  575.                     pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
  576.             sum += increment;
  577.             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
  578.                 break;
  579.             }
  580.         }
  581.         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
  582.             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
  583.         }
  584.         double sum2 = 0;
  585.         kTerm2 = 0;
  586.         for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
  587.             kTerm2 = (double) k * k;
  588.             increment = PI_SQUARED * kTerm2 * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
  589.             sum2 += increment;
  590.             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
  591.                 break;
  592.             }
  593.         }
  594.         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
  595.             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
  596.         }
  597.         // Again, adjust coefficients instead of doubling sum, sum2
  598.         ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

  599.         // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
  600.         // Multiply coefficient denominators by 2, so omit doubling sums.
  601.         final double pi6 = pi4 * PI_SQUARED;
  602.         sum = 0;
  603.         double kTerm4 = 0;
  604.         double kTerm6 = 0;
  605.         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
  606.             kTerm = k + 0.5;
  607.             kTerm2 = kTerm * kTerm;
  608.             kTerm4 = kTerm2 * kTerm2;
  609.             kTerm6 = kTerm4 * kTerm2;
  610.             increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
  611.                             PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
  612.                     JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
  613.             sum += increment;
  614.             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
  615.                 break;
  616.             }
  617.         }
  618.         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
  619.             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
  620.         }
  621.         sum2 = 0;
  622.         for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
  623.             kTerm2 = (double) k * k;
  624.             kTerm4 = kTerm2 * kTerm2;
  625.             increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
  626.                     JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
  627.             sum2 += increment;
  628.             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
  629.                 break;
  630.             }
  631.         }
  632.         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
  633.             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
  634.         }
  635.         return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
  636.                 sum2 / (108 * z6));
  637.     }

    /**
     * Creates {@code H} of size {@code m x m} as described in [1] (see above),
     * using exact {@link BigFraction} arithmetic.
     *
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws NumberIsTooLargeException if fractional part is greater than 1.
     * @throws ArithmeticException if algorithm fails to convert {@code h} to a
     * {@link BigFraction}.
     */
    private FieldDenseMatrix<BigFraction> createExactH(double d,
                                                       int n) {
        final int k = (int) Math.ceil(n * d);
        // H is a (2k - 1) x (2k - 1) matrix.
        final int m = 2 * k - 1;
        // Fractional part; the construction below requires 0 <= h < 1.
        final double hDouble = k - n * d;
        if (hDouble >= 1) {
            throw new NumberIsTooLargeException(hDouble, 1.0, false);
        }
        BigFraction h = null;
        // Convert h to an exact fraction, progressively relaxing the
        // approximation tolerance (1e-20, then 1e-10, then 1e-5) when the
        // tighter conversion fails to converge within 10000 iterations.
        try {
            h = BigFraction.from(hDouble, 1e-20, 10000);
        } catch (final ArithmeticException e1) {
            try {
                h = BigFraction.from(hDouble, 1e-10, 10000);
            } catch (final ArithmeticException e2) {
                h = BigFraction.from(hDouble, 1e-5, 10000);
            }
        }
        final FieldDenseMatrix<BigFraction> hData = FieldDenseMatrix.create(BigFractionField.get(), m, m);

        /*
         * Start by filling everything with either 0 or 1: entries above the
         * first superdiagonal (i - j + 1 < 0) are 0, all others 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    hData.set(i, j, BigFraction.ZERO);
                } else {
                    hData.set(i, j, BigFraction.ONE);
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final BigFraction[] hPowers = new BigFraction[m];
        hPowers[0] = h;
        for (int i = 1; i < m; i++) {
            hPowers[i] = h.multiply(hPowers[i - 1]);
        }

        /*
         * First column and last row has special values (each other reversed):
         * entry (i, 0) is reduced by h^(i+1), entry (m-1, i) by h^(m-i).
         */
        for (int i = 0; i < m; ++i) {
            hData.set(i, 0,
                      hData.get(i, 0).subtract(hPowers[i]));
            hData.set(m - 1, i,
                      hData.get(m - 1, i).subtract(hPowers[m - i - 1]));
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check.
         * (The division by m! is applied by the factorial loop below.)
         */
        if (h.compareTo(ONE_HALF) > 0) {
            hData.set(m - 1, 0,
                      hData.get(m - 1, 0).add(h.multiply(2).subtract(1).pow(m)));
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
         * really necessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        hData.set(i, j,
                                  hData.get(i, j).divide(g));
                    }
                }
            }
        }
        return hData;
    }

  725.     /***
  726.      * Creates {@code H} of size {@code m x m} as described in [1] (see above)
  727.      * using double-precision.
  728.      *
  729.      * @param d statistic
  730.      * @param n sample size
  731.      * @return H matrix
  732.      * @throws NumberIsTooLargeException if fractional part is greater than 1
  733.      */
  734.     private RealMatrix createRoundedH(double d, int n)
  735.         throws NumberIsTooLargeException {

  736.         final int k = (int) Math.ceil(n * d);
  737.         final int m = 2 * k - 1;
  738.         final double h = k - n * d;
  739.         if (h >= 1) {
  740.             throw new NumberIsTooLargeException(h, 1.0, false);
  741.         }
  742.         final double[][] hData = new double[m][m];

  743.         /*
  744.          * Start by filling everything with either 0 or 1.
  745.          */
  746.         for (int i = 0; i < m; ++i) {
  747.             for (int j = 0; j < m; ++j) {
  748.                 if (i - j + 1 < 0) {
  749.                     hData[i][j] = 0;
  750.                 } else {
  751.                     hData[i][j] = 1;
  752.                 }
  753.             }
  754.         }

  755.         /*
  756.          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
  757.          * hPowers[m-1] = h^m
  758.          */
  759.         final double[] hPowers = new double[m];
  760.         hPowers[0] = h;
  761.         for (int i = 1; i < m; ++i) {
  762.             hPowers[i] = h * hPowers[i - 1];
  763.         }

  764.         /*
  765.          * First column and last row has special values (each other reversed).
  766.          */
  767.         for (int i = 0; i < m; ++i) {
  768.             hData[i][0] = hData[i][0] - hPowers[i];
  769.             hData[m - 1][i] -= hPowers[m - i - 1];
  770.         }

  771.         /*
  772.          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
  773.          * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
  774.          */
  775.         if (Double.compare(h, 0.5) > 0) {
  776.             hData[m - 1][0] += JdkMath.pow(2 * h - 1, m);
  777.         }

  778.         /*
  779.          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
  780.          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
  781.          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
  782.          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
  783.          * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
  784.          * really necessary.
  785.          */
  786.         for (int i = 0; i < m; ++i) {
  787.             for (int j = 0; j < i + 1; ++j) {
  788.                 if (i - j + 1 > 0) {
  789.                     for (int g = 2; g <= i - j + 1; ++g) {
  790.                         hData[i][j] /= g;
  791.                     }
  792.                 }
  793.             }
  794.         }
  795.         return MatrixUtils.createRealMatrix(hData);
  796.     }

  797.     /**
  798.      * Verifies that {@code array} has length at least 2.
  799.      *
  800.      * @param array array to test
  801.      * @throws NullArgumentException if array is null
  802.      * @throws InsufficientDataException if array is too short
  803.      */
  804.     private void checkArray(double[] array) {
  805.         if (array == null) {
  806.             throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
  807.         }
  808.         if (array.length < 2) {
  809.             throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
  810.                                                 2);
  811.         }
  812.     }

  813.     /**
  814.      * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
  815.      * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
  816.      * have been computed. If the sum does not converge before {@code maxIterations} iterations a
  817.      * {@link TooManyIterationsException} is thrown.
  818.      *
  819.      * @param t argument
  820.      * @param tolerance Cauchy criterion for partial sums
  821.      * @param maxIterations maximum number of partial sums to compute
  822.      * @return Kolmogorov sum evaluated at t
  823.      * @throws TooManyIterationsException if the series does not converge
  824.      */
  825.     public double ksSum(double t, double tolerance, int maxIterations) {
  826.         if (t == 0.0) {
  827.             return 0.0;
  828.         }

  829.         // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
  830.         // from class javadoc should be used.

  831.         final double x = -2 * t * t;
  832.         int sign = -1;
  833.         long i = 1;
  834.         double partialSum = 0.5d;
  835.         double delta = 1;
  836.         while (delta > tolerance && i < maxIterations) {
  837.             delta = JdkMath.exp(x * i * i);
  838.             partialSum += sign * delta;
  839.             sign *= -1;
  840.             i++;
  841.         }
  842.         if (i == maxIterations) {
  843.             throw new TooManyIterationsException(maxIterations);
  844.         }
  845.         return partialSum * 2;
  846.     }

  847.     /**
  848.      * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
  849.      * an integral d-statistic in the range [0, n*m] is calculated, that can be used for
  850.      * comparison with other integral d-statistics. Depending whether {@code strict} is
  851.      * {@code true} or not, the returned value divided by (n*m) is greater than
  852.      * (resp greater than or equal to) the given d value (allowing some tolerance).
  853.      *
  854.      * @param d a d-statistic in the range [0, 1]
  855.      * @param n first sample size
  856.      * @param m second sample size
  857.      * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
  858.      * @return the integral d-statistic in the range [0, n*m]
  859.      */
  860.     private static long calculateIntegralD(double d, int n, int m, boolean strict) {
  861.         final double tol = 1e-12;  // d-values within tol of one another are considered equal
  862.         long nm = n * (long)m;
  863.         long upperBound = (long)JdkMath.ceil((d - tol) * nm);
  864.         long lowerBound = (long)JdkMath.floor((d + tol) * nm);
  865.         if (strict && lowerBound == upperBound) {
  866.             return upperBound + 1L;
  867.         } else {
  868.             return upperBound;
  869.         }
  870.     }

  871.     /**
  872.      * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
  873.      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
  874.      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
  875.      * <p>
  876.      * The returned probability is exact, implemented by unwinding the recursive function
  877.      * definitions presented in [4] (class javadoc).
  878.      * </p>
  879.      *
  880.      * @param d D-statistic value
  881.      * @param n first sample size
  882.      * @param m second sample size
  883.      * @param strict whether or not the probability to compute is expressed as a strict inequality
  884.      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
  885.      *         greater than (resp. greater than or equal to) {@code d}
  886.      */
  887.     public double exactP(double d, int n, int m, boolean strict) {
  888.        return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
  889.            BinomialCoefficientDouble.value(n + m, m);
  890.     }

  891.     /**
  892.      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
  893.      * is the 2-sample Kolmogorov-Smirnov statistic. See
  894.      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
  895.      * <p>
  896.      * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
  897.      * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
  898.      * details on how convergence of the sum is determined.
  899.      * </p>
  900.      *
  901.      * @param d D-statistic value
  902.      * @param n first sample size
  903.      * @param m second sample size
  904.      * @return approximate probability that a randomly selected m-n partition of m + n generates
  905.      *         \(D_{n,m}\) greater than {@code d}
  906.      */
  907.     public double approximateP(double d, int n, int m) {
  908.         return 1 - ksSum(d * JdkMath.sqrt(((double) m * (double) n) / ((double) m + (double) n)),
  909.                          KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
  910.     }

  911.     /**
  912.      * Fills a boolean array randomly with a fixed number of {@code true} values.
  913.      * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
  914.      * By processing first the {@code true} values followed by the remaining {@code false} values
  915.      * less random numbers need to be generated. The method is optimized for the case
  916.      * that the number of {@code true} values is larger than or equal to the number of
  917.      * {@code false} values.
  918.      *
  919.      * @param b boolean array
  920.      * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
  921.      * @param rng random data generator
  922.      */
  923.     private static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final UniformRandomProvider rng) {
  924.         Arrays.fill(b, true);
  925.         for (int k = numberOfTrueValues; k < b.length; k++) {
  926.             final int r = rng.nextInt(k + 1);
  927.             b[(b[r]) ? r : k] = false;
  928.         }
  929.     }

  930.     /**
  931.      * Uses Monte Carlo simulation to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\) is the
  932.      * 2-sample Kolmogorov-Smirnov statistic. See
  933.      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
  934.      * <p>
  935.      * The simulation generates {@code iterations} random partitions of {@code m + n} into an
  936.      * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
  937.      * the proportion of values that are greater than {@code d}, or greater than or equal to
  938.      * {@code d} if {@code strict} is {@code false}.
  939.      * </p>
  940.      *
  941.      * @param d D-statistic value.
  942.      * @param n First sample size.
  943.      * @param m Second sample size.
  944.      * @param iterations Number of random partitions to generate.
  945.      * @param strict whether or not the probability to compute is expressed as a strict inequality
  946.      * @param rng RNG used for generating the partitions.
  947.      * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
  948.      * greater than (resp. greater than or equal to) {@code d}.
  949.      */
  950.     public double monteCarloP(final double d,
  951.                               final int n,
  952.                               final int m,
  953.                               final boolean strict,
  954.                               final int iterations,
  955.                               UniformRandomProvider rng) {
  956.         return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations, rng);
  957.     }

  958.     /**
  959.      * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d / (n * m))\)
  960.      * where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
  961.      * <p>
  962.      * Here {@code d} is the D-statistic represented as long value.
  963.      * The real D-statistic is obtained by dividing {@code d} by {@code n * m}.
  964.      * See also {@link #monteCarloP(double,int,int,boolean,int,UniformRandomProvider)}.
  965.      *
  966.      * @param d Integral D-statistic.
  967.      * @param n First sample size.
  968.      * @param m Second sample size.
  969.      * @param iterations Number of random partitions to generate.
  970.      * @param rng RNG used for generating the partitions.
  971.      * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
  972.      * greater than or equal to {@code d / (n * m))}.
  973.      */
  974.     private double integralMonteCarloP(final long d,
  975.                                        final int n,
  976.                                        final int m,
  977.                                        final int iterations,
  978.                                        UniformRandomProvider rng) {
  979.         // ensure that nn is always the max of (n, m) to require fewer random numbers
  980.         final int nn = JdkMath.max(n, m);
  981.         final int mm = JdkMath.min(n, m);
  982.         final int sum = nn + mm;

  983.         int tail = 0;
  984.         final boolean[] b = new boolean[sum];
  985.         for (int i = 0; i < iterations; i++) {
  986.             fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
  987.             long curD = 0L;
  988.             for(int j = 0; j < b.length; ++j) {
  989.                 if (b[j]) {
  990.                     curD += mm;
  991.                     if (curD >= d) {
  992.                         tail++;
  993.                         break;
  994.                     }
  995.                 } else {
  996.                     curD -= nn;
  997.                     if (curD <= -d) {
  998.                         tail++;
  999.                         break;
  1000.                     }
  1001.                 }
  1002.             }
  1003.         }
  1004.         return (double) tail / iterations;
  1005.     }

  1006.     /**
  1007.      * If there are no ties in the combined dataset formed from x and y,
  1008.      * this method is a no-op.
  1009.      * If there are ties, a uniform random deviate in
  1010.      * is added to each value in x and y, and this method overwrites the
  1011.      * data in x and y with the jittered values.
  1012.      *
  1013.      * @param x First sample.
  1014.      * @param y Second sample.
  1015.      * @throw NotANumberException if any of the input arrays contain
  1016.      * a NaN value.
  1017.      */
  1018.     private static void fixTies(double[] x, double[] y) {
  1019.         if (hasTies(x, y)) {
  1020.             // Add jitter using a fixed seed (so same arguments always give same results),
  1021.             // low-initialization-overhead generator.
  1022.             final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(876543217L);

  1023.             // It is theoretically possible that jitter does not break ties, so repeat
  1024.             // until all ties are gone.  Bound the loop and throw MIE if bound is exceeded.
  1025.             int fixedRangeTrials = 10;
  1026.             int jitterRange = 10;
  1027.             int ct = 0;
  1028.             boolean ties = true;
  1029.             do {
  1030.                 // Nudge the data using a fixed range.
  1031.                 jitter(x, rng, jitterRange);
  1032.                 jitter(y, rng, jitterRange);
  1033.                 ties = hasTies(x, y);
  1034.                 ++ct;
  1035.             } while (ties && ct < fixedRangeTrials);

  1036.             if (ties) {
  1037.                 throw new MaxCountExceededException(fixedRangeTrials);
  1038.             }
  1039.         }
  1040.     }

  1041.     /**
  1042.      * Returns true iff there are ties in the combined sample formed from
  1043.      * x and y.
  1044.      *
  1045.      * @param x First sample.
  1046.      * @param y Second sample.
  1047.      * @return true if x and y together contain ties.
  1048.      * @throw InsufficientDataException if the input arrays only contain infinite values.
  1049.      * @throw NotANumberException if any of the input arrays contain a NaN value.
  1050.      */
  1051.     private static boolean hasTies(double[] x, double[] y) {
  1052.        final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));

  1053.        // "unique" moves NaN to the head of the output array.
  1054.        if (Double.isNaN(values[0])) {
  1055.            throw new NotANumberException();
  1056.        }

  1057.        int infCount = 0;
  1058.        for (double v : values) {
  1059.            if (Double.isInfinite(v)) {
  1060.                ++infCount;
  1061.            }
  1062.        }
  1063.        if (infCount == values.length) {
  1064.            throw new InsufficientDataException();
  1065.        }

  1066.         return values.length != x.length + y.length;  // There are no ties.
  1067.     }

  1068.     /**
  1069.      * Adds random jitter to {@code data} using deviates sampled from {@code dist}.
  1070.      * <p>
  1071.      * Note that jitter is applied in-place - i.e., the array
  1072.      * values are overwritten with the result of applying jitter.</p>
  1073.      *
  1074.      * @param data input/output data array - entries overwritten by the method
  1075.      * @param rng probability distribution to sample for jitter values
  1076.      * @param ulp ulp used when generating random numbers
  1077.      * @throws NullPointerException if either of the parameters is null
  1078.      */
  1079.     private static void jitter(double[] data,
  1080.                                UniformRandomProvider rng,
  1081.                                int ulp) {
  1082.         final int range = ulp * 2;
  1083.         for (int i = 0; i < data.length; i++) {
  1084.             final double orig = data[i];
  1085.             if (Double.isInfinite(orig)) {
  1086.                 continue; // Avoid NaN creation.
  1087.             }

  1088.             final int rand = rng.nextInt(range) - ulp;
  1089.             final double ulps = Math.ulp(orig);
  1090.             final double r = rand * ulps;

  1091.             data[i] += r;
  1092.         }
  1093.     }

  1094.     /**
  1095.      * The function C(i, j) defined in [4] (class javadoc), formula (5.5).
  1096.      * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
  1097.      * and recoded as a long to avoid rounding errors in comparison tests, so what
  1098.      * is actually tested is |im - jn| <= cmn.
  1099.      *
  1100.      * @param i first path parameter
  1101.      * @param j second path paramter
  1102.      * @param m first sample size
  1103.      * @param n second sample size
  1104.      * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
  1105.      * @param strict whether or not the null hypothesis uses strict inequality
  1106.      * @return C(i,j) for given m, n, c
  1107.      */
  1108.     private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
  1109.         if (strict) {
  1110.             return JdkMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
  1111.         }
  1112.         return JdkMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
  1113.     }

  1114.     /**
  1115.      * The function N(i, j) defined in [4] (class javadoc).
  1116.      * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
  1117.      * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
  1118.      * The return value is integral, but subject to overflow, so it is maintained and
  1119.      * returned as a double.
  1120.      *
  1121.      * @param i first path parameter
  1122.      * @param j second path parameter
  1123.      * @param m first sample size
  1124.      * @param n second sample size
  1125.      * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
  1126.      * @param strict whether or not the null hypothesis uses strict inequality
  1127.      * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
  1128.      */
  1129.     private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
  1130.         /*
  1131.          * Unwind the recursive definition given in [4].
  1132.          * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
  1133.          * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
  1134.          */
  1135.         final double[] lag = new double[n];
  1136.         double last = 0;
  1137.         for (int k = 0; k < n; k++) {
  1138.             lag[k] = c(0, k + 1, m, n, cnm, strict);
  1139.         }
  1140.         for (int k = 1; k <= i; k++) {
  1141.             last = c(k, 0, m, n, cnm, strict);
  1142.             for (int l = 1; l <= j; l++) {
  1143.                 lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
  1144.                 last = lag[l - 1];
  1145.             }
  1146.         }
  1147.         return last;
  1148.     }
  1149. }