1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math4.legacy.stat.inference;
19  
20  import java.math.RoundingMode;
21  import java.util.Arrays;
22  
23  import org.apache.commons.rng.simple.RandomSource;
24  import org.apache.commons.rng.UniformRandomProvider;
25  import org.apache.commons.statistics.distribution.ContinuousDistribution;
26  import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
27  import org.apache.commons.numbers.fraction.BigFraction;
28  import org.apache.commons.numbers.field.BigFractionField;
29  import org.apache.commons.math4.legacy.distribution.EnumeratedRealDistribution;
30  import org.apache.commons.math4.legacy.distribution.AbstractRealDistribution;
31  import org.apache.commons.math4.legacy.exception.InsufficientDataException;
32  import org.apache.commons.math4.legacy.exception.MathArithmeticException;
33  import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
34  import org.apache.commons.math4.legacy.exception.NullArgumentException;
35  import org.apache.commons.math4.legacy.exception.NumberIsTooLargeException;
36  import org.apache.commons.math4.legacy.exception.OutOfRangeException;
37  import org.apache.commons.math4.legacy.exception.TooManyIterationsException;
38  import org.apache.commons.math4.legacy.exception.NotANumberException;
39  import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
40  import org.apache.commons.math4.legacy.linear.MatrixUtils;
41  import org.apache.commons.math4.legacy.linear.RealMatrix;
42  import org.apache.commons.math4.core.jdkmath.JdkMath;
43  import org.apache.commons.math4.legacy.core.MathArrays;
44  import org.apache.commons.math4.legacy.field.linalg.FieldDenseMatrix;
45  
46  /**
47   * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
48   * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
49   * <p>
50   * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
51   * sample data points from the distribution expected under the null hypothesis. For one-sample tests
52   * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
53   * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
54   * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
55   * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
56   * given in [2].
57   * </p>
58   * <p>
59   * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
60   * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
61   * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
62   * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
63   * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
64   * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])}, works as
65   * follows:
66   * <ul>
67   * <li>When the product of the sample sizes is less than 10000, the method presented in [4]
68   * is used to compute the exact p-value for the 2-sample test.</li>
69   * <li>When the product of the sample sizes is larger, the asymptotic
70   * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
71   * the approximation.</li>
72   * </ul><p>
73   * For small samples (former case), if the data contains ties, random jitter is added
74   * to the sample data to break ties before applying the algorithm above. Alternatively,
75   * the {@link #bootstrap(double[],double[],int,boolean,UniformRandomProvider)}
76   * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
77   * in the R Matching package [3], can be used if ties are known to be present in the data.
78   * </p>
79   * <p>
80   * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
81   * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} &gt; d \)
82   * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
83   * {@code strict} parameter. This parameter is ignored for large samples.
84   * </p>
85   * <p>
86   * The methods used by the 2-sample default implementation are also exposed directly:
87   * <ul>
88   * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
89   * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution.</li>
90   * </ul>
91   * The {@code boolean} argument of {@link #exactP(double, int, int, boolean)} allows the probability
92   * used to estimate the p-value to be expressed using strict or non-strict inequality. See
93   * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
94   * <p>
95   * References:
96   * <ul>
97   * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
98   * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
99   * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
100  * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
101  * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
102  * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
103  * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
104  * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
105  * Chapter 5, 3rd Ed. Academic Press.</li>
106  * </ul>
107  * <br>
108  * Note that [1] contains an error in computing h, refer to <a
109  * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
110  *
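     * <p>
     * As an illustrative sketch (the sample values below are arbitrary placeholders, not taken
     * from any reference), a typical two-sample test looks like this:
     * </p>
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * double[] x = {0.52, 1.13, 1.75, 2.30, 2.94};
     * double[] y = {0.41, 0.97, 1.36, 1.88, 2.51};
     * double d = test.kolmogorovSmirnovStatistic(x, y); // D_{n,m}
     * double p = test.kolmogorovSmirnovTest(x, y);      // p-value, strict inequality by default
     * }</pre>
     *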
111  * @since 3.3
112  */
113 public class KolmogorovSmirnovTest {
114     /** pi^2. */
115     private static final double PI_SQUARED = JdkMath.PI * JdkMath.PI;
116     /**
117      * Bound on the number of partial sums in {@link #ksSum(double, double, int)}.
118      */
119     private static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
120     /** Convergence criterion for {@link #ksSum(double, double, int)}. */
121     private static final double KS_SUM_CAUCHY_CRITERION = 1e-20;
122     /** Convergence criterion for the sums in {@link #pelzGood(double, int)}. */
123     private static final double PG_SUM_RELATIVE_ERROR = 1e-10;
124     /** 1/2. */
125     private static final BigFraction ONE_HALF = BigFraction.of(1, 2);
126 
127     /**
128      * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
129      * distribution to compute the p-value.
130      */
131     private static final int LARGE_SAMPLE_PRODUCT = 10000;
132 
133     /**
134      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
135      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
136      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
137      * {@code exact} is true, the distribution used to compute the p-value is computed using
138      * extended precision. See {@link #cdfExact(double, int)}.
139      *
140      * @param distribution reference distribution
141      * @param data sample being evaluated
142      * @param exact whether or not to force exact computation of the p-value
143      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
144      *         {@code distribution}
145      * @throws InsufficientDataException if {@code data} does not have length at least 2
146      * @throws NullArgumentException if {@code data} is null
147      */
148     public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, boolean exact) {
149         return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
150     }
151 
152     /**
153      * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
154      * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
155      * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
156      * each of the values in {@code data}.
157      *
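         * <p>
         * A minimal sketch, assuming {@code dist} is some {@link ContinuousDistribution} instance
         * (how it is obtained depends on the Commons Statistics version in use) and {@code data}
         * is the caller's sample:
         * </p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double dn = test.kolmogorovSmirnovStatistic(dist, data); // D_n
         * double p  = test.kolmogorovSmirnovTest(dist, data);      // one-sample p-value
         * }</pre>
         *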
158      * @param distribution reference distribution
159      * @param data sample being evaluated
160      * @return Kolmogorov-Smirnov statistic \(D_n\)
161      * @throws InsufficientDataException if {@code data} does not have length at least 2
162      * @throws NullArgumentException if {@code data} is null
163      */
164     public double kolmogorovSmirnovStatistic(ContinuousDistribution distribution, double[] data) {
165         checkArray(data);
166         final int n = data.length;
167         final double nd = n;
168         final double[] dataCopy = new double[n];
169         System.arraycopy(data, 0, dataCopy, 0, n);
170         Arrays.sort(dataCopy);
171         double d = 0d;
172         for (int i = 1; i <= n; i++) {
173             final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
174             final double currD = JdkMath.max(yi - (i - 1) / nd, i / nd - yi);
175             if (currD > d) {
176                 d = currD;
177             }
178         }
179         return d;
180     }
181 
182     /**
183      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
184      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
185      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
186      * probability distribution. Specifically, what is returned is an estimate of the probability
187      * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
188      * selected partition of the combined sample into subsamples of sizes {@code x.length} and
189      * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
190      * large as (if {@code strict} is {@code false}) {@code kolmogorovSmirnovStatistic(x, y)}.
191      *
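         * <p>
         * A small sketch of the effect of {@code strict} (placeholder data with no ties; the
         * sample-size product is small, so the exact distribution is used):
         * </p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double[] x = {0.5, 1.4, 2.3, 3.1};
         * double[] y = {0.9, 1.7, 2.8};
         * double pStrict    = test.kolmogorovSmirnovTest(x, y, true);  // estimates P(D > d)
         * double pNonStrict = test.kolmogorovSmirnovTest(x, y, false); // estimates P(D >= d)
         * }</pre>
         *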
192      * @param x first sample dataset.
193      * @param y second sample dataset.
194      * @param strict whether or not the probability to compute is expressed as
195      * a strict inequality (ignored for large samples).
196      * @return p-value associated with the null hypothesis that {@code x} and
197      * {@code y} represent samples from the same distribution.
198      * @throws InsufficientDataException if either {@code x} or {@code y} does
199      * not have length at least 2.
200      * @throws NullArgumentException if either {@code x} or {@code y} is null.
201      * @throws NotANumberException if the input arrays contain NaN values.
202      *
203      * @see #bootstrap(double[],double[],int,boolean,UniformRandomProvider)
204      */
205     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
206         final long lengthProduct = (long) x.length * y.length;
207         double[] xa = null;
208         double[] ya = null;
209         if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
210             xa = Arrays.copyOf(x, x.length);
211             ya = Arrays.copyOf(y, y.length);
212             fixTies(xa, ya);
213         } else {
214             xa = x;
215             ya = y;
216         }
217         if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
218             return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
219         }
220         return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
221     }
222 
223     /**
224      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
225      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
226      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
227      * probability distribution. Assumes the strict form of the inequality used to compute the
228      * p-value. See {@link #kolmogorovSmirnovTest(ContinuousDistribution, double[], boolean)}.
229      *
230      * @param x first sample dataset
231      * @param y second sample dataset
232      * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
233      *         samples from the same distribution
234      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
235      *         least 2
236      * @throws NullArgumentException if either {@code x} or {@code y} is null
237      */
238     public double kolmogorovSmirnovTest(double[] x, double[] y) {
239         return kolmogorovSmirnovTest(x, y, true);
240     }
241 
242     /**
243      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
244      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
245      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
246      * is the empirical distribution of the {@code y} values.
247      *
248      * @param x first sample
249      * @param y second sample
250      * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
251      *         {@code y} represent samples from the same underlying distribution
252      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
253      *         least 2
254      * @throws NullArgumentException if either {@code x} or {@code y} is null
255      */
256     public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
257         return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
258     }
259 
260     /**
261      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
262      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
263      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
264      * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
265      * as a long value.
266      *
267      * @param x first sample
268      * @param y second sample
269      * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
270      *         {@code y} represent samples from the same underlying distribution
271      * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
272      *         least 2
273      * @throws NullArgumentException if either {@code x} or {@code y} is null
274      */
275     private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
276         checkArray(x);
277         checkArray(y);
278         // Copy and sort the sample arrays
279         final double[] sx = Arrays.copyOf(x, x.length);
280         final double[] sy = Arrays.copyOf(y, y.length);
281         Arrays.sort(sx);
282         Arrays.sort(sy);
283         final int n = sx.length;
284         final int m = sy.length;
285 
286         int rankX = 0;
287         int rankY = 0;
288         long curD = 0L;
289 
290         // Find the max difference between cdf_x and cdf_y
291         long supD = 0L;
292         do {
293             double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
294             while(rankX < n && Double.compare(sx[rankX], z) == 0) {
295                 rankX += 1;
296                 curD += m;
297             }
298             while(rankY < m && Double.compare(sy[rankY], z) == 0) {
299                 rankY += 1;
300                 curD -= n;
301             }
302             if (curD > supD) {
303                 supD = curD;
304             } else if (-curD > supD) {
305                 supD = -curD;
306             }
307         } while(rankX < n && rankY < m);
308         return supD;
309     }
310 
311     /**
312      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
313      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
314      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
315      *
316      * @param distribution reference distribution
317      * @param data sample being evaluated
318      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
319      *         {@code distribution}
320      * @throws InsufficientDataException if {@code data} does not have length at least 2
321      * @throws NullArgumentException if {@code data} is null
322      */
323     public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data) {
324         return kolmogorovSmirnovTest(distribution, data, false);
325     }
326 
327     /**
328      * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
329      * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
330      *
331      * @param distribution reference distribution
332      * @param data sample being evaluated
333      * @param alpha significance level of the test
334      * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
335      *         can be rejected with confidence 1 - {@code alpha}
336      * @throws InsufficientDataException if {@code data} does not have length at least 2
337      * @throws NullArgumentException if {@code data} is null
338      */
339     public boolean kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, double alpha) {
340         if (alpha <= 0 || alpha > 0.5) {
341             throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
342         }
343         return kolmogorovSmirnovTest(distribution, data) < alpha;
344     }
345 
346     /**
347      * Estimates the <i>p-value</i> of a two-sample
348      * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov test</a>
349      * evaluating the null hypothesis that {@code x} and {@code y} are samples
350      * drawn from the same probability distribution.
351      * This method estimates the p-value by repeatedly sampling sets of size
352      * {@code x.length} and {@code y.length} from the empirical distribution
353      * of the combined sample.
354      * When {@code strict} is true, this is equivalent to the algorithm implemented
355      * in the R function {@code ks.boot}, described in <pre>
356      * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
357      * Software with Automated Balance Optimization: The Matching package for R.'
358      * Journal of Statistical Software, 42(7): 1-52.
359      * </pre>
360      *
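         * <p>
         * A minimal sketch (the generator, seed and iteration count below are arbitrary choices,
         * and {@code x} and {@code y} are the caller's samples):
         * </p>
         * <pre>{@code
         * UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(12345L);
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double p = test.bootstrap(x, y, 10000, true, rng);
         * }</pre>
         *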
361      * @param x First sample.
362      * @param y Second sample.
363      * @param iterations Number of bootstrap resampling iterations.
364      * @param strict Whether or not the null hypothesis is expressed as a strict inequality.
365      * @param rng RNG for creating the sampling sets.
366      * @return the estimated p-value.
367      */
368     public double bootstrap(double[] x,
369                             double[] y,
370                             int iterations,
371                             boolean strict,
372                             UniformRandomProvider rng) {
373         final int xLength = x.length;
374         final int yLength = y.length;
375         final double[] combined = new double[xLength + yLength];
376         System.arraycopy(x, 0, combined, 0, xLength);
377         System.arraycopy(y, 0, combined, xLength, yLength);
378         final ContinuousDistribution.Sampler sampler = new EnumeratedRealDistribution(combined).createSampler(rng);
379         final long d = integralKolmogorovSmirnovStatistic(x, y);
380         int greaterCount = 0;
381         int equalCount = 0;
382         double[] curX;
383         double[] curY;
384         long curD;
385         for (int i = 0; i < iterations; i++) {
386             curX = AbstractRealDistribution.sample(xLength, sampler);
387             curY = AbstractRealDistribution.sample(yLength, sampler);
388             curD = integralKolmogorovSmirnovStatistic(curX, curY);
389             if (curD > d) {
390                 greaterCount++;
391             } else if (curD == d) {
392                 equalCount++;
393             }
394         }
395         return strict ? greaterCount / (double) iterations :
396             (greaterCount + equalCount) / (double) iterations;
397     }
398 
399     /**
400      * Calculates \(P(D_n &lt; d)\) using the method described in [1] with quick decisions for extreme
401      * values given in [2] (see above). The result is not exact as with
402      * {@link #cdfExact(double, int)} because calculations are based on
403      * {@code double} rather than {@link BigFraction}.
404      *
405      * @param d statistic
406      * @param n sample size
407      * @return \(P(D_n &lt; d)\)
408      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
409      * {@link BigFraction} in expressing {@code d} as
410      * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
411      */
412     public double cdf(double d, int n) {
413         return cdf(d, n, false);
414     }
415 
416     /**
417      * Calculates {@code P(D_n < d)}. The result is exact in the sense that {@link BigFraction} is
418      * used everywhere at the expense of very slow execution time. Almost never choose this in real
419      * applications unless you are very sure; this is almost solely for verification purposes.
420      * Normally, you would choose {@link #cdf(double, int)}. See the class
421      * javadoc for definitions and algorithm description.
422      *
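         * <p>
         * For instance, a sketch comparing the two evaluations for a small sample size (the
         * argument values are arbitrary; the point is only that both calls compute the same
         * quantity, one with doubles and one with {@link BigFraction}):
         * </p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double fast  = test.cdf(0.274, 10);      // double-based computation
         * double exact = test.cdfExact(0.274, 10); // extended-precision computation
         * }</pre>
         *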
423      * @param d statistic
424      * @param n sample size
425      * @return \(P(D_n &lt; d)\)
426      * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
427      * {@link BigFraction} in expressing {@code d} as
428      * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
429      */
430     public double cdfExact(double d, int n) {
431         return cdf(d, n, true);
432     }
433 
434     /**
435      * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme
436      * values given in [2] (see above).
437      *
438      * @param d statistic
439      * @param n sample size
440      * @param exact whether the probability should be calculated exactly using
441      *        {@link BigFraction} everywhere at the expense of
442      *        very slow execution time, or if {@code double} should be used in convenient places to
443      *        gain speed. Almost never choose {@code true} in real applications unless you are very
444      *        sure; {@code true} is almost solely for verification purposes.
445      * @return \(P(D_n &lt; d)\)
446      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
447      * {@link BigFraction} in expressing {@code d} as
448      * \((k - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
449      */
450     public double cdf(double d, int n, boolean exact) {
451         final double ninv = 1 / ((double) n);
452         final double ninvhalf = 0.5 * ninv;
453 
454         if (d <= ninvhalf) {
455             return 0;
456         } else if (ninvhalf < d && d <= ninv) {
457             double res = 1;
458             final double f = 2 * d - ninv;
459             // n! f^n = n*f * (n-1)*f * ... * 1*f
460             for (int i = 1; i <= n; ++i) {
461                 res *= i * f;
462             }
463             return res;
464         } else if (1 - ninv <= d && d < 1) {
465             return 1 - 2 * Math.pow(1 - d, n);
466         } else if (1 <= d) {
467             return 1;
468         }
469         if (exact) {
470             return exactK(d, n);
471         }
472         if (n <= 140) {
473             return roundedK(d, n);
474         }
475         return pelzGood(d, n);
476     }
477 
478     /**
479      * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
480      * in class javadoc above) and {@link BigFraction} (see above).
481      *
482      * @param d statistic
483      * @param n sample size
484      * @return the two-sided probability of \(P(D_n &lt; d)\)
485      * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
486      * {@link BigFraction}.
487      */
488     private double exactK(double d, int n) {
489         final int k = (int) Math.ceil(n * d);
490 
491         final FieldDenseMatrix<BigFraction> h;
492         try {
493             h = createExactH(d, n);
494         } catch (ArithmeticException e) {
495             throw new MathArithmeticException(LocalizedFormats.FRACTION);
496         }
497 
498         final FieldDenseMatrix<BigFraction> hPower = h.pow(n);
499 
500         BigFraction pFrac = hPower.get(k - 1, k - 1);
501 
502         for (int i = 1; i <= n; ++i) {
503             pFrac = pFrac.multiply(i).divide(n);
504         }
505 
506         /*
507          * BigFraction.doubleValue converts the numerator and the denominator to double separately
508          * and divides afterwards. That easily produces NaN. The conversion below does not (the scale
509          * is the number of digits):
510          */
511         return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue();
512     }
513 
514     /**
515      * Calculates {@code P(D_n < d)} using the method described in [1] and doubles (see above).
516      *
517      * @param d statistic
518      * @param n sample size
519      * @return \(P(D_n &lt; d)\)
520      */
521     private double roundedK(double d, int n) {
522 
523         final int k = (int) Math.ceil(n * d);
524         final RealMatrix h = this.createRoundedH(d, n);
525         final RealMatrix hPower = h.power(n);
526 
527         double pFrac = hPower.getEntry(k - 1, k - 1);
528         for (int i = 1; i <= n; ++i) {
529             pFrac *= (double) i / (double) n;
530         }
531 
532         return pFrac;
533     }
534 
535     /**
536      * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
537      *
538      * @param d value of d-statistic (x in [2])
539      * @param n sample size
540      * @return \(P(D_n &lt; d)\)
541      * @since 3.4
542      */
543     public double pelzGood(double d, int n) {
544         // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
545         final double sqrtN = JdkMath.sqrt(n);
546         final double z = d * sqrtN;
547         final double z2 = d * d * n;
548         final double z4 = z2 * z2;
549         final double z6 = z4 * z2;
550         final double z8 = z4 * z4;
551 
552         // Eventual return value
553         double ret = 0;
554 
555         // Compute K_0(z)
556         double sum = 0;
557         double increment = 0;
558         double kTerm = 0;
559         double z2Term = PI_SQUARED / (8 * z2);
560         int k = 1;
561         for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
562             kTerm = 2 * k - 1;
563             increment = JdkMath.exp(-z2Term * kTerm * kTerm);
564             sum += increment;
565             if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
566                 break;
567             }
568         }
569         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
570             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
571         }
572         ret = sum * JdkMath.sqrt(2 * JdkMath.PI) / z;
573 
574         // K_1(z)
575         // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
576         // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
577         final double twoZ2 = 2 * z2;
578         sum = 0;
579         kTerm = 0;
580         double kTerm2 = 0;
581         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
582             kTerm = k + 0.5;
583             kTerm2 = kTerm * kTerm;
584             increment = (PI_SQUARED * kTerm2 - z2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
585             sum += increment;
586             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
587                 break;
588             }
589         }
590         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
591             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
592         }
593         final double sqrtHalfPi = JdkMath.sqrt(JdkMath.PI / 2);
594         // Instead of doubling sum, divide by 3 instead of 6
595         ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
596 
597         // K_2(z)
598         // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
599         final double z4Term = 2 * z4;
600         final double z6Term = 6 * z6;
601         z2Term = 5 * z2;
602         final double pi4 = PI_SQUARED * PI_SQUARED;
603         sum = 0;
604         kTerm = 0;
605         kTerm2 = 0;
606         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
607             kTerm = k + 0.5;
608             kTerm2 = kTerm * kTerm;
609             increment =  (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
610                     pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
611             sum += increment;
612             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
613                 break;
614             }
615         }
616         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
617             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
618         }
619         double sum2 = 0;
620         kTerm2 = 0;
621         for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
622             kTerm2 = (double) k * k;
623             increment = PI_SQUARED * kTerm2 * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
624             sum2 += increment;
625             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
626                 break;
627             }
628         }
629         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
630             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
631         }
632         // Again, adjust coefficients instead of doubling sum, sum2
633         ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
634 
635         // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
636         // Multiply coefficient denominators by 2, so omit doubling sums.
637         final double pi6 = pi4 * PI_SQUARED;
638         sum = 0;
639         double kTerm4 = 0;
640         double kTerm6 = 0;
641         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
642             kTerm = k + 0.5;
643             kTerm2 = kTerm * kTerm;
644             kTerm4 = kTerm2 * kTerm2;
645             kTerm6 = kTerm4 * kTerm2;
646             increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
647                             PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
648                     JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
649             sum += increment;
650             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
651                 break;
652             }
653         }
654         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
655             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
656         }
657         sum2 = 0;
658         for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
659             kTerm2 = (double) k * k;
660             kTerm4 = kTerm2 * kTerm2;
661             increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
662                     JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
663             sum2 += increment;
664             if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
665                 break;
666             }
667         }
668         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
669             throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
670         }
671         return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
672                 sum2 / (108 * z6));
673     }
674 
675     /**
676      * Creates {@code H} of size {@code m x m} as described in [1] (see above).
677      *
678      * @param d statistic
679      * @param n sample size
680      * @return H matrix
681      * @throws NumberIsTooLargeException if fractional part is greater than 1.
682      * @throws ArithmeticException if algorithm fails to convert {@code h} to a
683      * {@link BigFraction}.
684      */
685     private FieldDenseMatrix<BigFraction> createExactH(double d,
686                                                        int n) {
687         final int k = (int) Math.ceil(n * d);
688         final int m = 2 * k - 1;
689         final double hDouble = k - n * d;
690         if (hDouble >= 1) {
691             throw new NumberIsTooLargeException(hDouble, 1.0, false);
692         }
693         BigFraction h = null;
694         try {
695             h = BigFraction.from(hDouble, 1e-20, 10000);
696         } catch (final ArithmeticException e1) {
697             try {
698                 h = BigFraction.from(hDouble, 1e-10, 10000);
699             } catch (final ArithmeticException e2) {
700                 h = BigFraction.from(hDouble, 1e-5, 10000);
701             }
702         }
703         final FieldDenseMatrix<BigFraction> hData = FieldDenseMatrix.create(BigFractionField.get(), m, m);
704 
705         /*
706          * Start by filling everything with either 0 or 1.
707          */
708         for (int i = 0; i < m; ++i) {
709             for (int j = 0; j < m; ++j) {
710                 if (i - j + 1 < 0) {
711                     hData.set(i, j, BigFraction.ZERO);
712                 } else {
713                     hData.set(i, j, BigFraction.ONE);
714                 }
715             }
716         }
717 
718         /*
719          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
720          * hPowers[m-1] = h^m
721          */
722         final BigFraction[] hPowers = new BigFraction[m];
723         hPowers[0] = h;
724         for (int i = 1; i < m; i++) {
725             hPowers[i] = h.multiply(hPowers[i - 1]);
726         }
727 
728         /*
729          * First column and last row has special values (each other reversed).
730          */
731         for (int i = 0; i < m; ++i) {
732             hData.set(i, 0,
733                       hData.get(i, 0).subtract(hPowers[i]));
734             hData.set(m - 1, i,
735                       hData.get(m - 1, i).subtract(hPowers[m - i - 1]));
736         }
737 
738         /*
739          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
740          * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
741          */
742         if (h.compareTo(ONE_HALF) > 0) {
743             hData.set(m - 1, 0,
744                       hData.get(m - 1, 0).add(h.multiply(2).subtract(1).pow(m)));
745         }
746 
747         /*
748          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
749          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
750          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
751          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> j < i + 1, so the inner
752          * loop over j only needs to run up to i instead of all the way to m. Also note that g starts
753          * at 2 because dividing by 1 isn't really necessary.
754          */
755         for (int i = 0; i < m; ++i) {
756             for (int j = 0; j < i + 1; ++j) {
757                 if (i - j + 1 > 0) {
758                     for (int g = 2; g <= i - j + 1; ++g) {
759                         hData.set(i, j,
760                                   hData.get(i, j).divide(g));
761                     }
762                 }
763             }
764         }
765         return hData;
766     }
767 
768     /***
769      * Creates {@code H} of size {@code m x m} as described in [1] (see above)
770      * using double-precision.
771      *
772      * @param d statistic
773      * @param n sample size
774      * @return H matrix
775      * @throws NumberIsTooLargeException if fractional part is greater than 1
776      */
777     private RealMatrix createRoundedH(double d, int n)
778         throws NumberIsTooLargeException {
779 
780         final int k = (int) Math.ceil(n * d);
781         final int m = 2 * k - 1;
782         final double h = k - n * d;
783         if (h >= 1) {
784             throw new NumberIsTooLargeException(h, 1.0, false);
785         }
786         final double[][] hData = new double[m][m];
787 
788         /*
789          * Start by filling everything with either 0 or 1.
790          */
791         for (int i = 0; i < m; ++i) {
792             for (int j = 0; j < m; ++j) {
793                 if (i - j + 1 < 0) {
794                     hData[i][j] = 0;
795                 } else {
796                     hData[i][j] = 1;
797                 }
798             }
799         }
800 
801         /*
802          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
803          * hPowers[m-1] = h^m
804          */
805         final double[] hPowers = new double[m];
806         hPowers[0] = h;
807         for (int i = 1; i < m; ++i) {
808             hPowers[i] = h * hPowers[i - 1];
809         }
810 
811         /*
812          * First column and last row has special values (each other reversed).
813          */
814         for (int i = 0; i < m; ++i) {
815             hData[i][0] = hData[i][0] - hPowers[i];
816             hData[m - 1][i] -= hPowers[m - i - 1];
817         }
818 
819         /*
820          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
821          * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
822          */
823         if (Double.compare(h, 0.5) > 0) {
824             hData[m - 1][0] += JdkMath.pow(2 * h - 1, m);
825         }
826 
827         /*
828          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
829          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
830          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
831          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> j < i + 1, so the inner
832          * loop over j only needs to run up to i instead of all the way to m. Also note that g starts
833          * at 2 because dividing by 1 isn't really necessary.
834          */
835         for (int i = 0; i < m; ++i) {
836             for (int j = 0; j < i + 1; ++j) {
837                 if (i - j + 1 > 0) {
838                     for (int g = 2; g <= i - j + 1; ++g) {
839                         hData[i][j] /= g;
840                     }
841                 }
842             }
843         }
844         return MatrixUtils.createRealMatrix(hData);
845     }
846 
847     /**
848      * Verifies that {@code array} has length at least 2.
849      *
850      * @param array array to test
851      * @throws NullArgumentException if array is null
852      * @throws InsufficientDataException if array is too short
853      */
854     private void checkArray(double[] array) {
855         if (array == null) {
856             throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
857         }
858         if (array.length < 2) {
859             throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
860                                                 2);
861         }
862     }
863 
864     /**
865      * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
866      * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
867      * have been computed. If the sum does not converge before {@code maxIterations} iterations a
868      * {@link TooManyIterationsException} is thrown.
869      *
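         * <p>
         * For example (a sketch; the tolerance and iteration bound shown mirror the values this
         * class uses internally), the limiting Kolmogorov distribution can be evaluated as:
         * </p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double k = test.ksSum(1.0, 1e-20, 100000); // K(1.0) = 1 + 2 * sum_{i>=1} (-1)^i e^{-2 i^2}
         * }</pre>
         *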
870      * @param t argument
871      * @param tolerance Cauchy criterion for partial sums
872      * @param maxIterations maximum number of partial sums to compute
873      * @return Kolmogorov sum evaluated at t
874      * @throws TooManyIterationsException if the series does not converge
875      */
876     public double ksSum(double t, double tolerance, int maxIterations) {
877         if (t == 0.0) {
878             return 0.0;
879         }
880 
881         // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
882         // from class javadoc should be used.
883 
884         final double x = -2 * t * t;
885         int sign = -1;
886         long i = 1;
887         double partialSum = 0.5d;
888         double delta = 1;
889         while (delta > tolerance && i < maxIterations) {
890             delta = JdkMath.exp(x * i * i);
891             partialSum += sign * delta;
892             sign *= -1;
893             i++;
894         }
895         if (i == maxIterations) {
896             throw new TooManyIterationsException(maxIterations);
897         }
898         return partialSum * 2;
899     }
900 
901     /**
902      * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
903      * calculates an integral d-statistic in the range [0, n*m] that can be used for
904      * comparison with other integral d-statistics. Depending on whether {@code strict} is
905      * {@code true} or not, the returned value divided by (n*m) is strictly greater than
906      * (resp. greater than or equal to) the given d value (allowing some tolerance).
907      *
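         * <p>
         * Worked example (assuming the tolerance does not come into play): for {@code d = 0.5},
         * {@code n = 4}, {@code m = 3}, the scaled statistic is 0.5 * 12 = 6, so the method
         * returns 6 when {@code strict} is {@code false} and 7 when {@code strict} is {@code true}.
         * </p>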
908      * @param d a d-statistic in the range [0, 1]
909      * @param n first sample size
910      * @param m second sample size
911      * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
912      * @return the integral d-statistic in the range [0, n*m]
913      */
914     private static long calculateIntegralD(double d, int n, int m, boolean strict) {
915         final double tol = 1e-12;  // d-values within tol of one another are considered equal
916         long nm = n * (long)m;
917         long upperBound = (long)JdkMath.ceil((d - tol) * nm);
918         long lowerBound = (long)JdkMath.floor((d + tol) * nm);
919         if (strict && lowerBound == upperBound) {
920             return upperBound + 1L;
921         } else {
922             return upperBound;
923         }
924     }
925 
926     /**
927      * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
928      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
929      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
930      * <p>
931      * The returned probability is exact, implemented by unwinding the recursive function
932      * definitions presented in [4] (class javadoc).
933      * </p>
934      *
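         * <p>
         * A small sketch (the arguments are placeholders; {@code d} would normally come from
         * {@link #kolmogorovSmirnovStatistic(double[], double[])}):
         * </p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double p = test.exactP(0.5, 4, 6, true); // P(D_{4,6} > 0.5)
         * }</pre>
         *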
935      * @param d D-statistic value
936      * @param n first sample size
937      * @param m second sample size
938      * @param strict whether or not the probability to compute is expressed as a strict inequality
939      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
940      *         greater than (resp. greater than or equal to) {@code d}
941      */
942     public double exactP(double d, int n, int m, boolean strict) {
943        return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
944            BinomialCoefficientDouble.value(n + m, m);
945     }
946 
947     /**
948      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
949      * is the 2-sample Kolmogorov-Smirnov statistic. See
950      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
951      * <p>
952      * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
953      * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
954      * details on how convergence of the sum is determined.
955      * </p>
956      *
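         * <p>
         * Equivalently, as a sketch restating the formula above in code:
         * </p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * int n = 100;
         * int m = 120;
         * double d = 0.15;
         * // Same quantity as test.approximateP(d, n, m):
         * double p = 1 - test.ksSum(d * Math.sqrt((double) m * n / (m + n)), 1e-20, 100000);
         * }</pre>
         *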
957      * @param d D-statistic value
958      * @param n first sample size
959      * @param m second sample size
960      * @return approximate probability that a randomly selected m-n partition of m + n generates
961      *         \(D_{n,m}\) greater than {@code d}
962      */
963     public double approximateP(double d, int n, int m) {
964         return 1 - ksSum(d * JdkMath.sqrt(((double) m * (double) n) / ((double) m + (double) n)),
965                          KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
966     }
967 
968     /**
969      * Fills a boolean array randomly with a fixed number of {@code true} values.
970      * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
971      * By processing the {@code true} values first, followed by the remaining {@code false} values,
972      * fewer random numbers need to be generated. The method is optimized for the case
973      * that the number of {@code true} values is larger than or equal to the number of
974      * {@code false} values.
975      *
976      * @param b boolean array
977      * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
978      * @param rng random data generator
979      */
980     private static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final UniformRandomProvider rng) {
981         Arrays.fill(b, true);
982         for (int k = numberOfTrueValues; k < b.length; k++) {
983             final int r = rng.nextInt(k + 1);
984             b[(b[r]) ? r : k] = false;
985         }
986     }
987 
988     /**
989      * Uses Monte Carlo simulation to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\) is the
990      * 2-sample Kolmogorov-Smirnov statistic. See
991      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
992      * <p>
993      * The simulation generates {@code iterations} random partitions of {@code m + n} into an
994      * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
995      * the proportion of values that are greater than {@code d}, or greater than or equal to
996      * {@code d} if {@code strict} is {@code false}.
997      * </p>
998      *
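         * <p>
         * A minimal sketch (generator, seed, statistic value and iteration count are arbitrary
         * choices):
         * </p>
         * <pre>{@code
         * UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(42L);
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         * double p = test.monteCarloP(0.5, 4, 6, true, 100000, rng);
         * }</pre>
         *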
999      * @param d D-statistic value.
1000      * @param n First sample size.
1001      * @param m Second sample size.
1002      * @param iterations Number of random partitions to generate.
1003      * @param strict whether or not the probability to compute is expressed as a strict inequality
1004      * @param rng RNG used for generating the partitions.
1005      * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1006      * greater than (resp. greater than or equal to) {@code d}.
1007      */
1008     public double monteCarloP(final double d,
1009                               final int n,
1010                               final int m,
1011                               final boolean strict,
1012                               final int iterations,
1013                               UniformRandomProvider rng) {
1014         return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations, rng);
1015     }
1016 
1017     /**
1018      * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d / (n * m))\)
1019      * where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
1020      * <p>
1021      * Here {@code d} is the D-statistic represented as long value.
1022      * The real D-statistic is obtained by dividing {@code d} by {@code n * m}.
1023      * See also {@link #monteCarloP(double,int,int,boolean,int,UniformRandomProvider)}.
1024      *
1025      * @param d Integral D-statistic.
1026      * @param n First sample size.
1027      * @param m Second sample size.
1028      * @param iterations Number of random partitions to generate.
1029      * @param rng RNG used for generating the partitions.
1030      * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1031      * greater than or equal to {@code d / (n * m)}.
1032      */
1033     private double integralMonteCarloP(final long d,
1034                                        final int n,
1035                                        final int m,
1036                                        final int iterations,
1037                                        UniformRandomProvider rng) {
1038         // ensure that nn is always the max of (n, m) to require fewer random numbers
1039         final int nn = JdkMath.max(n, m);
1040         final int mm = JdkMath.min(n, m);
1041         final int sum = nn + mm;
1042 
1043         int tail = 0;
1044         final boolean[] b = new boolean[sum];
1045         for (int i = 0; i < iterations; i++) {
1046             fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
1047             long curD = 0L;
1048             for(int j = 0; j < b.length; ++j) {
1049                 if (b[j]) {
1050                     curD += mm;
1051                     if (curD >= d) {
1052                         tail++;
1053                         break;
1054                     }
1055                 } else {
1056                     curD -= nn;
1057                     if (curD <= -d) {
1058                         tail++;
1059                         break;
1060                     }
1061                 }
1062             }
1063         }
1064         return (double) tail / iterations;
1065     }
1066 
1067     /**
1068      * If there are no ties in the combined dataset formed from x and y,
1069      * this method is a no-op.
1070      * If there are ties, random jitter is added
1071      * to each value in x and y, and this method overwrites the
1072      * data in x and y with the jittered values.
1073      *
1074      * @param x First sample.
1075      * @param y Second sample.
1076      * @throws NotANumberException if any of the input arrays contain
1077      * a NaN value.
1078      */
1079     private static void fixTies(double[] x, double[] y) {
1080         if (hasTies(x, y)) {
1081             // Add jitter using a fixed seed (so same arguments always give same results),
1082             // low-initialization-overhead generator.
1083             final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(876543217L);
1084 
1085             // It is theoretically possible that jitter does not break ties, so repeat
1086             // until all ties are gone.  Bound the loop and throw MIE if bound is exceeded.
1087             int fixedRangeTrials = 10;
1088             int jitterRange = 10;
1089             int ct = 0;
1090             boolean ties = true;
1091             do {
1092                 // Nudge the data using a fixed range.
1093                 jitter(x, rng, jitterRange);
1094                 jitter(y, rng, jitterRange);
1095                 ties = hasTies(x, y);
1096                 ++ct;
1097             } while (ties && ct < fixedRangeTrials);
1098 
1099             if (ties) {
1100                 throw new MaxCountExceededException(fixedRangeTrials);
1101             }
1102         }
1103     }
1104 
1105     /**
1106      * Returns true iff there are ties in the combined sample formed from
1107      * x and y.
1108      *
1109      * @param x First sample.
1110      * @param y Second sample.
1111      * @return true if x and y together contain ties.
1112      * @throws InsufficientDataException if the input arrays only contain infinite values.
1113      * @throws NotANumberException if any of the input arrays contain a NaN value.
1114      */
1115     private static boolean hasTies(double[] x, double[] y) {
1116        final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
1117 
1118        // "unique" moves NaN to the head of the output array.
1119        if (Double.isNaN(values[0])) {
1120            throw new NotANumberException();
1121        }
1122 
1123        int infCount = 0;
1124        for (double v : values) {
1125            if (Double.isInfinite(v)) {
1126                ++infCount;
1127            }
1128        }
1129        if (infCount == values.length) {
1130            throw new InsufficientDataException();
1131        }
1132 
1133         return values.length != x.length + y.length;  // True when duplicates were removed, i.e. ties exist.
1134     }
1135 
1136     /**
1137      * Adds random jitter to {@code data} using deviates sampled from {@code rng}.
1138      * <p>
1139      * Note that jitter is applied in-place - i.e., the array
1140      * values are overwritten with the result of applying jitter.</p>
1141      *
1142      * @param data input/output data array - entries overwritten by the method
1143      * @param rng source of random deviates used to generate the jitter
1144      * @param ulp half-width, in units in the last place (ulps) of each value, of the jitter range
1145      * @throws NullPointerException if either of the parameters is null
1146      */
1147     private static void jitter(double[] data,
1148                                UniformRandomProvider rng,
1149                                int ulp) {
1150         final int range = ulp * 2;
1151         for (int i = 0; i < data.length; i++) {
1152             final double orig = data[i];
1153             if (Double.isInfinite(orig)) {
1154                 continue; // Avoid NaN creation.
1155             }
1156 
1157             final int rand = rng.nextInt(range) - ulp;
1158             final double ulps = Math.ulp(orig);
1159             final double r = rand * ulps;
1160 
1161             data[i] += r;
1162         }
1163     }
1164 
1165     /**
1166      * The function C(i, j) defined in [4] (class javadoc), formula (5.5),
1167      * which returns 1 if |i/n - j/m| <= c and 0 otherwise. Here c is scaled up
1168      * and encoded as a long to avoid rounding errors in comparison tests, so what
1169      * is actually tested is |im - jn| <= cmn.
1170      *
1171      * @param i first path parameter
1172      * @param j second path parameter
1173      * @param m first sample size
1174      * @param n second sample size
1175      * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1176      * @param strict whether or not the null hypothesis uses strict inequality
1177      * @return C(i,j) for given m, n, c
1178      */
1179     private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
1180         if (strict) {
1181             return JdkMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
1182         }
1183         return JdkMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
1184     }
1185 
1186     /**
1187      * The function N(i, j) defined in [4] (class javadoc).
1188      * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
1189      * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
1190      * The return value is integral, but subject to overflow, so it is maintained and
1191      * returned as a double.
1192      *
1193      * @param i first path parameter
1194      * @param j second path parameter
1195      * @param m first sample size
1196      * @param n second sample size
1197      * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1198      * @param strict whether or not the null hypothesis uses strict inequality
1199      * @return number of paths to (i, j) from (0,0) representing D-values as large as c for given m, n
1200      */
1201     private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
1202         /*
1203          * Unwind the recursive definition given in [4].
1204          * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
1205          * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
1206          */
1207         final double[] lag = new double[n];
1208         double last = 0;
1209         for (int k = 0; k < n; k++) {
1210             lag[k] = c(0, k + 1, m, n, cnm, strict);
1211         }
1212         for (int k = 1; k <= i; k++) {
1213             last = c(k, 0, m, n, cnm, strict);
1214             for (int l = 1; l <= j; l++) {
1215                 lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
1216                 last = lag[l - 1];
1217             }
1218         }
1219         return last;
1220     }
1221 }