1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.math4.legacy.stat.inference;
19
20 import java.math.RoundingMode;
21 import java.util.Arrays;
22
23 import org.apache.commons.rng.simple.RandomSource;
24 import org.apache.commons.rng.UniformRandomProvider;
25 import org.apache.commons.statistics.distribution.ContinuousDistribution;
26 import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
27 import org.apache.commons.numbers.fraction.BigFraction;
28 import org.apache.commons.numbers.field.BigFractionField;
29 import org.apache.commons.math4.legacy.distribution.EnumeratedRealDistribution;
30 import org.apache.commons.math4.legacy.distribution.AbstractRealDistribution;
31 import org.apache.commons.math4.legacy.exception.InsufficientDataException;
32 import org.apache.commons.math4.legacy.exception.MathArithmeticException;
33 import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
34 import org.apache.commons.math4.legacy.exception.NullArgumentException;
35 import org.apache.commons.math4.legacy.exception.NumberIsTooLargeException;
36 import org.apache.commons.math4.legacy.exception.OutOfRangeException;
37 import org.apache.commons.math4.legacy.exception.TooManyIterationsException;
38 import org.apache.commons.math4.legacy.exception.NotANumberException;
39 import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
40 import org.apache.commons.math4.legacy.linear.MatrixUtils;
41 import org.apache.commons.math4.legacy.linear.RealMatrix;
42 import org.apache.commons.math4.core.jdkmath.JdkMath;
43 import org.apache.commons.math4.legacy.core.MathArrays;
44 import org.apache.commons.math4.legacy.field.linalg.FieldDenseMatrix;
45
46 /**
47 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
48 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
49 * <p>
50 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
51 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
52 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
53 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
54 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
55 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
56 * given in [2].
57 * </p>
58 * <p>
59 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
60 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
61 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
62 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
63 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
64 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
65 * follows:
66 * <ul>
67 * <li>When the product of the sample sizes is less than 10000, the method presented in [4]
68 * is used to compute the exact p-value for the 2-sample test.</li>
69 * <li>When the product of the sample sizes is larger, the asymptotic
70 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
71 * the approximation.</li>
72 * </ul><p>
73 * For small samples (former case), if the data contains ties, random jitter is added
74 * to the sample data to break ties before applying the algorithm above. Alternatively,
75 * the {@link #bootstrap(double[],double[],int,boolean,UniformRandomProvider)}
76 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
77 * in the R Matching package [3], can be used if ties are known to be present in the data.
78 * </p>
79 * <p>
80 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
81 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} \ge d \)
82 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
83 * {@code strict} parameter. This parameter is ignored for large samples.
84 * </p>
85 * <p>
86 * The methods used by the 2-sample default implementation are also exposed directly:
87 * <ul>
88 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
89 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean}
90 * arguments in the first two methods allow the probability used to estimate the p-value to be
91 * expressed using strict or non-strict inequality. See
92 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
93 * </ul>
94 * <p>
95 * References:
96 * <ul>
97 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
98 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
99 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
100 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
101 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
102 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
103 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
104 * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
105 * Chapter 5, 3rd Ed. Academic Press.</li>
106 * </ul>
107 * <br>
108 * Note that [1] contains an error in computing h, refer to <a
109 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
110 *
111 * @since 3.3
112 */
113 public class KolmogorovSmirnovTest {
114 /** pi^2. */
115 private static final double PI_SQUARED = JdkMath.PI * JdkMath.PI;
116 /**
117 * Bound on the number of partial sums in {@link #ksSum(double, double, int)}.
118 */
119 private static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
120 /** Convergence criterion for {@link #ksSum(double, double, int)}. */
121 private static final double KS_SUM_CAUCHY_CRITERION = 1e-20;
122 /** Convergence criterion for the sums in {@link #pelzGood(double, int)}. */
123 private static final double PG_SUM_RELATIVE_ERROR = 1e-10;
124 /** 1/2. */
125 private static final BigFraction ONE_HALF = BigFraction.of(1, 2);
126
127 /**
128 * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
129 * distribution to compute the p-value.
130 */
131 private static final int LARGE_SAMPLE_PRODUCT = 10000;
132
133 /**
134 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
135 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
136 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
137 * {@code exact} is true, the distribution used to compute the p-value is computed using
138 * extended precision. See {@link #cdfExact(double, int)}.
139 *
140 * @param distribution reference distribution
141 * @param data sample being being evaluated
142 * @param exact whether or not to force exact computation of the p-value
143 * @return the p-value associated with the null hypothesis that {@code data} is a sample from
144 * {@code distribution}
145 * @throws InsufficientDataException if {@code data} does not have length at least 2
146 * @throws NullArgumentException if {@code data} is null
147 */
148 public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, boolean exact) {
149 return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
150 }
151
152 /**
153 * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
154 * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
155 * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
156 * each of the values in {@code data}.
157 *
158 * @param distribution reference distribution
159 * @param data sample being evaluated
160 * @return Kolmogorov-Smirnov statistic \(D_n\)
161 * @throws InsufficientDataException if {@code data} does not have length at least 2
162 * @throws NullArgumentException if {@code data} is null
163 */
164 public double kolmogorovSmirnovStatistic(ContinuousDistribution distribution, double[] data) {
165 checkArray(data);
166 final int n = data.length;
167 final double nd = n;
168 final double[] dataCopy = new double[n];
169 System.arraycopy(data, 0, dataCopy, 0, n);
170 Arrays.sort(dataCopy);
171 double d = 0d;
172 for (int i = 1; i <= n; i++) {
173 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
174 final double currD = JdkMath.max(yi - (i - 1) / nd, i / nd - yi);
175 if (currD > d) {
176 d = currD;
177 }
178 }
179 return d;
180 }
181
182 /**
183 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
184 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
185 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
186 * probability distribution. Specifically, what is returned is an estimate of the probability
187 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
188 * selected partition of the combined sample into subsamples of sizes {@code x.length} and
189 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
190 * large as (if {@code strict} is {@code false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
191 *
192 * @param x first sample dataset.
193 * @param y second sample dataset.
194 * @param strict whether or not the probability to compute is expressed as
195 * a strict inequality (ignored for large samples).
196 * @return p-value associated with the null hypothesis that {@code x} and
197 * {@code y} represent samples from the same distribution.
198 * @throws InsufficientDataException if either {@code x} or {@code y} does
199 * not have length at least 2.
200 * @throws NullArgumentException if either {@code x} or {@code y} is null.
201 * @throws NotANumberException if the input arrays contain NaN values.
202 *
203 * @see #bootstrap(double[],double[],int,boolean,UniformRandomProvider)
204 */
205 public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
206 final long lengthProduct = (long) x.length * y.length;
207 double[] xa = null;
208 double[] ya = null;
209 if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
210 xa = Arrays.copyOf(x, x.length);
211 ya = Arrays.copyOf(y, y.length);
212 fixTies(xa, ya);
213 } else {
214 xa = x;
215 ya = y;
216 }
217 if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
218 return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
219 }
220 return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
221 }
222
223 /**
224 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
225 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
226 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
227 * probability distribution. Assumes the strict form of the inequality used to compute the
228 * p-value. See {@link #kolmogorovSmirnovTest(ContinuousDistribution, double[], boolean)}.
229 *
230 * @param x first sample dataset
231 * @param y second sample dataset
232 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
233 * samples from the same distribution
234 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
235 * least 2
236 * @throws NullArgumentException if either {@code x} or {@code y} is null
237 */
238 public double kolmogorovSmirnovTest(double[] x, double[] y) {
239 return kolmogorovSmirnovTest(x, y, true);
240 }
241
242 /**
243 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
244 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
245 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
246 * is the empirical distribution of the {@code y} values.
247 *
248 * @param x first sample
249 * @param y second sample
250 * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
251 * {@code y} represent samples from the same underlying distribution
252 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
253 * least 2
254 * @throws NullArgumentException if either {@code x} or {@code y} is null
255 */
256 public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
257 return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
258 }
259
260 /**
261 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
262 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
263 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
264 * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
265 * as long value.
266 *
267 * @param x first sample
268 * @param y second sample
269 * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
270 * {@code y} represent samples from the same underlying distribution
271 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
272 * least 2
273 * @throws NullArgumentException if either {@code x} or {@code y} is null
274 */
275 private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
276 checkArray(x);
277 checkArray(y);
278 // Copy and sort the sample arrays
279 final double[] sx = Arrays.copyOf(x, x.length);
280 final double[] sy = Arrays.copyOf(y, y.length);
281 Arrays.sort(sx);
282 Arrays.sort(sy);
283 final int n = sx.length;
284 final int m = sy.length;
285
286 int rankX = 0;
287 int rankY = 0;
288 long curD = 0L;
289
290 // Find the max difference between cdf_x and cdf_y
291 long supD = 0L;
292 do {
293 double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
294 while(rankX < n && Double.compare(sx[rankX], z) == 0) {
295 rankX += 1;
296 curD += m;
297 }
298 while(rankY < m && Double.compare(sy[rankY], z) == 0) {
299 rankY += 1;
300 curD -= n;
301 }
302 if (curD > supD) {
303 supD = curD;
304 } else if (-curD > supD) {
305 supD = -curD;
306 }
307 } while(rankX < n && rankY < m);
308 return supD;
309 }
310
311 /**
312 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
313 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
314 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
315 *
316 * @param distribution reference distribution
317 * @param data sample being being evaluated
318 * @return the p-value associated with the null hypothesis that {@code data} is a sample from
319 * {@code distribution}
320 * @throws InsufficientDataException if {@code data} does not have length at least 2
321 * @throws NullArgumentException if {@code data} is null
322 */
323 public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data) {
324 return kolmogorovSmirnovTest(distribution, data, false);
325 }
326
327 /**
328 * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
329 * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
330 *
331 * @param distribution reference distribution
332 * @param data sample being being evaluated
333 * @param alpha significance level of the test
334 * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
335 * can be rejected with confidence 1 - {@code alpha}
336 * @throws InsufficientDataException if {@code data} does not have length at least 2
337 * @throws NullArgumentException if {@code data} is null
338 */
339 public boolean kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, double alpha) {
340 if (alpha <= 0 || alpha > 0.5) {
341 throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
342 }
343 return kolmogorovSmirnovTest(distribution, data) < alpha;
344 }
345
346 /**
347 * Estimates the <i>p-value</i> of a two-sample
348 * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov test</a>
349 * evaluating the null hypothesis that {@code x} and {@code y} are samples
350 * drawn from the same probability distribution.
351 * This method estimates the p-value by repeatedly sampling sets of size
352 * {@code x.length} and {@code y.length} from the empirical distribution
353 * of the combined sample.
354 * When {@code strict} is true, this is equivalent to the algorithm implemented
355 * in the R function {@code ks.boot}, described in <pre>
356 * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
357 * Software with Automated Balance Optimization: The Matching package for R.'
358 * Journal of Statistical Software, 42(7): 1-52.
359 * </pre>
360 *
361 * @param x First sample.
362 * @param y Second sample.
363 * @param iterations Number of bootstrap resampling iterations.
364 * @param strict Whether or not the null hypothesis is expressed as a strict inequality.
365 * @param rng RNG for creating the sampling sets.
366 * @return the estimated p-value.
367 */
368 public double bootstrap(double[] x,
369 double[] y,
370 int iterations,
371 boolean strict,
372 UniformRandomProvider rng) {
373 final int xLength = x.length;
374 final int yLength = y.length;
375 final double[] combined = new double[xLength + yLength];
376 System.arraycopy(x, 0, combined, 0, xLength);
377 System.arraycopy(y, 0, combined, xLength, yLength);
378 final ContinuousDistribution.Sampler sampler = new EnumeratedRealDistribution(combined).createSampler(rng);
379 final long d = integralKolmogorovSmirnovStatistic(x, y);
380 int greaterCount = 0;
381 int equalCount = 0;
382 double[] curX;
383 double[] curY;
384 long curD;
385 for (int i = 0; i < iterations; i++) {
386 curX = AbstractRealDistribution.sample(xLength, sampler);
387 curY = AbstractRealDistribution.sample(yLength, sampler);
388 curD = integralKolmogorovSmirnovStatistic(curX, curY);
389 if (curD > d) {
390 greaterCount++;
391 } else if (curD == d) {
392 equalCount++;
393 }
394 }
395 return strict ? greaterCount / (double) iterations :
396 (greaterCount + equalCount) / (double) iterations;
397 }
398
399 /**
400 * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme
401 * values given in [2] (see above). The result is not exact as with
402 * {@link #cdfExact(double, int)} because calculations are based on
403 * {@code double} rather than {@link BigFraction}.
404 *
405 * @param d statistic
406 * @param n sample size
407 * @return \(P(D_n < d)\)
408 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
409 * {@link BigFraction} in expressing {@code d} as
410 * \((k - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
411 */
412 public double cdf(double d, int n) {
413 return cdf(d, n, false);
414 }
415
416 /**
417 * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
418 * used everywhere at the expense of very slow execution time. Almost never choose this in real
419 * applications unless you are very sure; this is almost solely for verification purposes.
420 * Normally, you would choose {@link #cdf(double, int)}. See the class
421 * javadoc for definitions and algorithm description.
422 *
423 * @param d statistic
424 * @param n sample size
425 * @return \(P(D_n < d)\)
426 * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
427 * {@link BigFraction} in expressing {@code d} as
428 * \((k - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
429 */
430 public double cdfExact(double d, int n) {
431 return cdf(d, n, true);
432 }
433
434 /**
435 * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
436 * values given in [2] (see above).
437 *
438 * @param d statistic
439 * @param n sample size
440 * @param exact whether the probability should be calculated exact using
441 * {@link BigFraction} everywhere at the expense of
442 * very slow execution time, or if {@code double} should be used convenient places to
443 * gain speed. Almost never choose {@code true} in real applications unless you are very
444 * sure; {@code true} is almost solely for verification purposes.
445 * @return \(P(D_n < d)\)
446 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
447 * {@link BigFraction} in expressing {@code d} as
448 * \((k - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
449 */
450 public double cdf(double d, int n, boolean exact) {
451 final double ninv = 1 / ((double) n);
452 final double ninvhalf = 0.5 * ninv;
453
454 if (d <= ninvhalf) {
455 return 0;
456 } else if (ninvhalf < d && d <= ninv) {
457 double res = 1;
458 final double f = 2 * d - ninv;
459 // n! f^n = n*f * (n-1)*f * ... * 1*x
460 for (int i = 1; i <= n; ++i) {
461 res *= i * f;
462 }
463 return res;
464 } else if (1 - ninv <= d && d < 1) {
465 return 1 - 2 * Math.pow(1 - d, n);
466 } else if (1 <= d) {
467 return 1;
468 }
469 if (exact) {
470 return exactK(d, n);
471 }
472 if (n <= 140) {
473 return roundedK(d, n);
474 }
475 return pelzGood(d, n);
476 }
477
478 /**
479 * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
480 * in class javadoc above) and {@link BigFraction} (see above).
481 *
482 * @param d statistic
483 * @param n sample size
484 * @return the two-sided probability of \(P(D_n < d)\)
485 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
486 * {@link BigFraction}.
487 */
488 private double exactK(double d, int n) {
489 final int k = (int) Math.ceil(n * d);
490
491 final FieldDenseMatrix<BigFraction> h;
492 try {
493 h = createExactH(d, n);
494 } catch (ArithmeticException e) {
495 throw new MathArithmeticException(LocalizedFormats.FRACTION);
496 }
497
498 final FieldDenseMatrix<BigFraction> hPower = h.pow(n);
499
500 BigFraction pFrac = hPower.get(k - 1, k - 1);
501
502 for (int i = 1; i <= n; ++i) {
503 pFrac = pFrac.multiply(i).divide(n);
504 }
505
506 /*
507 * BigFraction.doubleValue converts numerator to double and the denominator to double and
508 * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
509 * digits):
510 */
511 return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue();
512 }
513
514 /**
515 * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
516 *
517 * @param d statistic
518 * @param n sample size
519 * @return \(P(D_n < d)\)
520 */
521 private double roundedK(double d, int n) {
522
523 final int k = (int) Math.ceil(n * d);
524 final RealMatrix h = this.createRoundedH(d, n);
525 final RealMatrix hPower = h.power(n);
526
527 double pFrac = hPower.getEntry(k - 1, k - 1);
528 for (int i = 1; i <= n; ++i) {
529 pFrac *= (double) i / (double) n;
530 }
531
532 return pFrac;
533 }
534
535 /**
536 * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
537 *
538 * @param d value of d-statistic (x in [2])
539 * @param n sample size
540 * @return \(P(D_n < d)\)
541 * @since 3.4
542 */
543 public double pelzGood(double d, int n) {
544 // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
545 final double sqrtN = JdkMath.sqrt(n);
546 final double z = d * sqrtN;
547 final double z2 = d * d * n;
548 final double z4 = z2 * z2;
549 final double z6 = z4 * z2;
550 final double z8 = z4 * z4;
551
552 // Eventual return value
553 double ret = 0;
554
555 // Compute K_0(z)
556 double sum = 0;
557 double increment = 0;
558 double kTerm = 0;
559 double z2Term = PI_SQUARED / (8 * z2);
560 int k = 1;
561 for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
562 kTerm = 2 * k - 1;
563 increment = JdkMath.exp(-z2Term * kTerm * kTerm);
564 sum += increment;
565 if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
566 break;
567 }
568 }
569 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
570 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
571 }
572 ret = sum * JdkMath.sqrt(2 * JdkMath.PI) / z;
573
574 // K_1(z)
575 // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
576 // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
577 final double twoZ2 = 2 * z2;
578 sum = 0;
579 kTerm = 0;
580 double kTerm2 = 0;
581 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
582 kTerm = k + 0.5;
583 kTerm2 = kTerm * kTerm;
584 increment = (PI_SQUARED * kTerm2 - z2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
585 sum += increment;
586 if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
587 break;
588 }
589 }
590 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
591 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
592 }
593 final double sqrtHalfPi = JdkMath.sqrt(JdkMath.PI / 2);
594 // Instead of doubling sum, divide by 3 instead of 6
595 ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
596
597 // K_2(z)
598 // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
599 final double z4Term = 2 * z4;
600 final double z6Term = 6 * z6;
601 z2Term = 5 * z2;
602 final double pi4 = PI_SQUARED * PI_SQUARED;
603 sum = 0;
604 kTerm = 0;
605 kTerm2 = 0;
606 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
607 kTerm = k + 0.5;
608 kTerm2 = kTerm * kTerm;
609 increment = (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
610 pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
611 sum += increment;
612 if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
613 break;
614 }
615 }
616 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
617 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
618 }
619 double sum2 = 0;
620 kTerm2 = 0;
621 for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
622 kTerm2 = (double) k * k;
623 increment = PI_SQUARED * kTerm2 * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
624 sum2 += increment;
625 if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
626 break;
627 }
628 }
629 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
630 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
631 }
632 // Again, adjust coefficients instead of doubling sum, sum2
633 ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
634
635 // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
636 // Multiply coefficient denominators by 2, so omit doubling sums.
637 final double pi6 = pi4 * PI_SQUARED;
638 sum = 0;
639 double kTerm4 = 0;
640 double kTerm6 = 0;
641 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
642 kTerm = k + 0.5;
643 kTerm2 = kTerm * kTerm;
644 kTerm4 = kTerm2 * kTerm2;
645 kTerm6 = kTerm4 * kTerm2;
646 increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
647 PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
648 JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
649 sum += increment;
650 if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
651 break;
652 }
653 }
654 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
655 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
656 }
657 sum2 = 0;
658 for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
659 kTerm2 = (double) k * k;
660 kTerm4 = kTerm2 * kTerm2;
661 increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
662 JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
663 sum2 += increment;
664 if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
665 break;
666 }
667 }
668 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
669 throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
670 }
671 return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
672 sum2 / (108 * z6));
673 }
674
675 /**
676 * Creates {@code H} of size {@code m x m} as described in [1] (see above).
677 *
678 * @param d statistic
679 * @param n sample size
680 * @return H matrix
681 * @throws NumberIsTooLargeException if fractional part is greater than 1.
682 * @throws ArithmeticException if algorithm fails to convert {@code h} to a
683 * {@link BigFraction}.
684 */
685 private FieldDenseMatrix<BigFraction> createExactH(double d,
686 int n) {
687 final int k = (int) Math.ceil(n * d);
688 final int m = 2 * k - 1;
689 final double hDouble = k - n * d;
690 if (hDouble >= 1) {
691 throw new NumberIsTooLargeException(hDouble, 1.0, false);
692 }
693 BigFraction h = null;
694 try {
695 h = BigFraction.from(hDouble, 1e-20, 10000);
696 } catch (final ArithmeticException e1) {
697 try {
698 h = BigFraction.from(hDouble, 1e-10, 10000);
699 } catch (final ArithmeticException e2) {
700 h = BigFraction.from(hDouble, 1e-5, 10000);
701 }
702 }
703 final FieldDenseMatrix<BigFraction> hData = FieldDenseMatrix.create(BigFractionField.get(), m, m);
704
705 /*
706 * Start by filling everything with either 0 or 1.
707 */
708 for (int i = 0; i < m; ++i) {
709 for (int j = 0; j < m; ++j) {
710 if (i - j + 1 < 0) {
711 hData.set(i, j, BigFraction.ZERO);
712 } else {
713 hData.set(i, j, BigFraction.ONE);
714 }
715 }
716 }
717
718 /*
719 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
720 * hPowers[m-1] = h^m
721 */
722 final BigFraction[] hPowers = new BigFraction[m];
723 hPowers[0] = h;
724 for (int i = 1; i < m; i++) {
725 hPowers[i] = h.multiply(hPowers[i - 1]);
726 }
727
728 /*
729 * First column and last row has special values (each other reversed).
730 */
731 for (int i = 0; i < m; ++i) {
732 hData.set(i, 0,
733 hData.get(i, 0).subtract(hPowers[i]));
734 hData.set(m - 1, i,
735 hData.get(m - 1, i).subtract(hPowers[m - i - 1]));
736 }
737
738 /*
739 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
740 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
741 */
742 if (h.compareTo(ONE_HALF) > 0) {
743 hData.set(m - 1, 0,
744 hData.get(m - 1, 0).add(h.multiply(2).subtract(1).pow(m)));
745 }
746
747 /*
748 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
749 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
750 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
751 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
752 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
753 * really necessary.
754 */
755 for (int i = 0; i < m; ++i) {
756 for (int j = 0; j < i + 1; ++j) {
757 if (i - j + 1 > 0) {
758 for (int g = 2; g <= i - j + 1; ++g) {
759 hData.set(i, j,
760 hData.get(i, j).divide(g));
761 }
762 }
763 }
764 }
765 return hData;
766 }
767
768 /***
769 * Creates {@code H} of size {@code m x m} as described in [1] (see above)
770 * using double-precision.
771 *
772 * @param d statistic
773 * @param n sample size
774 * @return H matrix
775 * @throws NumberIsTooLargeException if fractional part is greater than 1
776 */
777 private RealMatrix createRoundedH(double d, int n)
778 throws NumberIsTooLargeException {
779
780 final int k = (int) Math.ceil(n * d);
781 final int m = 2 * k - 1;
782 final double h = k - n * d;
783 if (h >= 1) {
784 throw new NumberIsTooLargeException(h, 1.0, false);
785 }
786 final double[][] hData = new double[m][m];
787
788 /*
789 * Start by filling everything with either 0 or 1.
790 */
791 for (int i = 0; i < m; ++i) {
792 for (int j = 0; j < m; ++j) {
793 if (i - j + 1 < 0) {
794 hData[i][j] = 0;
795 } else {
796 hData[i][j] = 1;
797 }
798 }
799 }
800
801 /*
802 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
803 * hPowers[m-1] = h^m
804 */
805 final double[] hPowers = new double[m];
806 hPowers[0] = h;
807 for (int i = 1; i < m; ++i) {
808 hPowers[i] = h * hPowers[i - 1];
809 }
810
811 /*
812 * First column and last row has special values (each other reversed).
813 */
814 for (int i = 0; i < m; ++i) {
815 hData[i][0] = hData[i][0] - hPowers[i];
816 hData[m - 1][i] -= hPowers[m - i - 1];
817 }
818
819 /*
820 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
821 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
822 */
823 if (Double.compare(h, 0.5) > 0) {
824 hData[m - 1][0] += JdkMath.pow(2 * h - 1, m);
825 }
826
827 /*
828 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
829 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
830 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
831 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
832 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
833 * really necessary.
834 */
835 for (int i = 0; i < m; ++i) {
836 for (int j = 0; j < i + 1; ++j) {
837 if (i - j + 1 > 0) {
838 for (int g = 2; g <= i - j + 1; ++g) {
839 hData[i][j] /= g;
840 }
841 }
842 }
843 }
844 return MatrixUtils.createRealMatrix(hData);
845 }
846
847 /**
848 * Verifies that {@code array} has length at least 2.
849 *
850 * @param array array to test
851 * @throws NullArgumentException if array is null
852 * @throws InsufficientDataException if array is too short
853 */
854 private void checkArray(double[] array) {
855 if (array == null) {
856 throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
857 }
858 if (array.length < 2) {
859 throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
860 2);
861 }
862 }
863
864 /**
865 * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
866 * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
867 * have been computed. If the sum does not converge before {@code maxIterations} iterations a
868 * {@link TooManyIterationsException} is thrown.
869 *
870 * @param t argument
871 * @param tolerance Cauchy criterion for partial sums
872 * @param maxIterations maximum number of partial sums to compute
873 * @return Kolmogorov sum evaluated at t
874 * @throws TooManyIterationsException if the series does not converge
875 */
876 public double ksSum(double t, double tolerance, int maxIterations) {
877 if (t == 0.0) {
878 return 0.0;
879 }
880
881 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
882 // from class javadoc should be used.
883
884 final double x = -2 * t * t;
885 int sign = -1;
886 long i = 1;
887 double partialSum = 0.5d;
888 double delta = 1;
889 while (delta > tolerance && i < maxIterations) {
890 delta = JdkMath.exp(x * i * i);
891 partialSum += sign * delta;
892 sign *= -1;
893 i++;
894 }
895 if (i == maxIterations) {
896 throw new TooManyIterationsException(maxIterations);
897 }
898 return partialSum * 2;
899 }
900
901 /**
902 * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
903 * an integral d-statistic in the range [0, n*m] is calculated, that can be used for
904 * comparison with other integral d-statistics. Depending whether {@code strict} is
905 * {@code true} or not, the returned value divided by (n*m) is greater than
906 * (resp greater than or equal to) the given d value (allowing some tolerance).
907 *
908 * @param d a d-statistic in the range [0, 1]
909 * @param n first sample size
910 * @param m second sample size
911 * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
912 * @return the integral d-statistic in the range [0, n*m]
913 */
914 private static long calculateIntegralD(double d, int n, int m, boolean strict) {
915 final double tol = 1e-12; // d-values within tol of one another are considered equal
916 long nm = n * (long)m;
917 long upperBound = (long)JdkMath.ceil((d - tol) * nm);
918 long lowerBound = (long)JdkMath.floor((d + tol) * nm);
919 if (strict && lowerBound == upperBound) {
920 return upperBound + 1L;
921 } else {
922 return upperBound;
923 }
924 }
925
926 /**
927 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
928 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
929 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
930 * <p>
931 * The returned probability is exact, implemented by unwinding the recursive function
932 * definitions presented in [4] (class javadoc).
933 * </p>
934 *
935 * @param d D-statistic value
936 * @param n first sample size
937 * @param m second sample size
938 * @param strict whether or not the probability to compute is expressed as a strict inequality
939 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
940 * greater than (resp. greater than or equal to) {@code d}
941 */
942 public double exactP(double d, int n, int m, boolean strict) {
943 return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
944 BinomialCoefficientDouble.value(n + m, m);
945 }
946
947 /**
948 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
949 * is the 2-sample Kolmogorov-Smirnov statistic. See
950 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
951 * <p>
952 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
953 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
954 * details on how convergence of the sum is determined.
955 * </p>
956 *
957 * @param d D-statistic value
958 * @param n first sample size
959 * @param m second sample size
960 * @return approximate probability that a randomly selected m-n partition of m + n generates
961 * \(D_{n,m}\) greater than {@code d}
962 */
963 public double approximateP(double d, int n, int m) {
964 return 1 - ksSum(d * JdkMath.sqrt(((double) m * (double) n) / ((double) m + (double) n)),
965 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
966 }
967
968 /**
969 * Fills a boolean array randomly with a fixed number of {@code true} values.
970 * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
971 * By processing first the {@code true} values followed by the remaining {@code false} values
972 * less random numbers need to be generated. The method is optimized for the case
973 * that the number of {@code true} values is larger than or equal to the number of
974 * {@code false} values.
975 *
976 * @param b boolean array
977 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
978 * @param rng random data generator
979 */
980 private static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final UniformRandomProvider rng) {
981 Arrays.fill(b, true);
982 for (int k = numberOfTrueValues; k < b.length; k++) {
983 final int r = rng.nextInt(k + 1);
984 b[(b[r]) ? r : k] = false;
985 }
986 }
987
988 /**
989 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the
990 * 2-sample Kolmogorov-Smirnov statistic. See
991 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
992 * <p>
993 * The simulation generates {@code iterations} random partitions of {@code m + n} into an
994 * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
995 * the proportion of values that are greater than {@code d}, or greater than or equal to
996 * {@code d} if {@code strict} is {@code false}.
997 * </p>
998 *
999 * @param d D-statistic value.
1000 * @param n First sample size.
1001 * @param m Second sample size.
1002 * @param iterations Number of random partitions to generate.
1003 * @param strict whether or not the probability to compute is expressed as a strict inequality
1004 * @param rng RNG used for generating the partitions.
1005 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1006 * greater than (resp. greater than or equal to) {@code d}.
1007 */
1008 public double monteCarloP(final double d,
1009 final int n,
1010 final int m,
1011 final boolean strict,
1012 final int iterations,
1013 UniformRandomProvider rng) {
1014 return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations, rng);
1015 }
1016
1017 /**
1018 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d / (n * m))\)
1019 * where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
1020 * <p>
1021 * Here {@code d} is the D-statistic represented as long value.
1022 * The real D-statistic is obtained by dividing {@code d} by {@code n * m}.
1023 * See also {@link #monteCarloP(double,int,int,boolean,int,UniformRandomProvider)}.
1024 *
1025 * @param d Integral D-statistic.
1026 * @param n First sample size.
1027 * @param m Second sample size.
1028 * @param iterations Number of random partitions to generate.
1029 * @param rng RNG used for generating the partitions.
1030 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
1031 * greater than or equal to {@code d / (n * m))}.
1032 */
1033 private double integralMonteCarloP(final long d,
1034 final int n,
1035 final int m,
1036 final int iterations,
1037 UniformRandomProvider rng) {
1038 // ensure that nn is always the max of (n, m) to require fewer random numbers
1039 final int nn = JdkMath.max(n, m);
1040 final int mm = JdkMath.min(n, m);
1041 final int sum = nn + mm;
1042
1043 int tail = 0;
1044 final boolean[] b = new boolean[sum];
1045 for (int i = 0; i < iterations; i++) {
1046 fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
1047 long curD = 0L;
1048 for(int j = 0; j < b.length; ++j) {
1049 if (b[j]) {
1050 curD += mm;
1051 if (curD >= d) {
1052 tail++;
1053 break;
1054 }
1055 } else {
1056 curD -= nn;
1057 if (curD <= -d) {
1058 tail++;
1059 break;
1060 }
1061 }
1062 }
1063 }
1064 return (double) tail / iterations;
1065 }
1066
1067 /**
1068 * If there are no ties in the combined dataset formed from x and y,
1069 * this method is a no-op.
1070 * If there are ties, a uniform random deviate in
1071 * is added to each value in x and y, and this method overwrites the
1072 * data in x and y with the jittered values.
1073 *
1074 * @param x First sample.
1075 * @param y Second sample.
1076 * @throw NotANumberException if any of the input arrays contain
1077 * a NaN value.
1078 */
1079 private static void fixTies(double[] x, double[] y) {
1080 if (hasTies(x, y)) {
1081 // Add jitter using a fixed seed (so same arguments always give same results),
1082 // low-initialization-overhead generator.
1083 final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(876543217L);
1084
1085 // It is theoretically possible that jitter does not break ties, so repeat
1086 // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
1087 int fixedRangeTrials = 10;
1088 int jitterRange = 10;
1089 int ct = 0;
1090 boolean ties = true;
1091 do {
1092 // Nudge the data using a fixed range.
1093 jitter(x, rng, jitterRange);
1094 jitter(y, rng, jitterRange);
1095 ties = hasTies(x, y);
1096 ++ct;
1097 } while (ties && ct < fixedRangeTrials);
1098
1099 if (ties) {
1100 throw new MaxCountExceededException(fixedRangeTrials);
1101 }
1102 }
1103 }
1104
1105 /**
1106 * Returns true iff there are ties in the combined sample formed from
1107 * x and y.
1108 *
1109 * @param x First sample.
1110 * @param y Second sample.
1111 * @return true if x and y together contain ties.
1112 * @throw InsufficientDataException if the input arrays only contain infinite values.
1113 * @throw NotANumberException if any of the input arrays contain a NaN value.
1114 */
1115 private static boolean hasTies(double[] x, double[] y) {
1116 final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
1117
1118 // "unique" moves NaN to the head of the output array.
1119 if (Double.isNaN(values[0])) {
1120 throw new NotANumberException();
1121 }
1122
1123 int infCount = 0;
1124 for (double v : values) {
1125 if (Double.isInfinite(v)) {
1126 ++infCount;
1127 }
1128 }
1129 if (infCount == values.length) {
1130 throw new InsufficientDataException();
1131 }
1132
1133 return values.length != x.length + y.length; // There are no ties.
1134 }
1135
1136 /**
1137 * Adds random jitter to {@code data} using deviates sampled from {@code dist}.
1138 * <p>
1139 * Note that jitter is applied in-place - i.e., the array
1140 * values are overwritten with the result of applying jitter.</p>
1141 *
1142 * @param data input/output data array - entries overwritten by the method
1143 * @param rng probability distribution to sample for jitter values
1144 * @param ulp ulp used when generating random numbers
1145 * @throws NullPointerException if either of the parameters is null
1146 */
1147 private static void jitter(double[] data,
1148 UniformRandomProvider rng,
1149 int ulp) {
1150 final int range = ulp * 2;
1151 for (int i = 0; i < data.length; i++) {
1152 final double orig = data[i];
1153 if (Double.isInfinite(orig)) {
1154 continue; // Avoid NaN creation.
1155 }
1156
1157 final int rand = rng.nextInt(range) - ulp;
1158 final double ulps = Math.ulp(orig);
1159 final double r = rand * ulps;
1160
1161 data[i] += r;
1162 }
1163 }
1164
1165 /**
1166 * The function C(i, j) defined in [4] (class javadoc), formula (5.5).
1167 * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
1168 * and recoded as a long to avoid rounding errors in comparison tests, so what
1169 * is actually tested is |im - jn| <= cmn.
1170 *
1171 * @param i first path parameter
1172 * @param j second path paramter
1173 * @param m first sample size
1174 * @param n second sample size
1175 * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1176 * @param strict whether or not the null hypothesis uses strict inequality
1177 * @return C(i,j) for given m, n, c
1178 */
1179 private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
1180 if (strict) {
1181 return JdkMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
1182 }
1183 return JdkMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
1184 }
1185
1186 /**
1187 * The function N(i, j) defined in [4] (class javadoc).
1188 * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
1189 * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
1190 * The return value is integral, but subject to overflow, so it is maintained and
1191 * returned as a double.
1192 *
1193 * @param i first path parameter
1194 * @param j second path parameter
1195 * @param m first sample size
1196 * @param n second sample size
1197 * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
1198 * @param strict whether or not the null hypothesis uses strict inequality
1199 * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
1200 */
1201 private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
1202 /*
1203 * Unwind the recursive definition given in [4].
1204 * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
1205 * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
1206 */
1207 final double[] lag = new double[n];
1208 double last = 0;
1209 for (int k = 0; k < n; k++) {
1210 lag[k] = c(0, k + 1, m, n, cnm, strict);
1211 }
1212 for (int k = 1; k <= i; k++) {
1213 last = c(k, 0, m, n, cnm, strict);
1214 for (int l = 1; l <= j; l++) {
1215 lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
1216 last = lag[l - 1];
1217 }
1218 }
1219 return last;
1220 }
1221 }