1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.math4.legacy.stat.inference; 19 20 import java.math.RoundingMode; 21 import java.util.Arrays; 22 23 import org.apache.commons.rng.simple.RandomSource; 24 import org.apache.commons.rng.UniformRandomProvider; 25 import org.apache.commons.statistics.distribution.ContinuousDistribution; 26 import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble; 27 import org.apache.commons.numbers.fraction.BigFraction; 28 import org.apache.commons.numbers.field.BigFractionField; 29 import org.apache.commons.math4.legacy.distribution.EnumeratedRealDistribution; 30 import org.apache.commons.math4.legacy.distribution.AbstractRealDistribution; 31 import org.apache.commons.math4.legacy.exception.InsufficientDataException; 32 import org.apache.commons.math4.legacy.exception.MathArithmeticException; 33 import org.apache.commons.math4.legacy.exception.MaxCountExceededException; 34 import org.apache.commons.math4.legacy.exception.NullArgumentException; 35 import org.apache.commons.math4.legacy.exception.NumberIsTooLargeException; 36 import org.apache.commons.math4.legacy.exception.OutOfRangeException; 37 import 
org.apache.commons.math4.legacy.exception.TooManyIterationsException; 38 import org.apache.commons.math4.legacy.exception.NotANumberException; 39 import org.apache.commons.math4.legacy.exception.util.LocalizedFormats; 40 import org.apache.commons.math4.legacy.linear.MatrixUtils; 41 import org.apache.commons.math4.legacy.linear.RealMatrix; 42 import org.apache.commons.math4.core.jdkmath.JdkMath; 43 import org.apache.commons.math4.legacy.core.MathArrays; 44 import org.apache.commons.math4.legacy.field.linalg.FieldDenseMatrix; 45 46 /** 47 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> 48 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions. 49 * <p> 50 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of 51 * sample data points from the distribution expected under the null hypothesis. For one-sample tests 52 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the 53 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and 54 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of 55 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values 56 * given in [2]. 57 * </p> 58 * <p> 59 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples 60 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test 61 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is 62 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of 63 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. 
The 64 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as 65 * follows: 66 * <ul> 67 * <li>When the product of the sample sizes is less than 10000, the method presented in [4] 68 * is used to compute the exact p-value for the 2-sample test.</li> 69 * <li>When the product of the sample sizes is larger, the asymptotic 70 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on 71 * the approximation.</li> 72 * </ul><p> 73 * For small samples (former case), if the data contains ties, random jitter is added 74 * to the sample data to break ties before applying the algorithm above. Alternatively, 75 * the {@link #bootstrap(double[],double[],int,boolean,UniformRandomProvider)} 76 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a> 77 * in the R Matching package [3], can be used if ties are known to be present in the data. 78 * </p> 79 * <p> 80 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value 81 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} \ge d \) 82 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean 83 * {@code strict} parameter. This parameter is ignored for large samples. 84 * </p> 85 * <p> 86 * The methods used by the 2-sample default implementation are also exposed directly: 87 * <ul> 88 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li> 89 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean} 90 * arguments in the first two methods allow the probability used to estimate the p-value to be 91 * expressed using strict or non-strict inequality. 
See
 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
 * </ul>
 * <p>
 * References:
 * <ul>
 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
 * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
 * Chapter 5, 3rd Ed. Academic Press.</li>
 * </ul>
 * <br>
 * Note that [1] contains an error in computing h, refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 *
 * @since 3.3
 */
public class KolmogorovSmirnovTest {
    /** pi^2, shared by the Pelz-Good series terms. */
    private static final double PI_SQUARED = JdkMath.PI * JdkMath.PI;
    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}.
     */
    private static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
    /** Convergence criterion for {@link #ksSum(double, double, int)}. */
    private static final double KS_SUM_CAUCHY_CRITERION = 1e-20;
    /** Convergence criterion for the sums in {@link #pelzGood(double, int)}. */
    private static final double PG_SUM_RELATIVE_ERROR = 1e-10;
    /** 1/2, used when testing the bottom-left correction of the H matrix. */
    private static final BigFraction ONE_HALF = BigFraction.of(1, 2);

    /**
     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
     * distribution to compute the p-value.
130 */ 131 private static final int LARGE_SAMPLE_PRODUCT = 10000; 132 133 /** 134 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a 135 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 136 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If 137 * {@code exact} is true, the distribution used to compute the p-value is computed using 138 * extended precision. See {@link #cdfExact(double, int)}. 139 * 140 * @param distribution reference distribution 141 * @param data sample being being evaluated 142 * @param exact whether or not to force exact computation of the p-value 143 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 144 * {@code distribution} 145 * @throws InsufficientDataException if {@code data} does not have length at least 2 146 * @throws NullArgumentException if {@code data} is null 147 */ 148 public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, boolean exact) { 149 return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact); 150 } 151 152 /** 153 * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where 154 * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the 155 * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at 156 * each of the values in {@code data}. 
157 * 158 * @param distribution reference distribution 159 * @param data sample being evaluated 160 * @return Kolmogorov-Smirnov statistic \(D_n\) 161 * @throws InsufficientDataException if {@code data} does not have length at least 2 162 * @throws NullArgumentException if {@code data} is null 163 */ 164 public double kolmogorovSmirnovStatistic(ContinuousDistribution distribution, double[] data) { 165 checkArray(data); 166 final int n = data.length; 167 final double nd = n; 168 final double[] dataCopy = new double[n]; 169 System.arraycopy(data, 0, dataCopy, 0, n); 170 Arrays.sort(dataCopy); 171 double d = 0d; 172 for (int i = 1; i <= n; i++) { 173 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); 174 final double currD = JdkMath.max(yi - (i - 1) / nd, i / nd - yi); 175 if (currD > d) { 176 d = currD; 177 } 178 } 179 return d; 180 } 181 182 /** 183 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 184 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 185 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 186 * probability distribution. Specifically, what is returned is an estimate of the probability 187 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly 188 * selected partition of the combined sample into subsamples of sizes {@code x.length} and 189 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as 190 * large as (if {@code strict} is {@code false}) as {@code kolmogorovSmirnovStatistic(x, y)}. 191 * 192 * @param x first sample dataset. 193 * @param y second sample dataset. 194 * @param strict whether or not the probability to compute is expressed as 195 * a strict inequality (ignored for large samples). 196 * @return p-value associated with the null hypothesis that {@code x} and 197 * {@code y} represent samples from the same distribution. 
198 * @throws InsufficientDataException if either {@code x} or {@code y} does 199 * not have length at least 2. 200 * @throws NullArgumentException if either {@code x} or {@code y} is null. 201 * @throws NotANumberException if the input arrays contain NaN values. 202 * 203 * @see #bootstrap(double[],double[],int,boolean,UniformRandomProvider) 204 */ 205 public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { 206 final long lengthProduct = (long) x.length * y.length; 207 double[] xa = null; 208 double[] ya = null; 209 if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) { 210 xa = Arrays.copyOf(x, x.length); 211 ya = Arrays.copyOf(y, y.length); 212 fixTies(xa, ya); 213 } else { 214 xa = x; 215 ya = y; 216 } 217 if (lengthProduct < LARGE_SAMPLE_PRODUCT) { 218 return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict); 219 } 220 return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); 221 } 222 223 /** 224 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 225 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 226 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 227 * probability distribution. Assumes the strict form of the inequality used to compute the 228 * p-value. See {@link #kolmogorovSmirnovTest(ContinuousDistribution, double[], boolean)}. 
229 * 230 * @param x first sample dataset 231 * @param y second sample dataset 232 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent 233 * samples from the same distribution 234 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at 235 * least 2 236 * @throws NullArgumentException if either {@code x} or {@code y} is null 237 */ 238 public double kolmogorovSmirnovTest(double[] x, double[] y) { 239 return kolmogorovSmirnovTest(x, y, true); 240 } 241 242 /** 243 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) 244 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the 245 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) 246 * is the empirical distribution of the {@code y} values. 247 * 248 * @param x first sample 249 * @param y second sample 250 * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and 251 * {@code y} represent samples from the same underlying distribution 252 * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at 253 * least 2 254 * @throws NullArgumentException if either {@code x} or {@code y} is null 255 */ 256 public double kolmogorovSmirnovStatistic(double[] x, double[] y) { 257 return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length)); 258 } 259 260 /** 261 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) 262 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the 263 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) 264 * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned 265 * as long value. 
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = Arrays.copyOf(x, x.length);
        final double[] sy = Arrays.copyOf(y, y.length);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        // rankX / rankY count how many elements of each sorted sample have been consumed,
        // i.e. are <= the current merge point z.
        int rankX = 0;
        int rankY = 0;
        // curD holds n*m*(F_n(z) - F_m(z)): +m for each x element passed, -n for each y element.
        long curD = 0L;

        // Find the max difference between cdf_x and cdf_y
        long supD = 0L;
        do {
            // z is the smallest unconsumed value from either sample; all copies of a tied
            // value are consumed together before the deviation is examined.
            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                rankX += 1;
                curD += m;
            }
            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                rankY += 1;
                curD -= n;
            }
            // Track the supremum of |curD| seen so far.
            if (curD > supD) {
                supD = curD;
            } else if (-curD > supD) {
                supD = -curD;
            }
        } while(rankX < n && rankY < m);
        return supD;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
315 * 316 * @param distribution reference distribution 317 * @param data sample being being evaluated 318 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 319 * {@code distribution} 320 * @throws InsufficientDataException if {@code data} does not have length at least 2 321 * @throws NullArgumentException if {@code data} is null 322 */ 323 public double kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data) { 324 return kolmogorovSmirnovTest(distribution, data, false); 325 } 326 327 /** 328 * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov 329 * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}. 330 * 331 * @param distribution reference distribution 332 * @param data sample being being evaluated 333 * @param alpha significance level of the test 334 * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution} 335 * can be rejected with confidence 1 - {@code alpha} 336 * @throws InsufficientDataException if {@code data} does not have length at least 2 337 * @throws NullArgumentException if {@code data} is null 338 */ 339 public boolean kolmogorovSmirnovTest(ContinuousDistribution distribution, double[] data, double alpha) { 340 if (alpha <= 0 || alpha > 0.5) { 341 throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5); 342 } 343 return kolmogorovSmirnovTest(distribution, data) < alpha; 344 } 345 346 /** 347 * Estimates the <i>p-value</i> of a two-sample 348 * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov test</a> 349 * evaluating the null hypothesis that {@code x} and {@code y} are samples 350 * drawn from the same probability distribution. 
351 * This method estimates the p-value by repeatedly sampling sets of size 352 * {@code x.length} and {@code y.length} from the empirical distribution 353 * of the combined sample. 354 * When {@code strict} is true, this is equivalent to the algorithm implemented 355 * in the R function {@code ks.boot}, described in <pre> 356 * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching 357 * Software with Automated Balance Optimization: The Matching package for R.' 358 * Journal of Statistical Software, 42(7): 1-52. 359 * </pre> 360 * 361 * @param x First sample. 362 * @param y Second sample. 363 * @param iterations Number of bootstrap resampling iterations. 364 * @param strict Whether or not the null hypothesis is expressed as a strict inequality. 365 * @param rng RNG for creating the sampling sets. 366 * @return the estimated p-value. 367 */ 368 public double bootstrap(double[] x, 369 double[] y, 370 int iterations, 371 boolean strict, 372 UniformRandomProvider rng) { 373 final int xLength = x.length; 374 final int yLength = y.length; 375 final double[] combined = new double[xLength + yLength]; 376 System.arraycopy(x, 0, combined, 0, xLength); 377 System.arraycopy(y, 0, combined, xLength, yLength); 378 final ContinuousDistribution.Sampler sampler = new EnumeratedRealDistribution(combined).createSampler(rng); 379 final long d = integralKolmogorovSmirnovStatistic(x, y); 380 int greaterCount = 0; 381 int equalCount = 0; 382 double[] curX; 383 double[] curY; 384 long curD; 385 for (int i = 0; i < iterations; i++) { 386 curX = AbstractRealDistribution.sample(xLength, sampler); 387 curY = AbstractRealDistribution.sample(yLength, sampler); 388 curD = integralKolmogorovSmirnovStatistic(curX, curY); 389 if (curD > d) { 390 greaterCount++; 391 } else if (curD == d) { 392 equalCount++; 393 } 394 } 395 return strict ? 
greaterCount / (double) iterations : 396 (greaterCount + equalCount) / (double) iterations; 397 } 398 399 /** 400 * Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme 401 * values given in [2] (see above). The result is not exact as with 402 * {@link #cdfExact(double, int)} because calculations are based on 403 * {@code double} rather than {@link BigFraction}. 404 * 405 * @param d statistic 406 * @param n sample size 407 * @return \(P(D_n < d)\) 408 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a 409 * {@link BigFraction} in expressing {@code d} as 410 * \((k - h) / m\) for integer {@code k, m} and \(0 \le h < 1\) 411 */ 412 public double cdf(double d, int n) { 413 return cdf(d, n, false); 414 } 415 416 /** 417 * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is 418 * used everywhere at the expense of very slow execution time. Almost never choose this in real 419 * applications unless you are very sure; this is almost solely for verification purposes. 420 * Normally, you would choose {@link #cdf(double, int)}. See the class 421 * javadoc for definitions and algorithm description. 422 * 423 * @param d statistic 424 * @param n sample size 425 * @return \(P(D_n < d)\) 426 * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a 427 * {@link BigFraction} in expressing {@code d} as 428 * \((k - h) / m\) for integer {@code k, m} and \(0 \le h < 1\) 429 */ 430 public double cdfExact(double d, int n) { 431 return cdf(d, n, true); 432 } 433 434 /** 435 * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme 436 * values given in [2] (see above). 
437 * 438 * @param d statistic 439 * @param n sample size 440 * @param exact whether the probability should be calculated exact using 441 * {@link BigFraction} everywhere at the expense of 442 * very slow execution time, or if {@code double} should be used convenient places to 443 * gain speed. Almost never choose {@code true} in real applications unless you are very 444 * sure; {@code true} is almost solely for verification purposes. 445 * @return \(P(D_n < d)\) 446 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a 447 * {@link BigFraction} in expressing {@code d} as 448 * \((k - h) / m\) for integer {@code k, m} and \(0 \le h < 1\). 449 */ 450 public double cdf(double d, int n, boolean exact) { 451 final double ninv = 1 / ((double) n); 452 final double ninvhalf = 0.5 * ninv; 453 454 if (d <= ninvhalf) { 455 return 0; 456 } else if (ninvhalf < d && d <= ninv) { 457 double res = 1; 458 final double f = 2 * d - ninv; 459 // n! f^n = n*f * (n-1)*f * ... * 1*x 460 for (int i = 1; i <= n; ++i) { 461 res *= i * f; 462 } 463 return res; 464 } else if (1 - ninv <= d && d < 1) { 465 return 1 - 2 * Math.pow(1 - d, n); 466 } else if (1 <= d) { 467 return 1; 468 } 469 if (exact) { 470 return exactK(d, n); 471 } 472 if (n <= 140) { 473 return roundedK(d, n); 474 } 475 return pelzGood(d, n); 476 } 477 478 /** 479 * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference 480 * in class javadoc above) and {@link BigFraction} (see above). 481 * 482 * @param d statistic 483 * @param n sample size 484 * @return the two-sided probability of \(P(D_n < d)\) 485 * @throws MathArithmeticException if algorithm fails to convert {@code h} to a 486 * {@link BigFraction}. 
487 */ 488 private double exactK(double d, int n) { 489 final int k = (int) Math.ceil(n * d); 490 491 final FieldDenseMatrix<BigFraction> h; 492 try { 493 h = createExactH(d, n); 494 } catch (ArithmeticException e) { 495 throw new MathArithmeticException(LocalizedFormats.FRACTION); 496 } 497 498 final FieldDenseMatrix<BigFraction> hPower = h.pow(n); 499 500 BigFraction pFrac = hPower.get(k - 1, k - 1); 501 502 for (int i = 1; i <= n; ++i) { 503 pFrac = pFrac.multiply(i).divide(n); 504 } 505 506 /* 507 * BigFraction.doubleValue converts numerator to double and the denominator to double and 508 * divides afterwards. That gives NaN quite easy. This does not (scale is the number of 509 * digits): 510 */ 511 return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue(); 512 } 513 514 /** 515 * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above). 516 * 517 * @param d statistic 518 * @param n sample size 519 * @return \(P(D_n < d)\) 520 */ 521 private double roundedK(double d, int n) { 522 523 final int k = (int) Math.ceil(n * d); 524 final RealMatrix h = this.createRoundedH(d, n); 525 final RealMatrix hPower = h.power(n); 526 527 double pFrac = hPower.getEntry(k - 1, k - 1); 528 for (int i = 1; i <= n; ++i) { 529 pFrac *= (double) i / (double) n; 530 } 531 532 return pFrac; 533 } 534 535 /** 536 * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc. 
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @since 3.4
     */
    public double pelzGood(double d, int n) {
        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = JdkMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;
        final double z4 = z2 * z2;
        final double z6 = z4 * z2;
        final double z8 = z4 * z4;

        // Eventual return value. The expansion is
        // K_0(z) + K_1(z)/sqrt(n) + K_2(z)/n + K_3(z)/n^(3/2); each K_i below is a partial
        // series truncated when the increment is small relative to the running sum
        // (PG_SUM_RELATIVE_ERROR) or aborted after MAXIMUM_PARTIAL_SUM_COUNT terms.
        double ret = 0;

        // Compute K_0(z)
        double sum = 0;
        double increment = 0;
        double kTerm = 0;
        double z2Term = PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = 2 * k - 1;
            increment = JdkMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        ret = sum * JdkMath.sqrt(2 * JdkMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        kTerm = 0;
        double kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (PI_SQUARED * kTerm2 - z2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = JdkMath.sqrt(JdkMath.PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = PI_SQUARED * PI_SQUARED;
        sum = 0;
        kTerm = 0;
        kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (z6Term + z4Term + PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double sum2 = 0;
        kTerm2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = (double) k * k;
            increment = PI_SQUARED * kTerm2 * JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * PI_SQUARED;
        sum = 0;
        double kTerm4 = 0;
        double kTerm6 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            kTerm4 = kTerm2 * kTerm2;
            kTerm6 = kTerm4 * kTerm2;
            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                    PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = (double) k * k;
            kTerm4 = kTerm2 * kTerm2;
            increment = (-pi4 * kTerm4 + 3 * PI_SQUARED * kTerm2 * z2) *
                    JdkMath.exp(-PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (JdkMath.abs(increment) < PG_SUM_RELATIVE_ERROR * JdkMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                sum2 / (108 * z6));
    }

    /**
     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws NumberIsTooLargeException if fractional part is greater than 1.
     * @throws ArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link BigFraction}.
     */
    private FieldDenseMatrix<BigFraction> createExactH(double d,
                                                       int n) {
        final int k = (int) Math.ceil(n * d);
        // H is m x m where m = 2k - 1.
        final int m = 2 * k - 1;
        // Fractional part h = k - nd; must lie in [0, 1).
        final double hDouble = k - n * d;
        if (hDouble >= 1) {
            throw new NumberIsTooLargeException(hDouble, 1.0, false);
        }
        // Convert h to an exact fraction, progressively relaxing the tolerance
        // if the continued-fraction expansion fails to converge.
        BigFraction h = null;
        try {
            h = BigFraction.from(hDouble, 1e-20, 10000);
        } catch (final ArithmeticException e1) {
            try {
                h = BigFraction.from(hDouble, 1e-10, 10000);
            } catch (final ArithmeticException e2) {
                h = BigFraction.from(hDouble, 1e-5, 10000);
            }
        }
        final FieldDenseMatrix<BigFraction> hData = FieldDenseMatrix.create(BigFractionField.get(), m, m);

        /*
         * Start by filling everything with either 0 or 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    hData.set(i, j, BigFraction.ZERO);
                } else {
                    hData.set(i, j, BigFraction.ONE);
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final BigFraction[] hPowers = new BigFraction[m];
        hPowers[0] = h;
        for (int i = 1; i < m; i++) {
            hPowers[i] = h.multiply(hPowers[i - 1]);
        }

        /*
         * First column and last row has special values (each other reversed).
         */
        for (int i = 0; i < m; ++i) {
            hData.set(i, 0,
                      hData.get(i, 0).subtract(hPowers[i]));
            hData.set(m - 1, i,
                      hData.get(m - 1, i).subtract(hPowers[m - i - 1]));
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
         */
        if (h.compareTo(ONE_HALF) > 0) {
            hData.set(m - 1, 0,
                      hData.get(m - 1, 0).add(h.multiply(2).subtract(1).pow(m)));
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
         * really necessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        hData.set(i, j,
                                  hData.get(i, j).divide(g));
                    }
                }
            }
        }
        return hData;
    }

    /**
     * Creates {@code H} of size {@code m x m} as described in [1] (see above)
     * using double-precision.
     *
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws NumberIsTooLargeException if the fractional part {@code h = k - nd}
     * is greater than or equal to 1
     */
    private RealMatrix createRoundedH(double d, int n)
        throws NumberIsTooLargeException {

        final int k = (int) Math.ceil(n * d);
        // H is m x m where m = 2k - 1.
        final int m = 2 * k - 1;
        // Fractional part h = k - nd; must lie in [0, 1).
        final double h = k - n * d;
        if (h >= 1) {
            throw new NumberIsTooLargeException(h, 1.0, false);
        }
        final double[][] hData = new double[m][m];

        /*
         * Start by filling everything with either 0 or 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    hData[i][j] = 0;
                } else {
                    hData[i][j] = 1;
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final double[] hPowers = new double[m];
        hPowers[0] = h;
        for (int i = 1; i < m; ++i) {
            hPowers[i] = h * hPowers[i - 1];
        }

        /*
         * First column and last row has special values (each other reversed).
         */
        for (int i = 0; i < m; ++i) {
            hData[i][0] = hData[i][0] - hPowers[i];
            hData[m - 1][i] -= hPowers[m - i - 1];
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
         */
        if (Double.compare(h, 0.5) > 0) {
            hData[m - 1][0] += JdkMath.pow(2 * h - 1, m);
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
         * really necessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        hData[i][j] /= g;
                    }
                }
            }
        }
        return MatrixUtils.createRealMatrix(hData);
    }

    /**
     * Verifies that {@code array} has length at least 2.
     *
     * @param array array to test
     * @throws NullArgumentException if array is null
     * @throws InsufficientDataException if array is too short
     */
    private void checkArray(double[] array) {
        if (array == null) {
            throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
        }
        if (array.length < 2) {
            throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
                                                2);
        }
    }

    /**
     * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
     * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
     * have been computed.
If the sum does not converge before {@code maxIterations} iterations a 868 * {@link TooManyIterationsException} is thrown. 869 * 870 * @param t argument 871 * @param tolerance Cauchy criterion for partial sums 872 * @param maxIterations maximum number of partial sums to compute 873 * @return Kolmogorov sum evaluated at t 874 * @throws TooManyIterationsException if the series does not converge 875 */ 876 public double ksSum(double t, double tolerance, int maxIterations) { 877 if (t == 0.0) { 878 return 0.0; 879 } 880 881 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1] 882 // from class javadoc should be used. 883 884 final double x = -2 * t * t; 885 int sign = -1; 886 long i = 1; 887 double partialSum = 0.5d; 888 double delta = 1; 889 while (delta > tolerance && i < maxIterations) { 890 delta = JdkMath.exp(x * i * i); 891 partialSum += sign * delta; 892 sign *= -1; 893 i++; 894 } 895 if (i == maxIterations) { 896 throw new TooManyIterationsException(maxIterations); 897 } 898 return partialSum * 2; 899 } 900 901 /** 902 * Given a d-statistic in the range [0, 1] and the two sample sizes n and m, 903 * an integral d-statistic in the range [0, n*m] is calculated, that can be used for 904 * comparison with other integral d-statistics. Depending whether {@code strict} is 905 * {@code true} or not, the returned value divided by (n*m) is greater than 906 * (resp greater than or equal to) the given d value (allowing some tolerance). 
907 * 908 * @param d a d-statistic in the range [0, 1] 909 * @param n first sample size 910 * @param m second sample size 911 * @param strict whether the returned value divided by (n*m) is allowed to be equal to d 912 * @return the integral d-statistic in the range [0, n*m] 913 */ 914 private static long calculateIntegralD(double d, int n, int m, boolean strict) { 915 final double tol = 1e-12; // d-values within tol of one another are considered equal 916 long nm = n * (long)m; 917 long upperBound = (long)JdkMath.ceil((d - tol) * nm); 918 long lowerBound = (long)JdkMath.floor((d + tol) * nm); 919 if (strict && lowerBound == upperBound) { 920 return upperBound + 1L; 921 } else { 922 return upperBound; 923 } 924 } 925 926 /** 927 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 928 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 929 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 930 * <p> 931 * The returned probability is exact, implemented by unwinding the recursive function 932 * definitions presented in [4] (class javadoc). 933 * </p> 934 * 935 * @param d D-statistic value 936 * @param n first sample size 937 * @param m second sample size 938 * @param strict whether or not the probability to compute is expressed as a strict inequality 939 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 940 * greater than (resp. greater than or equal to) {@code d} 941 */ 942 public double exactP(double d, int n, int m, boolean strict) { 943 return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) / 944 BinomialCoefficientDouble.value(n + m, m); 945 } 946 947 /** 948 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 949 * is the 2-sample Kolmogorov-Smirnov statistic. See 950 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 
951 * <p> 952 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 953 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for 954 * details on how convergence of the sum is determined. 955 * </p> 956 * 957 * @param d D-statistic value 958 * @param n first sample size 959 * @param m second sample size 960 * @return approximate probability that a randomly selected m-n partition of m + n generates 961 * \(D_{n,m}\) greater than {@code d} 962 */ 963 public double approximateP(double d, int n, int m) { 964 return 1 - ksSum(d * JdkMath.sqrt(((double) m * (double) n) / ((double) m + (double) n)), 965 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT); 966 } 967 968 /** 969 * Fills a boolean array randomly with a fixed number of {@code true} values. 970 * The method uses a simplified version of the Fisher-Yates shuffle algorithm. 971 * By processing first the {@code true} values followed by the remaining {@code false} values 972 * less random numbers need to be generated. The method is optimized for the case 973 * that the number of {@code true} values is larger than or equal to the number of 974 * {@code false} values. 975 * 976 * @param b boolean array 977 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have 978 * @param rng random data generator 979 */ 980 private static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final UniformRandomProvider rng) { 981 Arrays.fill(b, true); 982 for (int k = numberOfTrueValues; k < b.length; k++) { 983 final int r = rng.nextInt(k + 1); 984 b[(b[r]) ? r : k] = false; 985 } 986 } 987 988 /** 989 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 990 * 2-sample Kolmogorov-Smirnov statistic. See 991 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 
992 * <p> 993 * The simulation generates {@code iterations} random partitions of {@code m + n} into an 994 * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning 995 * the proportion of values that are greater than {@code d}, or greater than or equal to 996 * {@code d} if {@code strict} is {@code false}. 997 * </p> 998 * 999 * @param d D-statistic value. 1000 * @param n First sample size. 1001 * @param m Second sample size. 1002 * @param iterations Number of random partitions to generate. 1003 * @param strict whether or not the probability to compute is expressed as a strict inequality 1004 * @param rng RNG used for generating the partitions. 1005 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1006 * greater than (resp. greater than or equal to) {@code d}. 1007 */ 1008 public double monteCarloP(final double d, 1009 final int n, 1010 final int m, 1011 final boolean strict, 1012 final int iterations, 1013 UniformRandomProvider rng) { 1014 return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations, rng); 1015 } 1016 1017 /** 1018 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d / (n * m))\) 1019 * where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. 1020 * <p> 1021 * Here {@code d} is the D-statistic represented as long value. 1022 * The real D-statistic is obtained by dividing {@code d} by {@code n * m}. 1023 * See also {@link #monteCarloP(double,int,int,boolean,int,UniformRandomProvider)}. 1024 * 1025 * @param d Integral D-statistic. 1026 * @param n First sample size. 1027 * @param m Second sample size. 1028 * @param iterations Number of random partitions to generate. 1029 * @param rng RNG used for generating the partitions. 1030 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1031 * greater than or equal to {@code d / (n * m))}. 
     */
    private double integralMonteCarloP(final long d,
                                       final int n,
                                       final int m,
                                       final int iterations,
                                       UniformRandomProvider rng) {
        // ensure that nn is always the max of (n, m) to require fewer random numbers
        final int nn = JdkMath.max(n, m);
        final int mm = JdkMath.min(n, m);
        final int sum = nn + mm;

        int tail = 0;
        final boolean[] b = new boolean[sum];
        for (int i = 0; i < iterations; i++) {
            fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
            // Walk the random partition: each true entry steps +mm, each false
            // entry steps -nn, so curD tracks (n * m) times the signed difference
            // of the two empirical CDFs. Reaching |curD| >= d means the scaled
            // D-statistic of this partition is at least d.
            long curD = 0L;
            for (int j = 0; j < b.length; ++j) {
                if (b[j]) {
                    curD += mm;
                    if (curD >= d) {
                        tail++;
                        break;
                    }
                } else {
                    curD -= nn;
                    if (curD <= -d) {
                        tail++;
                        break;
                    }
                }
            }
        }
        return (double) tail / iterations;
    }

    /**
     * If there are no ties in the combined dataset formed from x and y,
     * this method is a no-op.
     * If there are ties, random jitter is added to each value in x and y,
     * and this method overwrites the data in x and y with the jittered values.
     *
     * @param x First sample.
     * @param y Second sample.
     * @throws NotANumberException if any of the input arrays contain
     * a NaN value.
     * @throws MaxCountExceededException if ties are still present after the
     * bounded number of jitter passes.
     */
    private static void fixTies(double[] x, double[] y) {
        if (hasTies(x, y)) {
            // Add jitter using a fixed seed (so same arguments always give same results),
            // low-initialization-overhead generator.
            final UniformRandomProvider rng = RandomSource.SPLIT_MIX_64.create(876543217L);

            // It is theoretically possible that jitter does not break ties, so repeat
            // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
            int fixedRangeTrials = 10;
            int jitterRange = 10;
            int ct = 0;
            boolean ties = true;
            do {
                // Nudge the data using a fixed range.
                jitter(x, rng, jitterRange);
                jitter(y, rng, jitterRange);
                ties = hasTies(x, y);
                ++ct;
            } while (ties && ct < fixedRangeTrials);

            if (ties) {
                throw new MaxCountExceededException(fixedRangeTrials);
            }
        }
    }

    /**
     * Returns true iff there are ties in the combined sample formed from
     * x and y.
     *
     * @param x First sample.
     * @param y Second sample.
     * @return true if x and y together contain ties.
     * @throws InsufficientDataException if the input arrays only contain infinite values.
     * @throws NotANumberException if any of the input arrays contain a NaN value.
     */
    private static boolean hasTies(double[] x, double[] y) {
        final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));

        // "unique" moves NaN to the head of the output array.
        if (Double.isNaN(values[0])) {
            throw new NotANumberException();
        }

        int infCount = 0;
        for (double v : values) {
            if (Double.isInfinite(v)) {
                ++infCount;
            }
        }
        if (infCount == values.length) {
            throw new InsufficientDataException();
        }

        // If deduplication removed any element, the combined sample contained ties.
        return values.length != x.length + y.length;
    }

    /**
     * Adds random jitter to {@code data} using deviates sampled from {@code rng}.
     * <p>
     * Note that jitter is applied in-place - i.e., the array
     * values are overwritten with the result of applying jitter.</p>
     *
     * @param data input/output data array - entries overwritten by the method
     * @param rng source of random deviates used to generate the jitter
     * @param ulp maximum jitter magnitude, expressed in ulps of the value being jittered
     * @throws NullPointerException if {@code data} or {@code rng} is null
     */
    private static void jitter(double[] data,
                               UniformRandomProvider rng,
                               int ulp) {
        final int range = ulp * 2;
        for (int i = 0; i < data.length; i++) {
            final double orig = data[i];
            if (Double.isInfinite(orig)) {
                continue; // Avoid NaN creation.
            }

            // Uniform integer in [-ulp, ulp - 1], scaled by the ulp of the value.
            final int rand = rng.nextInt(range) - ulp;
            final double ulps = Math.ulp(orig);
            final double r = rand * ulps;

            data[i] += r;
        }
    }

    /**
     * The function C(i, j) defined in [4] (class javadoc), formula (5.5),
     * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
     * and recoded as a long to avoid rounding errors in comparison tests, so what
     * is actually tested is |im - jn| <= cmn.
     *
     * @param i first path parameter
     * @param j second path parameter
     * @param m first sample size
     * @param n second sample size
     * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return C(i,j) for given m, n, c
     */
    private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
        if (strict) {
            return JdkMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
        }
        return JdkMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
    }

    /**
     * The function N(i, j) defined in [4] (class javadoc).
     * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
     * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
     * The return value is integral, but subject to overflow, so it is maintained and
     * returned as a double.
     *
     * @param i first path parameter
     * @param j second path parameter
     * @param m first sample size
     * @param n second sample size
     * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return number of paths to (i, j) from (0,0) representing D-values as large as c for given m, n
     */
    private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
        /*
         * Unwind the recursive definition given in [4].
         * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
         * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
         */
        final double[] lag = new double[n];
        double last = 0;
        // Row 0: seed lag with n(0, k + 1) = C(0, k + 1).
        for (int k = 0; k < n; k++) {
            lag[k] = c(0, k + 1, m, n, cnm, strict);
        }
        for (int k = 1; k <= i; k++) {
            // Column 0 of row k: n(k, 0) = C(k, 0).
            last = c(k, 0, m, n, cnm, strict);
            for (int l = 1; l <= j; l++) {
                // Recurrence: n(k, l) = C(k, l) * (n(k, l - 1) + n(k - 1, l)).
                lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
                last = lag[l - 1];
            }
        }
        return last;
    }
}