/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.stat.inference;

import java.math.BigDecimal;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.commons.math3.distribution.EnumeratedRealDistribution;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.distribution.UniformRealDistribution;
import org.apache.commons.math3.exception.InsufficientDataException;
import org.apache.commons.math3.exception.MathArithmeticException;
import org.apache.commons.math3.exception.MathInternalError;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.NumberIsTooLargeException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.TooManyIterationsException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.fraction.BigFraction;
import org.apache.commons.math3.fraction.BigFractionField;
import org.apache.commons.math3.fraction.FractionConversionException;
import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
import org.apache.commons.math3.linear.FieldMatrix;
import org.apache.commons.math3.linear.MatrixUtils;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.random.JDKRandomGenerator;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.MathArrays;
import org.apache.commons.math3.util.MathUtils;

/**
 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
 * <p>
 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
 * given in [2].
 * </p>
 * <p>
 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
 * follows:
 * <ul>
 * <li>For small samples (where the product of the sample sizes is less than
 * {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
 * exact p-value for the 2-sample test.</li>
 * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
 * the approximation.</li>
 * </ul></p><p>
 * If the product of the sample sizes is less than {@value #LARGE_SAMPLE_PRODUCT} and the sample
 * data contains ties, random jitter is added to the sample data to break ties before applying
 * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)}
 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
 * in the R Matching package [3], can be used if ties are known to be present in the data.
 * </p>
 * <p>
 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
 * {@code strict} parameter. This parameter is ignored for large samples.
 * </p>
 * <p>
 * The methods used by the 2-sample default implementation are also exposed directly:
 * <ul>
 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution. The {@code boolean}
 * arguments in the first two methods allow the probability used to estimate the p-value to be
 * expressed using strict or non-strict inequality. See
 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
 * </ul>
 * </p>
 * <p>
 * References:
 * <ul>
 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
 * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
 * Chapter 5, 3rd Ed. Academic Press.</li>
 * </ul>
 * <br/>
 * Note that [1] contains an error in computing h, refer to <a
 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
 * </p>
 *
 * @since 3.3
 */
public class KolmogorovSmirnovTest {

    /**
     * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
     */
    protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;

    /** Convergence criterion for {@link #ksSum(double, double, int)} */
    protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;

    /** Convergence criterion for the sums in {@link #pelzGood(double, int)} */
    protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;

    /** No longer used. */
    @Deprecated
    protected static final int SMALL_SAMPLE_PRODUCT = 200;

    /**
     * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
     * distribution to compute the p-value.
     */
    protected static final int LARGE_SAMPLE_PRODUCT = 10000;

    /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}.
     *  Deprecated as of version 3.6, as this method is no longer needed. */
    @Deprecated
    protected static final int MONTE_CARLO_ITERATIONS = 1000000;

    /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
    private final RandomGenerator rng;

    /**
     * Construct a KolmogorovSmirnovTest instance with a default random data generator.
     */
    public KolmogorovSmirnovTest() {
        rng = new Well19937c();
    }

    /**
     * Construct a KolmogorovSmirnovTest with the provided random data generator.
     * The {@link #monteCarloP(double, int, int, boolean, int)} method that uses the generator
     * supplied to this constructor is deprecated as of version 3.6.
     *
     * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
     */
    @Deprecated
    public KolmogorovSmirnovTest(RandomGenerator rng) {
        this.rng = rng;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
     * {@code exact} is true, the distribution used to compute the p-value is computed using
     * extended precision. See {@link #cdfExact(double, int)}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param exact whether or not to force exact computation of the p-value
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
        // p-value is the complement of the c.d.f. of D_n evaluated at the observed statistic
        return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
    }

    /**
     * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
     * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
     * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
     * each of the values in {@code data}.
194 * 195 * @param distribution reference distribution 196 * @param data sample being evaluated 197 * @return Kolmogorov-Smirnov statistic \(D_n\) 198 * @throws InsufficientDataException if {@code data} does not have length at least 2 199 * @throws NullArgumentException if {@code data} is null 200 */ 201 public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) { 202 checkArray(data); 203 final int n = data.length; 204 final double nd = n; 205 final double[] dataCopy = new double[n]; 206 System.arraycopy(data, 0, dataCopy, 0, n); 207 Arrays.sort(dataCopy); 208 double d = 0d; 209 for (int i = 1; i <= n; i++) { 210 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); 211 final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi); 212 if (currD > d) { 213 d = currD; 214 } 215 } 216 return d; 217 } 218 219 /** 220 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 221 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 222 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 223 * probability distribution. Specifically, what is returned is an estimate of the probability 224 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly 225 * selected partition of the combined sample into subsamples of sizes {@code x.length} and 226 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as 227 * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}. 228 * <ul> 229 * <li>For small samples (where the product of the sample sizes is less than 230 * {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented 231 * in [4], implemented in {@link #exactP(double, int, int, boolean)}. 
     * </li>
     * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
     * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
     * for details on the approximation.</li>
     * </ul><p>
     * If {@code x.length * y.length} &lt; {@value #LARGE_SAMPLE_PRODUCT} and the combined set of values in
     * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
     * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
     * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
     * values in the combined sample.</p>
     * <p>
     * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
     * may be used as an alternative method for estimating the p-value.</p>
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @param strict whether or not the probability to compute is expressed as a strict inequality
     *        (ignored for large samples)
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     * @see #bootstrap(double[], double[], int, boolean)
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
        final long lengthProduct = (long) x.length * y.length;
        double[] xa = null;
        double[] ya = null;
        // Only the small-sample (exact) path is sensitive to ties, so jitter is applied
        // to copies of the data only in that case; otherwise the originals are used as-is.
        if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
            xa = MathArrays.copyOf(x);
            ya = MathArrays.copyOf(y);
            fixTies(xa, ya);
        } else {
            xa = x;
            ya = y;
        }
        if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
            // Exact p-value from the (possibly jittered) samples
            return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
        }
        // Large samples: asymptotic distribution; strict is ignored here
        return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. Assumes the strict form of the inequality used to compute the
     * p-value. See {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
     *
     * @param x first sample dataset
     * @param y second sample dataset
     * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
     *         samples from the same distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovTest(double[] x, double[] y) {
        return kolmogorovSmirnovTest(x, y, true);
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values.
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
        // The integral statistic is n*m*D_{n,m}; divide by n*m (as long, to avoid int overflow).
        return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
    }

    /**
     * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
     * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
     * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
     * as long value.
     *
     * @param x first sample
     * @param y second sample
     * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
     *         {@code y} represent samples from the same underlying distribution
     * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
     *         least 2
     * @throws NullArgumentException if either {@code x} or {@code y} is null
     */
    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
        checkArray(x);
        checkArray(y);
        // Copy and sort the sample arrays
        final double[] sx = MathArrays.copyOf(x);
        final double[] sy = MathArrays.copyOf(y);
        Arrays.sort(sx);
        Arrays.sort(sy);
        final int n = sx.length;
        final int m = sy.length;

        // rankX/rankY count how many values of each sorted sample are <= the current point z;
        // curD tracks n*m*(F_n(z) - F_m(z)): each consumed x adds m, each consumed y subtracts n.
        int rankX = 0;
        int rankY = 0;
        long curD = 0l;

        // Find the max difference between cdf_x and cdf_y
        long supD = 0l;
        do {
            // Advance through the next distinct value in the merged order
            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                rankX += 1;
                curD += m;
            }
            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                rankY += 1;
                curD -= n;
            }
            // Track the supremum of |curD|
            if (curD > supD) {
                supD = curD;
            }
            else if (-curD > supD) {
                supD = -curD;
            }
        } while(rankX < n && rankY < m);
        return supD;
    }

    /**
     * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
     * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @return the p-value associated with the null hypothesis that {@code data} is a sample from
     *         {@code distribution}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
        // Default to the fast, double-precision c.d.f. computation
        return kolmogorovSmirnovTest(distribution, data, false);
    }

    /**
     * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
     * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
     *
     * @param distribution reference distribution
     * @param data sample being evaluated
     * @param alpha significance level of the test
     * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
     *         can be rejected with confidence 1 - {@code alpha}
     * @throws InsufficientDataException if {@code data} does not have length at least 2
     * @throws NullArgumentException if {@code data} is null
     */
    public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
        }
        return kolmogorovSmirnovTest(distribution, data) < alpha;
    }

    /**
     * Estimates the <i>p-value</i> of a two-sample
     * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
     * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
     * probability distribution. This method estimates the p-value by repeatedly sampling sets of size
     * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
     * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
     * {@code ks.boot}, described in <pre>
     * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
     * Software with Automated Balance Optimization: The Matching package for R.'
     * Journal of Statistical Software, 42(7): 1-52.
     * </pre>
     * @param x first sample
     * @param y second sample
     * @param iterations number of bootstrap resampling iterations
     * @param strict whether or not the null hypothesis is expressed as a strict inequality
     * @return estimated p-value
     */
    public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
        final int xLength = x.length;
        final int yLength = y.length;
        // Pool both samples; resampling from the pooled empirical distribution
        // realizes the null hypothesis of a common underlying distribution.
        final double[] combined = new double[xLength + yLength];
        System.arraycopy(x, 0, combined, 0, xLength);
        System.arraycopy(y, 0, combined, xLength, yLength);
        final EnumeratedRealDistribution dist = new EnumeratedRealDistribution(rng, combined);
        // Observed integral statistic n*m*D_{n,m}; exact long comparison avoids float ties
        final long d = integralKolmogorovSmirnovStatistic(x, y);
        int greaterCount = 0;
        int equalCount = 0;
        double[] curX;
        double[] curY;
        long curD;
        for (int i = 0; i < iterations; i++) {
            curX = dist.sample(xLength);
            curY = dist.sample(yLength);
            curD = integralKolmogorovSmirnovStatistic(curX, curY);
            if (curD > d) {
                greaterCount++;
            } else if (curD == d) {
                equalCount++;
            }
        }
        // strict: proportion of resampled statistics strictly greater than the observed one;
        // non-strict additionally counts exact ties with the observed statistic
        return strict ? greaterCount / (double) iterations :
            (greaterCount + equalCount) / (double) iterations;
    }

    /**
     * Computes {@code bootstrap(x, y, iterations, true)}.
     * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
     * package function. See {@link #bootstrap(double[], double[], int, boolean)}.
     *
     * @param x first sample
     * @param y second sample
     * @param iterations number of bootstrap resampling iterations
     * @return estimated p-value
     */
    public double bootstrap(double[] x, double[] y, int iterations) {
        // strict = true matches the ks.boot default
        return bootstrap(x, y, iterations, true);
    }

    /**
     * Calculates \(P(D_n &lt; d)\) using the method described in [1] with quick decisions for extreme
     * values given in [2] (see above). The result is not exact as with
     * {@link #cdfExact(double, int)} because calculations are based on
     * {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
     */
    public double cdf(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, false);
    }

    /**
     * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
     * used everywhere at the expense of very slow execution time. Almost never choose this in real
     * applications unless you are very sure; this is almost solely for verification purposes.
     * Normally, you would choose {@link #cdf(double, int)}. See the class
     * javadoc for definitions and algorithm description.
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\)
     */
    public double cdfExact(double d, int n)
        throws MathArithmeticException {
        return cdf(d, n, true);
    }

    /**
     * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
     * values given in [2] (see above).
     *
     * @param d statistic
     * @param n sample size
     * @param exact whether the probability should be calculated exact using
     *        {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
     *        very slow execution time, or if {@code double} should be used convenient places to
     *        gain speed. Almost never choose {@code true} in real applications unless you are very
     *        sure; {@code true} is almost solely for verification purposes.
     * @return \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
     */
    public double cdf(double d, int n, boolean exact)
        throws MathArithmeticException {

        final double ninv = 1 / ((double) n);
        final double ninvhalf = 0.5 * ninv;

        if (d <= ninvhalf) {
            // d cannot be smaller than 1/(2n): probability is 0
            return 0;
        } else if (ninvhalf < d && d <= ninv) {
            // Extreme left region: closed form n! * (2d - 1/n)^n, computed incrementally
            double res = 1;
            final double f = 2 * d - ninv;
            // n! f^n = n*f * (n-1)*f * ... * 1*f
            for (int i = 1; i <= n; ++i) {
                res *= i * f;
            }
            return res;
        } else if (1 - ninv <= d && d < 1) {
            // Extreme right tail: closed form from [2]
            return 1 - 2 * Math.pow(1 - d, n);
        } else if (1 <= d) {
            return 1;
        }
        if (exact) {
            return exactK(d, n);
        }
        if (n <= 140) {
            return roundedK(d, n);
        }
        // For large n, use the Pelz-Good asymptotic approximation
        return pelzGood(d, n);
    }

    /**
     * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
     * in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
     * above).
     *
     * @param d statistic
     * @param n sample size
     * @return the two-sided probability of \(P(D_n &lt; d)\)
     * @throws MathArithmeticException if algorithm fails to convert {@code h} to a
     *         {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
     *         - h) / m\) for integer {@code k, m} and \(0 \le h &lt; 1\).
     */
    private double exactK(double d, int n)
        throws MathArithmeticException {

        final int k = (int) Math.ceil(n * d);

        final FieldMatrix<BigFraction> H = this.createExactH(d, n);
        final FieldMatrix<BigFraction> Hpower = H.power(n);

        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);

        // Multiply by n!/n^n one factor at a time to keep the fraction manageable
        for (int i = 1; i <= n; ++i) {
            pFrac = pFrac.multiply(i).divide(n);
        }

        /*
         * BigFraction.doubleValue converts numerator to double and the denominator to double and
         * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
         * digits):
         */
        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
    }

    /**
     * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     */
    private double roundedK(double d, int n) {

        final int k = (int) Math.ceil(n * d);
        final RealMatrix H = this.createRoundedH(d, n);
        final RealMatrix Hpower = H.power(n);

        double pFrac = Hpower.getEntry(k - 1, k - 1);
        // Multiply by n!/n^n one factor at a time to limit overflow/underflow
        for (int i = 1; i <= n; ++i) {
            pFrac *= (double) i / (double) n;
        }

        return pFrac;
    }

    /**
     * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n &lt; d)\)
     * @since 3.4
     */
    public double pelzGood(double d, int n) {
        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = FastMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;
        final double z4 = z2 * z2;
        final double z6 = z4 * z2;
        final double z8 = z4 * z4;

        // Eventual return value
        double ret = 0;

        // Compute K_0(z)
        double sum = 0;
        double increment = 0;
        double kTerm = 0;
        double z2Term = MathUtils.PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = 2 * k - 1;
            increment = FastMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            // Terms are positive and decreasing; stop when relatively negligible
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        kTerm = 0;
        double kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
        sum = 0;
        kTerm = 0;
        kTerm2 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            increment = (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double sum2 = 0;
        kTerm2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * MathUtils.PI_SQUARED;
        sum = 0;
        double kTerm4 = 0;
        double kTerm6 = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm = k + 0.5;
            kTerm2 = kTerm * kTerm;
            kTerm4 = kTerm2 * kTerm2;
            kTerm6 = kTerm4 * kTerm2;
            increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                    MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            kTerm2 = k * k;
            kTerm4 = kTerm2 * kTerm2;
            increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
        }
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                + sum2 / (108 * z6));

    }

    /***
     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
738 * 739 * @param d statistic 740 * @param n sample size 741 * @return H matrix 742 * @throws NumberIsTooLargeException if fractional part is greater than 1 743 * @throws FractionConversionException if algorithm fails to convert {@code h} to a 744 * {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k 745 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\). 746 */ 747 private FieldMatrix<BigFraction> createExactH(double d, int n) 748 throws NumberIsTooLargeException, FractionConversionException { 749 750 final int k = (int) Math.ceil(n * d); 751 final int m = 2 * k - 1; 752 final double hDouble = k - n * d; 753 if (hDouble >= 1) { 754 throw new NumberIsTooLargeException(hDouble, 1.0, false); 755 } 756 BigFraction h = null; 757 try { 758 h = new BigFraction(hDouble, 1.0e-20, 10000); 759 } catch (final FractionConversionException e1) { 760 try { 761 h = new BigFraction(hDouble, 1.0e-10, 10000); 762 } catch (final FractionConversionException e2) { 763 h = new BigFraction(hDouble, 1.0e-5, 10000); 764 } 765 } 766 final BigFraction[][] Hdata = new BigFraction[m][m]; 767 768 /* 769 * Start by filling everything with either 0 or 1. 770 */ 771 for (int i = 0; i < m; ++i) { 772 for (int j = 0; j < m; ++j) { 773 if (i - j + 1 < 0) { 774 Hdata[i][j] = BigFraction.ZERO; 775 } else { 776 Hdata[i][j] = BigFraction.ONE; 777 } 778 } 779 } 780 781 /* 782 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 783 * hPowers[m-1] = h^m 784 */ 785 final BigFraction[] hPowers = new BigFraction[m]; 786 hPowers[0] = h; 787 for (int i = 1; i < m; ++i) { 788 hPowers[i] = h.multiply(hPowers[i - 1]); 789 } 790 791 /* 792 * First column and last row has special values (each other reversed). 
793 */ 794 for (int i = 0; i < m; ++i) { 795 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); 796 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); 797 } 798 799 /* 800 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 801 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 802 */ 803 if (h.compareTo(BigFraction.ONE_HALF) == 1) { 804 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); 805 } 806 807 /* 808 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 809 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 810 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 811 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 812 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 813 * really necessary. 814 */ 815 for (int i = 0; i < m; ++i) { 816 for (int j = 0; j < i + 1; ++j) { 817 if (i - j + 1 > 0) { 818 for (int g = 2; g <= i - j + 1; ++g) { 819 Hdata[i][j] = Hdata[i][j].divide(g); 820 } 821 } 822 } 823 } 824 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); 825 } 826 827 /*** 828 * Creates {@code H} of size {@code m x m} as described in [1] (see above) 829 * using double-precision. 830 * 831 * @param d statistic 832 * @param n sample size 833 * @return H matrix 834 * @throws NumberIsTooLargeException if fractional part is greater than 1 835 */ 836 private RealMatrix createRoundedH(double d, int n) 837 throws NumberIsTooLargeException { 838 839 final int k = (int) Math.ceil(n * d); 840 final int m = 2 * k - 1; 841 final double h = k - n * d; 842 if (h >= 1) { 843 throw new NumberIsTooLargeException(h, 1.0, false); 844 } 845 final double[][] Hdata = new double[m][m]; 846 847 /* 848 * Start by filling everything with either 0 or 1. 
849 */ 850 for (int i = 0; i < m; ++i) { 851 for (int j = 0; j < m; ++j) { 852 if (i - j + 1 < 0) { 853 Hdata[i][j] = 0; 854 } else { 855 Hdata[i][j] = 1; 856 } 857 } 858 } 859 860 /* 861 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 862 * hPowers[m-1] = h^m 863 */ 864 final double[] hPowers = new double[m]; 865 hPowers[0] = h; 866 for (int i = 1; i < m; ++i) { 867 hPowers[i] = h * hPowers[i - 1]; 868 } 869 870 /* 871 * First column and last row has special values (each other reversed). 872 */ 873 for (int i = 0; i < m; ++i) { 874 Hdata[i][0] = Hdata[i][0] - hPowers[i]; 875 Hdata[m - 1][i] -= hPowers[m - i - 1]; 876 } 877 878 /* 879 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 880 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 881 */ 882 if (Double.compare(h, 0.5) > 0) { 883 Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m); 884 } 885 886 /* 887 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 888 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 889 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 890 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 891 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 892 * really necessary. 893 */ 894 for (int i = 0; i < m; ++i) { 895 for (int j = 0; j < i + 1; ++j) { 896 if (i - j + 1 > 0) { 897 for (int g = 2; g <= i - j + 1; ++g) { 898 Hdata[i][j] /= g; 899 } 900 } 901 } 902 } 903 return MatrixUtils.createRealMatrix(Hdata); 904 } 905 906 /** 907 * Verifies that {@code array} has length at least 2. 
908 * 909 * @param array array to test 910 * @throws NullArgumentException if array is null 911 * @throws InsufficientDataException if array is too short 912 */ 913 private void checkArray(double[] array) { 914 if (array == null) { 915 throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED); 916 } 917 if (array.length < 2) { 918 throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length, 919 2); 920 } 921 } 922 923 /** 924 * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial 925 * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums 926 * have been computed. If the sum does not converge before {@code maxIterations} iterations a 927 * {@link TooManyIterationsException} is thrown. 928 * 929 * @param t argument 930 * @param tolerance Cauchy criterion for partial sums 931 * @param maxIterations maximum number of partial sums to compute 932 * @return Kolmogorov sum evaluated at t 933 * @throws TooManyIterationsException if the series does not converge 934 */ 935 public double ksSum(double t, double tolerance, int maxIterations) { 936 if (t == 0.0) { 937 return 0.0; 938 } 939 940 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1] 941 // from class javadoc should be used. 942 943 final double x = -2 * t * t; 944 int sign = -1; 945 long i = 1; 946 double partialSum = 0.5d; 947 double delta = 1; 948 while (delta > tolerance && i < maxIterations) { 949 delta = FastMath.exp(x * i * i); 950 partialSum += sign * delta; 951 sign *= -1; 952 i++; 953 } 954 if (i == maxIterations) { 955 throw new TooManyIterationsException(maxIterations); 956 } 957 return partialSum * 2; 958 } 959 960 /** 961 * Given a d-statistic in the range [0, 1] and the two sample sizes n and m, 962 * an integral d-statistic in the range [0, n*m] is calculated, that can be used for 963 * comparison with other integral d-statistics. 
Depending whether {@code strict} is 964 * {@code true} or not, the returned value divided by (n*m) is greater than 965 * (resp greater than or equal to) the given d value (allowing some tolerance). 966 * 967 * @param d a d-statistic in the range [0, 1] 968 * @param n first sample size 969 * @param m second sample size 970 * @param strict whether the returned value divided by (n*m) is allowed to be equal to d 971 * @return the integral d-statistic in the range [0, n*m] 972 */ 973 private static long calculateIntegralD(double d, int n, int m, boolean strict) { 974 final double tol = 1e-12; // d-values within tol of one another are considered equal 975 long nm = n * (long)m; 976 long upperBound = (long)FastMath.ceil((d - tol) * nm); 977 long lowerBound = (long)FastMath.floor((d + tol) * nm); 978 if (strict && lowerBound == upperBound) { 979 return upperBound + 1l; 980 } 981 else { 982 return upperBound; 983 } 984 } 985 986 /** 987 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 988 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 989 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 990 * <p> 991 * The returned probability is exact, implemented by unwinding the recursive function 992 * definitions presented in [4] (class javadoc). 993 * </p> 994 * 995 * @param d D-statistic value 996 * @param n first sample size 997 * @param m second sample size 998 * @param strict whether or not the probability to compute is expressed as a strict inequality 999 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 1000 * greater than (resp. 
greater than or equal to) {@code d} 1001 */ 1002 public double exactP(double d, int n, int m, boolean strict) { 1003 return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) / 1004 CombinatoricsUtils.binomialCoefficientDouble(n + m, m); 1005 } 1006 1007 /** 1008 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 1009 * is the 2-sample Kolmogorov-Smirnov statistic. See 1010 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1011 * <p> 1012 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 1013 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for 1014 * details on how convergence of the sum is determined. This implementation passes {@code ksSum} 1015 * {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and 1016 * {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}. 1017 * </p> 1018 * 1019 * @param d D-statistic value 1020 * @param n first sample size 1021 * @param m second sample size 1022 * @return approximate probability that a randomly selected m-n partition of m + n generates 1023 * \(D_{n,m}\) greater than {@code d} 1024 */ 1025 public double approximateP(double d, int n, int m) { 1026 final double dm = m; 1027 final double dn = n; 1028 return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), 1029 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT); 1030 } 1031 1032 /** 1033 * Fills a boolean array randomly with a fixed number of {@code true} values. 1034 * The method uses a simplified version of the Fisher-Yates shuffle algorithm. 1035 * By processing first the {@code true} values followed by the remaining {@code false} values 1036 * less random numbers need to be generated. The method is optimized for the case 1037 * that the number of {@code true} values is larger than or equal to the number of 1038 * {@code false} values. 
1039 * 1040 * @param b boolean array 1041 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have 1042 * @param rng random data generator 1043 */ 1044 static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) { 1045 Arrays.fill(b, true); 1046 for (int k = numberOfTrueValues; k < b.length; k++) { 1047 final int r = rng.nextInt(k + 1); 1048 b[(b[r]) ? r : k] = false; 1049 } 1050 } 1051 1052 /** 1053 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 1054 * 2-sample Kolmogorov-Smirnov statistic. See 1055 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1056 * <p> 1057 * The simulation generates {@code iterations} random partitions of {@code m + n} into an 1058 * {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning 1059 * the proportion of values that are greater than {@code d}, or greater than or equal to 1060 * {@code d} if {@code strict} is {@code false}. 1061 * </p> 1062 * 1063 * @param d D-statistic value 1064 * @param n first sample size 1065 * @param m second sample size 1066 * @param iterations number of random partitions to generate 1067 * @param strict whether or not the probability to compute is expressed as a strict inequality 1068 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1069 * greater than (resp. greater than or equal to) {@code d} 1070 */ 1071 public double monteCarloP(final double d, final int n, final int m, final boolean strict, 1072 final int iterations) { 1073 return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations); 1074 } 1075 1076 /** 1077 * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the 1078 * 2-sample Kolmogorov-Smirnov statistic. 
1079 * <p> 1080 * Here d is the D-statistic represented as long value. 1081 * The real D-statistic is obtained by dividing d by n*m. 1082 * See also {@link #monteCarloP(double, int, int, boolean, int)}. 1083 * 1084 * @param d integral D-statistic 1085 * @param n first sample size 1086 * @param m second sample size 1087 * @param iterations number of random partitions to generate 1088 * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) 1089 * greater than or equal to {@code d/(n*m))} 1090 */ 1091 private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) { 1092 1093 // ensure that nn is always the max of (n, m) to require fewer random numbers 1094 final int nn = FastMath.max(n, m); 1095 final int mm = FastMath.min(n, m); 1096 final int sum = nn + mm; 1097 1098 int tail = 0; 1099 final boolean b[] = new boolean[sum]; 1100 for (int i = 0; i < iterations; i++) { 1101 fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng); 1102 long curD = 0l; 1103 for(int j = 0; j < b.length; ++j) { 1104 if (b[j]) { 1105 curD += mm; 1106 if (curD >= d) { 1107 tail++; 1108 break; 1109 } 1110 } else { 1111 curD -= nn; 1112 if (curD <= -d) { 1113 tail++; 1114 break; 1115 } 1116 } 1117 } 1118 } 1119 return (double) tail / iterations; 1120 } 1121 1122 /** 1123 * If there are no ties in the combined dataset formed from x and y, this 1124 * method is a no-op. If there are ties, a uniform random deviate in 1125 * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where 1126 * minDelta is the minimum difference between unequal values in the combined 1127 * sample. A fixed seed is used to generate the jitter, so repeated activations 1128 * with the same input arrays result in the same values. 1129 * 1130 * NOTE: if there are ties in the data, this method overwrites the data in 1131 * x and y with the jittered values. 
     *
     * @param x first sample
     * @param y second sample
     */
    private static void fixTies(double[] x, double[] y) {
        // unique() removes duplicates; if nothing was removed, the combined sample has no ties.
        final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
        if (values.length == x.length + y.length) {
            return; // There are no ties
        }

        // Find the smallest difference between values, or 1 if all values are the same
        // NOTE(review): prev - values[i] is only non-negative if MathArrays.unique
        // returns its result in descending order - TODO confirm against its contract.
        double minDelta = 1;
        double prev = values[0];
        double delta = 1;
        for (int i = 1; i < values.length; i++) {
            delta = prev - values[i];
            if (delta < minDelta) {
                minDelta = delta;
            }
            prev = values[i];
        }
        // Halve so the jitter interval (-minDelta, minDelta) below stays strictly
        // inside half the smallest gap, per the method javadoc.
        minDelta /= 2;

        // Add jitter using a fixed seed (so same arguments always give same results),
        // low-initialization-overhead generator
        final RealDistribution dist =
                new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);

        // It is theoretically possible that jitter does not break ties, so repeat
        // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
        int ct = 0;
        boolean ties = true;
        do {
            jitter(x, dist);
            jitter(y, dist);
            ties = hasTies(x, y);
            ct++;
        } while (ties && ct < 1000);
        if (ties) {
            throw new MathInternalError(); // Should never happen
        }
    }

    /**
     * Returns true iff there are ties in the combined sample
     * formed from x and y.
     *
     * @param x first sample
     * @param y second sample
     * @return true if x and y together contain ties
     */
    private static boolean hasTies(double[] x, double[] y) {
        // HashSet.add returns false when the value is already present, i.e. a tie.
        final HashSet<Double> values = new HashSet<Double>();
        for (int i = 0; i < x.length; i++) {
            if (!values.add(x[i])) {
                return true;
            }
        }
        for (int i = 0; i < y.length; i++) {
            if (!values.add(y[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds random jitter to {@code data} using deviates sampled from {@code dist}.
     * <p>
     * Note that jitter is applied in-place - i.e., the array
     * values are overwritten with the result of applying jitter.</p>
     *
     * @param data input/output data array - entries overwritten by the method
     * @param dist probability distribution to sample for jitter values
     * @throws NullPointerException if either of the parameters is null
     */
    private static void jitter(double[] data, RealDistribution dist) {
        for (int i = 0; i < data.length; i++) {
            data[i] += dist.sample();
        }
    }

    /**
     * The function C(i, j) defined in [4] (class javadoc), formula (5.5).
     * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
     * and recoded as a long to avoid rounding errors in comparison tests, so what
     * is actually tested is |im - jn| <= cmn.
     *
     * @param i first path parameter
     * @param j second path paramter
     * @param m first sample size
     * @param n second sample size
     * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return C(i,j) for given m, n, c
     */
    private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
        // The (long) casts widen the products so i*n - j*m cannot overflow int.
        if (strict) {
            return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
        }
        return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
    }

    /**
     * The function N(i, j) defined in [4] (class javadoc).
     * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
     * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
     * The return value is integral, but subject to overflow, so it is maintained and
     * returned as a double.
     *
     * @param i first path parameter
     * @param j second path parameter
     * @param m first sample size
     * @param n second sample size
     * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
     * @param strict whether or not the null hypothesis uses strict inequality
     * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
     */
    private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
        /*
         * Unwind the recursive definition given in [4].
         * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
         * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
         */
        final double[] lag = new double[n];
        double last = 0;
        // Seed row 0: paths that move only along the j axis from (0,0).
        for (int k = 0; k < n; k++) {
            lag[k] = c(0, k + 1, m, n, cnm, strict);
        }
        for (int k = 1; k <= i; k++) {
            last = c(k, 0, m, n, cnm, strict);
            for (int l = 1; l <= j; l++) {
                // A path reaches (k, l) from (k-1, l) or (k, l-1); the multiplier
                // c(k, l, ...) zeroes out cells that violate the D-value bound.
                lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
                last = lag[l - 1];
            }
        }
        return last;
    }
}