001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math3.distribution; 019 020import java.io.Serializable; 021import java.math.BigDecimal; 022 023import org.apache.commons.math3.exception.MathArithmeticException; 024import org.apache.commons.math3.exception.NotStrictlyPositiveException; 025import org.apache.commons.math3.exception.NumberIsTooLargeException; 026import org.apache.commons.math3.exception.util.LocalizedFormats; 027import org.apache.commons.math3.fraction.BigFraction; 028import org.apache.commons.math3.fraction.BigFractionField; 029import org.apache.commons.math3.fraction.FractionConversionException; 030import org.apache.commons.math3.linear.Array2DRowFieldMatrix; 031import org.apache.commons.math3.linear.Array2DRowRealMatrix; 032import org.apache.commons.math3.linear.FieldMatrix; 033import org.apache.commons.math3.linear.RealMatrix; 034import org.apache.commons.math3.util.FastMath; 035 036/** 037 * Implementation of the Kolmogorov-Smirnov distribution. 038 * 039 * <p> 040 * Treats the distribution of the two-sided {@code P(D_n < d)} where 041 * {@code D_n = sup_x |G(x) - G_n (x)|} for the theoretical cdf {@code G} and 042 * the empirical cdf {@code G_n}. 043 * </p> 044 * <p> 045 * This implementation is based on [1] with certain quick decisions for extreme 046 * values given in [2]. 047 * </p> 048 * <p> 049 * In short, when wanting to evaluate {@code P(D_n < d)}, the method in [1] is 050 * to write {@code d = (k - h) / n} for positive integer {@code k} and 051 * {@code 0 <= h < 1}. Then {@code P(D_n < d) = (n! / n^n) * t_kk}, where 052 * {@code t_kk} is the {@code (k, k)}'th entry in the special matrix 053 * {@code H^n}, i.e. {@code H} to the {@code n}'th power. 054 * </p> 055 * <p> 056 * References: 057 * <ul> 058 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> 059 * Evaluating Kolmogorov's Distribution</a> by George Marsaglia, Wai 060 * Wan Tsang, and Jingbo Wang</li> 061 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> 062 * Computing the Two-Sided Kolmogorov-Smirnov Distribution</a> by Richard Simard 063 * and Pierre L'Ecuyer</li> 064 * </ul> 065 * Note that [1] contains an error in computing h, refer to 066 * <a href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details. 067 * </p> 068 * 069 * @see <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> 070 * Kolmogorov-Smirnov test (Wikipedia)</a> 071 * @deprecated to be removed in version 4.0 - 072 * use {@link org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest} 073 */ 074public class KolmogorovSmirnovDistribution implements Serializable { 075 076 /** Serializable version identifier. */ 077 private static final long serialVersionUID = -4670676796862967187L; 078 079 /** Number of observations. */ 080 private int n; 081 082 /** 083 * @param n Number of observations 084 * @throws NotStrictlyPositiveException if {@code n <= 0} 085 */ 086 public KolmogorovSmirnovDistribution(int n) 087 throws NotStrictlyPositiveException { 088 if (n <= 0) { 089 throw new NotStrictlyPositiveException(LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n); 090 } 091 092 this.n = n; 093 } 094 095 /** 096 * Calculates {@code P(D_n < d)} using method described in [1] with quick 097 * decisions for extreme values given in [2] (see above). The result is not 098 * exact as with 099 * {@link KolmogorovSmirnovDistribution#cdfExact(double)} because 100 * calculations are based on {@code double} rather than 101 * {@link org.apache.commons.math3.fraction.BigFraction}. 102 * 103 * @param d statistic 104 * @return the two-sided probability of {@code P(D_n < d)} 105 * @throws MathArithmeticException if algorithm fails to convert {@code h} 106 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing 107 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and 108 * {@code 0 <= h < 1}. 109 */ 110 public double cdf(double d) throws MathArithmeticException { 111 return this.cdf(d, false); 112 } 113 114 /** 115 * Calculates {@code P(D_n < d)} using method described in [1] with quick 116 * decisions for extreme values given in [2] (see above). The result is 117 * exact in the sense that BigFraction/BigReal is used everywhere at the 118 * expense of very slow execution time. Almost never choose this in real 119 * applications unless you are very sure; this is almost solely for 120 * verification purposes. Normally, you would choose 121 * {@link KolmogorovSmirnovDistribution#cdf(double)} 122 * 123 * @param d statistic 124 * @return the two-sided probability of {@code P(D_n < d)} 125 * @throws MathArithmeticException if algorithm fails to convert {@code h} 126 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing 127 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and 128 * {@code 0 <= h < 1}. 129 */ 130 public double cdfExact(double d) throws MathArithmeticException { 131 return this.cdf(d, true); 132 } 133 134 /** 135 * Calculates {@code P(D_n < d)} using method described in [1] with quick 136 * decisions for extreme values given in [2] (see above). 137 * 138 * @param d statistic 139 * @param exact whether the probability should be calculated exact using 140 * {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the 141 * expense of very slow execution time, or if {@code double} should be used 142 * convenient places to gain speed. Almost never choose {@code true} in real 143 * applications unless you are very sure; {@code true} is almost solely for 144 * verification purposes. 145 * @return the two-sided probability of {@code P(D_n < d)} 146 * @throws MathArithmeticException if algorithm fails to convert {@code h} 147 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing 148 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and 149 * {@code 0 <= h < 1}. 150 */ 151 public double cdf(double d, boolean exact) throws MathArithmeticException { 152 153 final double ninv = 1 / ((double) n); 154 final double ninvhalf = 0.5 * ninv; 155 156 if (d <= ninvhalf) { 157 158 return 0; 159 160 } else if (ninvhalf < d && d <= ninv) { 161 162 double res = 1; 163 double f = 2 * d - ninv; 164 165 // n! f^n = n*f * (n-1)*f * ... * 1*x 166 for (int i = 1; i <= n; ++i) { 167 res *= i * f; 168 } 169 170 return res; 171 172 } else if (1 - ninv <= d && d < 1) { 173 174 return 1 - 2 * FastMath.pow(1 - d, n); 175 176 } else if (1 <= d) { 177 178 return 1; 179 } 180 181 return exact ? exactK(d) : roundedK(d); 182 } 183 184 /** 185 * Calculates the exact value of {@code P(D_n < d)} using method described 186 * in [1] and {@link org.apache.commons.math3.fraction.BigFraction} (see 187 * above). 188 * 189 * @param d statistic 190 * @return the two-sided probability of {@code P(D_n < d)} 191 * @throws MathArithmeticException if algorithm fails to convert {@code h} 192 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing 193 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and 194 * {@code 0 <= h < 1}. 195 */ 196 private double exactK(double d) throws MathArithmeticException { 197 198 final int k = (int) FastMath.ceil(n * d); 199 200 final FieldMatrix<BigFraction> H = this.createH(d); 201 final FieldMatrix<BigFraction> Hpower = H.power(n); 202 203 BigFraction pFrac = Hpower.getEntry(k - 1, k - 1); 204 205 for (int i = 1; i <= n; ++i) { 206 pFrac = pFrac.multiply(i).divide(n); 207 } 208 209 /* 210 * BigFraction.doubleValue converts numerator to double and the 211 * denominator to double and divides afterwards. That gives NaN quite 212 * easy. This does not (scale is the number of digits): 213 */ 214 return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue(); 215 } 216 217 /** 218 * Calculates {@code P(D_n < d)} using method described in [1] and doubles 219 * (see above). 220 * 221 * @param d statistic 222 * @return the two-sided probability of {@code P(D_n < d)} 223 * @throws MathArithmeticException if algorithm fails to convert {@code h} 224 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing 225 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and 226 * {@code 0 <= h < 1}. 227 */ 228 private double roundedK(double d) throws MathArithmeticException { 229 230 final int k = (int) FastMath.ceil(n * d); 231 final FieldMatrix<BigFraction> HBigFraction = this.createH(d); 232 final int m = HBigFraction.getRowDimension(); 233 234 /* 235 * Here the rounding part comes into play: use 236 * RealMatrix instead of FieldMatrix<BigFraction> 237 */ 238 final RealMatrix H = new Array2DRowRealMatrix(m, m); 239 240 for (int i = 0; i < m; ++i) { 241 for (int j = 0; j < m; ++j) { 242 H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue()); 243 } 244 } 245 246 final RealMatrix Hpower = H.power(n); 247 248 double pFrac = Hpower.getEntry(k - 1, k - 1); 249 250 for (int i = 1; i <= n; ++i) { 251 pFrac *= (double) i / (double) n; 252 } 253 254 return pFrac; 255 } 256 257 /*** 258 * Creates {@code H} of size {@code m x m} as described in [1] (see above). 259 * 260 * @param d statistic 261 * @return H matrix 262 * @throws NumberIsTooLargeException if fractional part is greater than 1 263 * @throws FractionConversionException if algorithm fails to convert 264 * {@code h} to a {@link org.apache.commons.math3.fraction.BigFraction} in 265 * expressing {@code d} as {@code (k - h) / m} for integer {@code k, m} and 266 * {@code 0 <= h < 1}. 267 */ 268 private FieldMatrix<BigFraction> createH(double d) 269 throws NumberIsTooLargeException, FractionConversionException { 270 271 int k = (int) FastMath.ceil(n * d); 272 273 int m = 2 * k - 1; 274 double hDouble = k - n * d; 275 276 if (hDouble >= 1) { 277 throw new NumberIsTooLargeException(hDouble, 1.0, false); 278 } 279 280 BigFraction h = null; 281 282 try { 283 h = new BigFraction(hDouble, 1.0e-20, 10000); 284 } catch (FractionConversionException e1) { 285 try { 286 h = new BigFraction(hDouble, 1.0e-10, 10000); 287 } catch (FractionConversionException e2) { 288 h = new BigFraction(hDouble, 1.0e-5, 10000); 289 } 290 } 291 292 final BigFraction[][] Hdata = new BigFraction[m][m]; 293 294 /* 295 * Start by filling everything with either 0 or 1. 296 */ 297 for (int i = 0; i < m; ++i) { 298 for (int j = 0; j < m; ++j) { 299 if (i - j + 1 < 0) { 300 Hdata[i][j] = BigFraction.ZERO; 301 } else { 302 Hdata[i][j] = BigFraction.ONE; 303 } 304 } 305 } 306 307 /* 308 * Setting up power-array to avoid calculating the same value twice: 309 * hPowers[0] = h^1 ... hPowers[m-1] = h^m 310 */ 311 final BigFraction[] hPowers = new BigFraction[m]; 312 hPowers[0] = h; 313 for (int i = 1; i < m; ++i) { 314 hPowers[i] = h.multiply(hPowers[i - 1]); 315 } 316 317 /* 318 * First column and last row has special values (each other reversed). 319 */ 320 for (int i = 0; i < m; ++i) { 321 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); 322 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); 323 } 324 325 /* 326 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix 327 * should be (1 - 2*h^m + (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 328 * 1/2 is sufficient to check: 329 */ 330 if (h.compareTo(BigFraction.ONE_HALF) == 1) { 331 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); 332 } 333 334 /* 335 * Aside from the first column and last row, the (i, j)-th element is 336 * 1/(i - j + 1)! if i - j + 1 >= 0, else 0. 1's and 0's are already 337 * put, so only division with (i - j + 1)! is needed in the elements 338 * that have 1's. There is no need to calculate (i - j + 1)! and then 339 * divide - small steps avoid overflows. 340 * 341 * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to 342 * m. Also note that it is started at g = 2 because dividing by 1 isn't 343 * really necessary. 344 */ 345 for (int i = 0; i < m; ++i) { 346 for (int j = 0; j < i + 1; ++j) { 347 if (i - j + 1 > 0) { 348 for (int g = 2; g <= i - j + 1; ++g) { 349 Hdata[i][j] = Hdata[i][j].divide(g); 350 } 351 } 352 } 353 } 354 355 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); 356 } 357}