Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.distribution;
019
020import java.io.Serializable;
021import java.math.BigDecimal;
022
023import org.apache.commons.math3.exception.MathArithmeticException;
024import org.apache.commons.math3.exception.NotStrictlyPositiveException;
025import org.apache.commons.math3.exception.NumberIsTooLargeException;
026import org.apache.commons.math3.exception.util.LocalizedFormats;
027import org.apache.commons.math3.fraction.BigFraction;
028import org.apache.commons.math3.fraction.BigFractionField;
029import org.apache.commons.math3.fraction.FractionConversionException;
030import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
031import org.apache.commons.math3.linear.Array2DRowRealMatrix;
032import org.apache.commons.math3.linear.FieldMatrix;
033import org.apache.commons.math3.linear.RealMatrix;
034import org.apache.commons.math3.util.FastMath;
035
036/**
037 * Implementation of the Kolmogorov-Smirnov distribution.
038 *
039 * <p>
040 * Treats the distribution of the two-sided {@code P(D_n < d)} where
041 * {@code D_n = sup_x |G(x) - G_n (x)|} for the theoretical cdf {@code G} and
042 * the empirical cdf {@code G_n}.
043 * </p>
044 * <p>
045 * This implementation is based on [1] with certain quick decisions for extreme
046 * values given in [2].
047 * </p>
048 * <p>
049 * In short, when wanting to evaluate {@code P(D_n < d)}, the method in [1] is
050 * to write {@code d = (k - h) / n} for positive integer {@code k} and
051 * {@code 0 <= h < 1}. Then {@code P(D_n < d) = (n! / n^n) * t_kk}, where
052 * {@code t_kk} is the {@code (k, k)}'th entry in the special matrix
053 * {@code H^n}, i.e. {@code H} to the {@code n}'th power.
054 * </p>
055 * <p>
056 * References:
057 * <ul>
058 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/">
059 * Evaluating Kolmogorov's Distribution</a> by George Marsaglia, Wai
060 * Wan Tsang, and Jingbo Wang</li>
061 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/">
062 * Computing the Two-Sided Kolmogorov-Smirnov Distribution</a> by Richard Simard
063 * and Pierre L'Ecuyer</li>
064 * </ul>
065 * Note that [1] contains an error in computing h, refer to
066 * <a href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
067 * </p>
068 *
069 * @see <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
070 * Kolmogorov-Smirnov test (Wikipedia)</a>
071 * @deprecated to be removed in version 4.0 -
072 *  use {@link org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest}
073 */
074public class KolmogorovSmirnovDistribution implements Serializable {
075
076    /** Serializable version identifier. */
077    private static final long serialVersionUID = -4670676796862967187L;
078
079    /** Number of observations. */
080    private int n;
081
082    /**
083     * @param n Number of observations
084     * @throws NotStrictlyPositiveException if {@code n <= 0}
085     */
086    public KolmogorovSmirnovDistribution(int n)
087        throws NotStrictlyPositiveException {
088        if (n <= 0) {
089            throw new NotStrictlyPositiveException(LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n);
090        }
091
092        this.n = n;
093    }
094
095    /**
096     * Calculates {@code P(D_n < d)} using method described in [1] with quick
097     * decisions for extreme values given in [2] (see above). The result is not
098     * exact as with
099     * {@link KolmogorovSmirnovDistribution#cdfExact(double)} because
100     * calculations are based on {@code double} rather than
101     * {@link org.apache.commons.math3.fraction.BigFraction}.
102     *
103     * @param d statistic
104     * @return the two-sided probability of {@code P(D_n < d)}
105     * @throws MathArithmeticException if algorithm fails to convert {@code h}
106     * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
107     * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
108     * {@code 0 <= h < 1}.
109     */
110    public double cdf(double d) throws MathArithmeticException {
111        return this.cdf(d, false);
112    }
113
114    /**
115     * Calculates {@code P(D_n < d)} using method described in [1] with quick
116     * decisions for extreme values given in [2] (see above). The result is
117     * exact in the sense that BigFraction/BigReal is used everywhere at the
118     * expense of very slow execution time. Almost never choose this in real
119     * applications unless you are very sure; this is almost solely for
120     * verification purposes. Normally, you would choose
121     * {@link KolmogorovSmirnovDistribution#cdf(double)}
122     *
123     * @param d statistic
124     * @return the two-sided probability of {@code P(D_n < d)}
125     * @throws MathArithmeticException if algorithm fails to convert {@code h}
126     * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
127     * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
128     * {@code 0 <= h < 1}.
129     */
130    public double cdfExact(double d) throws MathArithmeticException {
131        return this.cdf(d, true);
132    }
133
134    /**
135     * Calculates {@code P(D_n < d)} using method described in [1] with quick
136     * decisions for extreme values given in [2] (see above).
137     *
138     * @param d statistic
139     * @param exact whether the probability should be calculated exact using
140     * {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the
141     * expense of very slow execution time, or if {@code double} should be used
142     * convenient places to gain speed. Almost never choose {@code true} in real
143     * applications unless you are very sure; {@code true} is almost solely for
144     * verification purposes.
145     * @return the two-sided probability of {@code P(D_n < d)}
146     * @throws MathArithmeticException if algorithm fails to convert {@code h}
147     * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
148     * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
149     * {@code 0 <= h < 1}.
150     */
151    public double cdf(double d, boolean exact) throws MathArithmeticException {
152
153        final double ninv = 1 / ((double) n);
154        final double ninvhalf = 0.5 * ninv;
155
156        if (d <= ninvhalf) {
157
158            return 0;
159
160        } else if (ninvhalf < d && d <= ninv) {
161
162            double res = 1;
163            double f = 2 * d - ninv;
164
165            // n! f^n = n*f * (n-1)*f * ... * 1*x
166            for (int i = 1; i <= n; ++i) {
167                res *= i * f;
168            }
169
170            return res;
171
172        } else if (1 - ninv <= d && d < 1) {
173
174            return 1 - 2 * FastMath.pow(1 - d, n);
175
176        } else if (1 <= d) {
177
178            return 1;
179        }
180
181        return exact ? exactK(d) : roundedK(d);
182    }
183
184    /**
185     * Calculates the exact value of {@code P(D_n < d)} using method described
186     * in [1] and {@link org.apache.commons.math3.fraction.BigFraction} (see
187     * above).
188     *
189     * @param d statistic
190     * @return the two-sided probability of {@code P(D_n < d)}
191     * @throws MathArithmeticException if algorithm fails to convert {@code h}
192     * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
193     * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
194     * {@code 0 <= h < 1}.
195     */
196    private double exactK(double d) throws MathArithmeticException {
197
198        final int k = (int) FastMath.ceil(n * d);
199
200        final FieldMatrix<BigFraction> H = this.createH(d);
201        final FieldMatrix<BigFraction> Hpower = H.power(n);
202
203        BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
204
205        for (int i = 1; i <= n; ++i) {
206            pFrac = pFrac.multiply(i).divide(n);
207        }
208
209        /*
210         * BigFraction.doubleValue converts numerator to double and the
211         * denominator to double and divides afterwards. That gives NaN quite
212         * easy. This does not (scale is the number of digits):
213         */
214        return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
215    }
216
217    /**
218     * Calculates {@code P(D_n < d)} using method described in [1] and doubles
219     * (see above).
220     *
221     * @param d statistic
222     * @return the two-sided probability of {@code P(D_n < d)}
223     * @throws MathArithmeticException if algorithm fails to convert {@code h}
224     * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
225     * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
226     * {@code 0 <= h < 1}.
227     */
228    private double roundedK(double d) throws MathArithmeticException {
229
230        final int k = (int) FastMath.ceil(n * d);
231        final FieldMatrix<BigFraction> HBigFraction = this.createH(d);
232        final int m = HBigFraction.getRowDimension();
233
234        /*
235         * Here the rounding part comes into play: use
236         * RealMatrix instead of FieldMatrix<BigFraction>
237         */
238        final RealMatrix H = new Array2DRowRealMatrix(m, m);
239
240        for (int i = 0; i < m; ++i) {
241            for (int j = 0; j < m; ++j) {
242                H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue());
243            }
244        }
245
246        final RealMatrix Hpower = H.power(n);
247
248        double pFrac = Hpower.getEntry(k - 1, k - 1);
249
250        for (int i = 1; i <= n; ++i) {
251            pFrac *= (double) i / (double) n;
252        }
253
254        return pFrac;
255    }
256
257    /***
258     * Creates {@code H} of size {@code m x m} as described in [1] (see above).
259     *
260     * @param d statistic
261     * @return H matrix
262     * @throws NumberIsTooLargeException if fractional part is greater than 1
263     * @throws FractionConversionException if algorithm fails to convert
264     * {@code h} to a {@link org.apache.commons.math3.fraction.BigFraction} in
265     * expressing {@code d} as {@code (k - h) / m} for integer {@code k, m} and
266     * {@code 0 <= h < 1}.
267     */
268    private FieldMatrix<BigFraction> createH(double d)
269            throws NumberIsTooLargeException, FractionConversionException {
270
271        int k = (int) FastMath.ceil(n * d);
272
273        int m = 2 * k - 1;
274        double hDouble = k - n * d;
275
276        if (hDouble >= 1) {
277            throw new NumberIsTooLargeException(hDouble, 1.0, false);
278        }
279
280        BigFraction h = null;
281
282        try {
283            h = new BigFraction(hDouble, 1.0e-20, 10000);
284        } catch (FractionConversionException e1) {
285            try {
286                h = new BigFraction(hDouble, 1.0e-10, 10000);
287            } catch (FractionConversionException e2) {
288                h = new BigFraction(hDouble, 1.0e-5, 10000);
289            }
290        }
291
292        final BigFraction[][] Hdata = new BigFraction[m][m];
293
294        /*
295         * Start by filling everything with either 0 or 1.
296         */
297        for (int i = 0; i < m; ++i) {
298            for (int j = 0; j < m; ++j) {
299                if (i - j + 1 < 0) {
300                    Hdata[i][j] = BigFraction.ZERO;
301                } else {
302                    Hdata[i][j] = BigFraction.ONE;
303                }
304            }
305        }
306
307        /*
308         * Setting up power-array to avoid calculating the same value twice:
309         * hPowers[0] = h^1 ... hPowers[m-1] = h^m
310         */
311        final BigFraction[] hPowers = new BigFraction[m];
312        hPowers[0] = h;
313        for (int i = 1; i < m; ++i) {
314            hPowers[i] = h.multiply(hPowers[i - 1]);
315        }
316
317        /*
318         * First column and last row has special values (each other reversed).
319         */
320        for (int i = 0; i < m; ++i) {
321            Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
322            Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
323        }
324
325        /*
326         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix
327         * should be (1 - 2*h^m + (2h - 1)^m )/m!" Since 0 <= h < 1, then if h >
328         * 1/2 is sufficient to check:
329         */
330        if (h.compareTo(BigFraction.ONE_HALF) == 1) {
331            Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
332        }
333
334        /*
335         * Aside from the first column and last row, the (i, j)-th element is
336         * 1/(i - j + 1)! if i - j + 1 >= 0, else 0. 1's and 0's are already
337         * put, so only division with (i - j + 1)! is needed in the elements
338         * that have 1's. There is no need to calculate (i - j + 1)! and then
339         * divide - small steps avoid overflows.
340         *
341         * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to
342         * m. Also note that it is started at g = 2 because dividing by 1 isn't
343         * really necessary.
344         */
345        for (int i = 0; i < m; ++i) {
346            for (int j = 0; j < i + 1; ++j) {
347                if (i - j + 1 > 0) {
348                    for (int g = 2; g <= i - j + 1; ++g) {
349                        Hdata[i][j] = Hdata[i][j].divide(g);
350                    }
351                }
352            }
353        }
354
355        return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
356    }
357}