001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math3.distribution;
019    
020    import java.io.Serializable;
021    import java.math.BigDecimal;
022    
023    import org.apache.commons.math3.exception.MathArithmeticException;
024    import org.apache.commons.math3.exception.NotStrictlyPositiveException;
025    import org.apache.commons.math3.exception.NumberIsTooLargeException;
026    import org.apache.commons.math3.exception.util.LocalizedFormats;
027    import org.apache.commons.math3.fraction.BigFraction;
028    import org.apache.commons.math3.fraction.BigFractionField;
029    import org.apache.commons.math3.fraction.FractionConversionException;
030    import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
031    import org.apache.commons.math3.linear.Array2DRowRealMatrix;
032    import org.apache.commons.math3.linear.FieldMatrix;
033    import org.apache.commons.math3.linear.RealMatrix;
034    
035    /**
036     * Implementation of the Kolmogorov-Smirnov distribution.
037     *
038     * <p>
039     * Treats the distribution of the two-sided {@code P(D_n < d)} where
040     * {@code D_n = sup_x |G(x) - G_n (x)|} for the theoretical cdf {@code G} and
041     * the empirical cdf {@code G_n}.
042     * </p>
043     * <p>
044     * This implementation is based on [1] with certain quick decisions for extreme
045     * values given in [2].
046     * </p>
047     * <p>
048     * In short, when wanting to evaluate {@code P(D_n < d)}, the method in [1] is
049     * to write {@code d = (k - h) / n} for positive integer {@code k} and
050     * {@code 0 <= h < 1}. Then {@code P(D_n < d) = (n! / n^n) * t_kk}, where
051     * {@code t_kk} is the {@code (k, k)}'th entry in the special matrix
052     * {@code H^n}, i.e. {@code H} to the {@code n}'th power.
053     * </p>
054     * <p>
055     * References:
056     * <ul>
057     * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/">
058     * Evaluating Kolmogorov's Distribution</a> by George Marsaglia, Wai
059     * Wan Tsang, and Jingbo Wang</li>
060     * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/">
061     * Computing the Two-Sided Kolmogorov-Smirnov Distribution</a> by Richard Simard
062     * and Pierre L'Ecuyer</li>
063     * </ul>
064     * Note that [1] contains an error in computing h, refer to
065     * <a href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
066     * </p>
067     *
068     * @see <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
069     * Kolmogorov-Smirnov test (Wikipedia)</a>
070     * @version $Id: KolmogorovSmirnovDistribution.java 1416643 2012-12-03 19:37:14Z tn $
071     */
072    public class KolmogorovSmirnovDistribution implements Serializable {
073    
074        /** Serializable version identifier. */
075        private static final long serialVersionUID = -4670676796862967187L;
076    
077        /** Number of observations. */
078        private int n;
079    
080        /**
081         * @param n Number of observations
082         * @throws NotStrictlyPositiveException if {@code n <= 0}
083         */
084        public KolmogorovSmirnovDistribution(int n)
085            throws NotStrictlyPositiveException {
086            if (n <= 0) {
087                throw new NotStrictlyPositiveException(LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n);
088            }
089    
090            this.n = n;
091        }
092    
093        /**
094         * Calculates {@code P(D_n < d)} using method described in [1] with quick
095         * decisions for extreme values given in [2] (see above). The result is not
096         * exact as with
097         * {@link KolmogorovSmirnovDistribution#cdfExact(double)} because
098         * calculations are based on {@code double} rather than
099         * {@link org.apache.commons.math3.fraction.BigFraction}.
100         *
101         * @param d statistic
102         * @return the two-sided probability of {@code P(D_n < d)}
103         * @throws MathArithmeticException if algorithm fails to convert {@code h}
104         * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
105         * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
106         * {@code 0 <= h < 1}.
107         */
108        public double cdf(double d) throws MathArithmeticException {
109            return this.cdf(d, false);
110        }
111    
112        /**
113         * Calculates {@code P(D_n < d)} using method described in [1] with quick
114         * decisions for extreme values given in [2] (see above). The result is
115         * exact in the sense that BigFraction/BigReal is used everywhere at the
116         * expense of very slow execution time. Almost never choose this in real
117         * applications unless you are very sure; this is almost solely for
118         * verification purposes. Normally, you would choose
119         * {@link KolmogorovSmirnovDistribution#cdf(double)}
120         *
121         * @param d statistic
122         * @return the two-sided probability of {@code P(D_n < d)}
123         * @throws MathArithmeticException if algorithm fails to convert {@code h}
124         * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
125         * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
126         * {@code 0 <= h < 1}.
127         */
128        public double cdfExact(double d) throws MathArithmeticException {
129            return this.cdf(d, true);
130        }
131    
132        /**
133         * Calculates {@code P(D_n < d)} using method described in [1] with quick
134         * decisions for extreme values given in [2] (see above).
135         *
136         * @param d statistic
137         * @param exact whether the probability should be calculated exact using
138         * {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the
139         * expense of very slow execution time, or if {@code double} should be used
140         * convenient places to gain speed. Almost never choose {@code true} in real
141         * applications unless you are very sure; {@code true} is almost solely for
142         * verification purposes.
143         * @return the two-sided probability of {@code P(D_n < d)}
144         * @throws MathArithmeticException if algorithm fails to convert {@code h}
145         * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
146         * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
147         * {@code 0 <= h < 1}.
148         */
149        public double cdf(double d, boolean exact) throws MathArithmeticException {
150    
151            final double ninv = 1 / ((double) n);
152            final double ninvhalf = 0.5 * ninv;
153    
154            if (d <= ninvhalf) {
155    
156                return 0;
157    
158            } else if (ninvhalf < d && d <= ninv) {
159    
160                double res = 1;
161                double f = 2 * d - ninv;
162    
163                // n! f^n = n*f * (n-1)*f * ... * 1*x
164                for (int i = 1; i <= n; ++i) {
165                    res *= i * f;
166                }
167    
168                return res;
169    
170            } else if (1 - ninv <= d && d < 1) {
171    
172                return 1 - 2 * Math.pow(1 - d, n);
173    
174            } else if (1 <= d) {
175    
176                return 1;
177            }
178    
179            return exact ? exactK(d) : roundedK(d);
180        }
181    
182        /**
183         * Calculates the exact value of {@code P(D_n < d)} using method described
184         * in [1] and {@link org.apache.commons.math3.fraction.BigFraction} (see
185         * above).
186         *
187         * @param d statistic
188         * @return the two-sided probability of {@code P(D_n < d)}
189         * @throws MathArithmeticException if algorithm fails to convert {@code h}
190         * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
191         * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
192         * {@code 0 <= h < 1}.
193         */
194        private double exactK(double d) throws MathArithmeticException {
195    
196            final int k = (int) Math.ceil(n * d);
197    
198            final FieldMatrix<BigFraction> H = this.createH(d);
199            final FieldMatrix<BigFraction> Hpower = H.power(n);
200    
201            BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
202    
203            for (int i = 1; i <= n; ++i) {
204                pFrac = pFrac.multiply(i).divide(n);
205            }
206    
207            /*
208             * BigFraction.doubleValue converts numerator to double and the
209             * denominator to double and divides afterwards. That gives NaN quite
210             * easy. This does not (scale is the number of digits):
211             */
212            return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
213        }
214    
215        /**
216         * Calculates {@code P(D_n < d)} using method described in [1] and doubles
217         * (see above).
218         *
219         * @param d statistic
220         * @return the two-sided probability of {@code P(D_n < d)}
221         * @throws MathArithmeticException if algorithm fails to convert {@code h}
222         * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
223         * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
224         * {@code 0 <= h < 1}.
225         */
226        private double roundedK(double d) throws MathArithmeticException {
227    
228            final int k = (int) Math.ceil(n * d);
229            final FieldMatrix<BigFraction> HBigFraction = this.createH(d);
230            final int m = HBigFraction.getRowDimension();
231    
232            /*
233             * Here the rounding part comes into play: use
234             * RealMatrix instead of FieldMatrix<BigFraction>
235             */
236            final RealMatrix H = new Array2DRowRealMatrix(m, m);
237    
238            for (int i = 0; i < m; ++i) {
239                for (int j = 0; j < m; ++j) {
240                    H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue());
241                }
242            }
243    
244            final RealMatrix Hpower = H.power(n);
245    
246            double pFrac = Hpower.getEntry(k - 1, k - 1);
247    
248            for (int i = 1; i <= n; ++i) {
249                pFrac *= (double) i / (double) n;
250            }
251    
252            return pFrac;
253        }
254    
255        /***
256         * Creates {@code H} of size {@code m x m} as described in [1] (see above).
257         *
258         * @param d statistic
259         * @return H matrix
260         * @throws NumberIsTooLargeException if fractional part is greater than 1
261         * @throws FractionConversionException if algorithm fails to convert
262         * {@code h} to a {@link org.apache.commons.math3.fraction.BigFraction} in
263         * expressing {@code d} as {@code (k - h) / m} for integer {@code k, m} and
264         * {@code 0 <= h < 1}.
265         */
266        private FieldMatrix<BigFraction> createH(double d)
267                throws NumberIsTooLargeException, FractionConversionException {
268    
269            int k = (int) Math.ceil(n * d);
270    
271            int m = 2 * k - 1;
272            double hDouble = k - n * d;
273    
274            if (hDouble >= 1) {
275                throw new NumberIsTooLargeException(hDouble, 1.0, false);
276            }
277    
278            BigFraction h = null;
279    
280            try {
281                h = new BigFraction(hDouble, 1.0e-20, 10000);
282            } catch (FractionConversionException e1) {
283                try {
284                    h = new BigFraction(hDouble, 1.0e-10, 10000);
285                } catch (FractionConversionException e2) {
286                    h = new BigFraction(hDouble, 1.0e-5, 10000);
287                }
288            }
289    
290            final BigFraction[][] Hdata = new BigFraction[m][m];
291    
292            /*
293             * Start by filling everything with either 0 or 1.
294             */
295            for (int i = 0; i < m; ++i) {
296                for (int j = 0; j < m; ++j) {
297                    if (i - j + 1 < 0) {
298                        Hdata[i][j] = BigFraction.ZERO;
299                    } else {
300                        Hdata[i][j] = BigFraction.ONE;
301                    }
302                }
303            }
304    
305            /*
306             * Setting up power-array to avoid calculating the same value twice:
307             * hPowers[0] = h^1 ... hPowers[m-1] = h^m
308             */
309            final BigFraction[] hPowers = new BigFraction[m];
310            hPowers[0] = h;
311            for (int i = 1; i < m; ++i) {
312                hPowers[i] = h.multiply(hPowers[i - 1]);
313            }
314    
315            /*
316             * First column and last row has special values (each other reversed).
317             */
318            for (int i = 0; i < m; ++i) {
319                Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
320                Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
321            }
322    
323            /*
324             * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix
325             * should be (1 - 2*h^m + (2h - 1)^m )/m!" Since 0 <= h < 1, then if h >
326             * 1/2 is sufficient to check:
327             */
328            if (h.compareTo(BigFraction.ONE_HALF) == 1) {
329                Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
330            }
331    
332            /*
333             * Aside from the first column and last row, the (i, j)-th element is
334             * 1/(i - j + 1)! if i - j + 1 >= 0, else 0. 1's and 0's are already
335             * put, so only division with (i - j + 1)! is needed in the elements
336             * that have 1's. There is no need to calculate (i - j + 1)! and then
337             * divide - small steps avoid overflows.
338             *
339             * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to
340             * m. Also note that it is started at g = 2 because dividing by 1 isn't
341             * really necessary.
342             */
343            for (int i = 0; i < m; ++i) {
344                for (int j = 0; j < i + 1; ++j) {
345                    if (i - j + 1 > 0) {
346                        for (int g = 2; g <= i - j + 1; ++g) {
347                            Hdata[i][j] = Hdata[i][j].divide(g);
348                        }
349                    }
350                }
351            }
352    
353            return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
354        }
355    }