001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math4.legacy.distribution;
019
020import java.util.ArrayList;
021import java.util.List;
022import java.util.function.Function;
023
024import org.apache.commons.statistics.distribution.NormalDistribution;
025import org.apache.commons.statistics.distribution.ContinuousDistribution;
026import org.apache.commons.numbers.core.Precision;
027import org.apache.commons.rng.UniformRandomProvider;
028import org.apache.commons.math4.legacy.exception.OutOfRangeException;
029import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
030import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary;
031import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
032import org.apache.commons.math4.core.jdkmath.JdkMath;
033
034/**
035 * <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
036 * empirical probability distribution</a>: Probability distribution derived
037 * from observed data without making any assumptions about the functional
038 * form of the population distribution that the data come from.</p>
039 *
040 * <p>An {@code EmpiricalDistribution} maintains data structures called
041 * <i>distribution digests</i> that describe empirical distributions and
042 * support the following operations:
043 * <ul>
044 *  <li>loading the distribution from "observed" data values</li>
045 *  <li>dividing the input data into "bin ranges" and reporting bin
046 *      frequency counts (data for histogram)</li>
047 *  <li>reporting univariate statistics describing the full set of data
048 *      values as well as the observations within each bin</li>
049 *  <li>generating random values from the distribution</li>
050 * </ul>
051 *
052 * Applications can use {@code EmpiricalDistribution} to build grouped
053 * frequency histograms representing the input data or to generate random
054 * values "like" those in the input, i.e. the values generated will follow
055 * the distribution of the values in the file.
056 *
057 * <p>The implementation uses what amounts to the
058 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
059 * Variable Kernel Method</a> with Gaussian smoothing:<p>
060 * <strong>Digesting the input file</strong>
061 * <ol>
062 *  <li>Pass the file once to compute min and max.</li>
063 *  <li>Divide the range from min to max into {@code binCount} bins.</li>
064 *  <li>Pass the data file again, computing bin counts and univariate
065 *      statistics (mean and std dev.) for each bin.</li>
066 *  <li>Divide the interval (0,1) into subintervals associated with the bins,
067 *      with the length of a bin's subinterval proportional to its count.</li>
068 * </ol>
069 * <strong>Generating random values from the distribution</strong>
070 * <ol>
071 *  <li>Generate a uniformly distributed value in (0,1) </li>
072 *  <li>Select the subinterval to which the value belongs.
073 *  <li>Generate a random Gaussian value with mean = mean of the associated
074 *      bin and std dev = std dev of associated bin.</li>
075 * </ol>
076 *
077 * <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
078 * as follows.  Given x within the range of values in the dataset, let B
079 * be the bin containing x and let K be the within-bin kernel for B.  Let P(B-)
080 * be the sum of the probabilities of the bins below B and let K(B) be the
081 * mass of B under K (i.e., the integral of the kernel density over B).  Then
082 * set {@code P(X < x) = P(B-) + P(B) * K(x) / K(B)} where {@code K(x)} is the
083 * kernel distribution evaluated at x. This results in a cdf that matches the
084 * grouped frequency distribution at the bin endpoints and interpolates within
085 * bins using within-bin kernels.</p>
086 *
087 * <strong>CAVEAT</strong>: It is advised that the {@link #from(int,double[])
088 * bin count} is about one tenth of the size of the input array.
089 */
090public final class EmpiricalDistribution extends AbstractRealDistribution
091    implements ContinuousDistribution {
092    /** Bins characteristics. */
093    private final List<SummaryStatistics> binStats;
094    /** Sample statistics. */
095    private final SummaryStatistics sampleStats;
096    /** Max loaded value. */
097    private final double max;
098    /** Min loaded value. */
099    private final double min;
100    /** Grid size. */
101    private final double delta;
102    /** Number of bins. */
103    private final int binCount;
104    /** Upper bounds of subintervals in (0, 1) belonging to the bins. */
105    private final double[] upperBounds;
106    /** Kernel factory. */
107    private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory;
108
109    /**
110     * Creates a new instance with the specified data.
111     *
112     * @param binCount Number of bins.  Must be strictly positive.
113     * @param input Input data.  Cannot be {@code null}.
114     * @param kernelFactory Kernel factory.
115     * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
116     */
117    private EmpiricalDistribution(int binCount,
118                                  double[] input,
119                                  Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
120        if (binCount <= 0) {
121            throw new NotStrictlyPositiveException(binCount);
122        }
123        this.binCount = binCount;
124
125        // First pass through the data.
126        sampleStats = new SummaryStatistics();
127        for (int i = 0; i < input.length; i++) {
128            sampleStats.addValue(input[i]);
129        }
130
131        // Set up grid.
132        min = sampleStats.getMin();
133        max = sampleStats.getMax();
134        delta = (max - min) / binCount;
135
136        // Second pass through the data.
137        binStats = createBinStats(input);
138
139        // Assign upper bounds based on bin counts.
140        upperBounds = new double[binCount];
141        final double n = sampleStats.getN();
142        upperBounds[0] = binStats.get(0).getN() / n;
143        for (int i = 1; i < binCount - 1; i++) {
144            upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n;
145        }
146        upperBounds[binCount - 1] = 1d;
147
148        this.kernelFactory = kernelFactory;
149     }
150
151    /**
152     * Factory that creates a new instance from the specified data.
153     *
154     * @param binCount Number of bins.  Must be strictly positive.
155     * @param input Input data.  Cannot be {@code null}.
156     * @param kernelFactory Factory for creating within-bin kernels.
157     * @return a new instance.
158     * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
159     */
160    public static EmpiricalDistribution from(int binCount,
161                                             double[] input,
162                                             Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
163        return new EmpiricalDistribution(binCount,
164                                         input,
165                                         kernelFactory);
166    }
167
168    /**
169     * Factory that creates a new instance from the specified data.
170     *
171     * @param binCount Number of bins.  Must be strictly positive.
172     * @param input Input data.  Cannot be {@code null}.
173     * @return a new instance.
174     * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
175     */
176    public static EmpiricalDistribution from(int binCount,
177                                             double[] input) {
178        return from(binCount, input, defaultKernel());
179    }
180
181    /**
182     * Create statistics (second pass through the data).
183     *
184     * @param input Input data.
185     * @return bins statistics.
186     */
187    private List<SummaryStatistics> createBinStats(double[] input) {
188        final List<SummaryStatistics> stats = new ArrayList<>();
189
190        for (int i = 0; i < binCount; i++) {
191            stats.add(i, new SummaryStatistics());
192        }
193
194        // Second pass though the data.
195        for (int i = 0; i < input.length; i++) {
196            final double v = input[i];
197            stats.get(findBin(v)).addValue(v);
198        }
199
200        return stats;
201    }
202
203    /**
204     * Returns the index of the bin to which the given value belongs.
205     *
206     * @param value Value whose bin we are trying to find.
207     * @return the index of the bin containing the value.
208     */
209    private int findBin(double value) {
210        return Math.min(Math.max((int) JdkMath.ceil((value - min) / delta) - 1,
211                                 0),
212                        binCount - 1);
213    }
214
215    /**
216     * Returns a {@link StatisticalSummary} describing this distribution.
217     * <strong>Preconditions:</strong><ul>
218     * <li>the distribution must be loaded before invoking this method</li></ul>
219     *
220     * @return the sample statistics
221     * @throws IllegalStateException if the distribution has not been loaded
222     */
223    public StatisticalSummary getSampleStats() {
224        return sampleStats.copy();
225    }
226
227    /**
228     * Returns the number of bins.
229     *
230     * @return the number of bins.
231     */
232    public int getBinCount() {
233        return binCount;
234    }
235
236    /**
237     * Returns a copy of the {@link SummaryStatistics} instances containing
238     * statistics describing the values in each of the bins.
239     * The list is indexed on the bin number.
240     *
241     * @return the bins statistics.
242     */
243    public List<SummaryStatistics> getBinStats() {
244        final List<SummaryStatistics> copy = new ArrayList<>();
245        for (SummaryStatistics s : binStats) {
246            copy.add(s.copy());
247        }
248        return copy;
249    }
250
251    /**
252     * Returns the upper bounds of the bins.
253     *
254     * Assuming array {@code u} is returned by this method, the bins are:
255     * <ul>
256     *  <li>{@code (min, u[0])},</li>
257     *  <li>{@code (u[0], u[1])},</li>
258     *  <li>... ,</li>
259     *  <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li>
260     * </ul>
261     *
262     * @return the bins upper bounds.
263     *
264     * @since 2.1
265     */
266    public double[] getUpperBounds() {
267        double[] binUpperBounds = new double[binCount];
268        for (int i = 0; i < binCount - 1; i++) {
269            binUpperBounds[i] = min + delta * (i + 1);
270        }
271        binUpperBounds[binCount - 1] = max;
272        return binUpperBounds;
273    }
274
275    /**
276     * Returns the upper bounds of the subintervals of [0, 1] used in generating
277     * data from the empirical distribution.
278     * Subintervals correspond to bins with lengths proportional to bin counts.
279     *
280     * <strong>Preconditions:</strong><ul>
281     * <li>the distribution must be loaded before invoking this method</li></ul>
282     *
283     * @return array of upper bounds of subintervals used in data generation
284     * @throws NullPointerException unless a {@code load} method has been
285     * called beforehand.
286     *
287     * @since 2.1
288     */
289    public double[] getGeneratorUpperBounds() {
290        int len = upperBounds.length;
291        double[] out = new double[len];
292        System.arraycopy(upperBounds, 0, out, 0, len);
293        return out;
294    }
295
296    // Distribution methods.
297
298    /**
299     * {@inheritDoc}
300     *
301     * Returns the kernel density normalized so that its integral over each bin
302     * equals the bin mass.
303     *
304     * Algorithm description:
305     * <ol>
306     *  <li>Find the bin B that x belongs to.</li>
307     *  <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
308     *   integral of the kernel density over B).</li>
309     *  <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
310     *   and P(B) is the mass of B.</li>
311     * </ol>
312     *
313     * @since 3.1
314     */
315    @Override
316    public double density(double x) {
317        if (x < min || x > max) {
318            return 0d;
319        }
320        final int binIndex = findBin(x);
321        final ContinuousDistribution kernel = getKernel(binStats.get(binIndex));
322        return kernel.density(x) * pB(binIndex) / kB(binIndex);
323    }
324
325    /**
326     * {@inheritDoc}
327     *
328     * Algorithm description:
329     * <ol>
330     *  <li>Find the bin B that x belongs to.</li>
331     *  <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
332     *  <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
333     *   and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
334     *  <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
335     *   K(x) is the within-bin kernel distribution function evaluated at x.</li>
336     * </ol>
337     * If K is a constant distribution, we return P(B-) + P(B) (counting the full
338     * mass of B).
339     *
340     * @since 3.1
341     */
342    @Override
343    public double cumulativeProbability(double x) {
344        if (x < min) {
345            return 0d;
346        } else if (x >= max) {
347            return 1d;
348        }
349        final int binIndex = findBin(x);
350        final double pBminus = pBminus(binIndex);
351        final double pB = pB(binIndex);
352        final ContinuousDistribution kernel = k(x);
353        if (kernel instanceof ConstantContinuousDistribution) {
354            if (x < kernel.getMean()) {
355                return pBminus;
356            } else {
357                return pBminus + pB;
358            }
359        }
360        final double[] binBounds = getUpperBounds();
361        final double kB = kB(binIndex);
362        final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
363        final double withinBinCum =
364            (kernel.cumulativeProbability(x) -  kernel.cumulativeProbability(lower)) / kB;
365        return pBminus + pB * withinBinCum;
366    }
367
368    /**
369     * {@inheritDoc}
370     *
371     * Algorithm description:
372     * <ol>
373     *  <li>Find the smallest i such that the sum of the masses of the bins
374     *   through i is at least p.</li>
375     *  <li>
376     *   <ol>
377     *    <li>Let K be the within-bin kernel distribution for bin i.</li>
378     *    <li>Let K(B) be the mass of B under K.</li>
379     *    <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined
380     *     mass of the bins below B under K).</li>
381     *    <li>Let P(B) be the probability of bin i.</li>
382     *    <li>Let P(B-) be the sum of the bin masses below bin i.</li>
383     *    <li>Let pCrit = p - P(B-)</li>
384     *   </ol>
385     *  </li>
386     *  <li>Return the inverse of K evaluated at
387     *    K(B-) + pCrit * K(B) / P(B) </li>
388     * </ol>
389     *
390     * @since 3.1
391     */
392    @Override
393    public double inverseCumulativeProbability(final double p) {
394        if (p < 0 ||
395            p > 1) {
396            throw new OutOfRangeException(p, 0, 1);
397        }
398
399        if (p == 0) {
400            return getSupportLowerBound();
401        }
402
403        if (p == 1) {
404            return getSupportUpperBound();
405        }
406
407        int i = 0;
408        while (cumBinP(i) < p) {
409            ++i;
410        }
411
412        final SummaryStatistics stats = binStats.get(i);
413        final ContinuousDistribution kernel = getKernel(stats);
414        final double kB = kB(i);
415        final double[] binBounds = getUpperBounds();
416        final double lower = i == 0 ? min : binBounds[i - 1];
417        final double kBminus = kernel.cumulativeProbability(lower);
418        final double pB = pB(i);
419        final double pBminus = pBminus(i);
420        final double pCrit = p - pBminus;
421        if (pCrit <= 0) {
422            return lower;
423        }
424
425        final double cP = kBminus + pCrit * kB / pB;
426
427        return Precision.equals(cP, 1d) ?
428            kernel.inverseCumulativeProbability(1d) :
429            kernel.inverseCumulativeProbability(cP);
430    }
431
432    /**
433     * {@inheritDoc}
434     * @since 3.1
435     */
436    @Override
437    public double getMean() {
438       return sampleStats.getMean();
439    }
440
441    /**
442     * {@inheritDoc}
443     * @since 3.1
444     */
445    @Override
446    public double getVariance() {
447        return sampleStats.getVariance();
448    }
449
450    /**
451     * {@inheritDoc}
452     * @since 3.1
453     */
454    @Override
455    public double getSupportLowerBound() {
456       return min;
457    }
458
459    /**
460     * {@inheritDoc}
461     * @since 3.1
462     */
463    @Override
464    public double getSupportUpperBound() {
465        return max;
466    }
467
468    /**
469     * The probability of bin i.
470     *
471     * @param i the index of the bin
472     * @return the probability that selection begins in bin i
473     */
474    private double pB(int i) {
475        return i == 0 ? upperBounds[0] :
476            upperBounds[i] - upperBounds[i - 1];
477    }
478
479    /**
480     * The combined probability of the bins up to but not including bin i.
481     *
482     * @param i the index of the bin
483     * @return the probability that selection begins in a bin below bin i.
484     */
485    private double pBminus(int i) {
486        return i == 0 ? 0 : upperBounds[i - 1];
487    }
488
489    /**
490     * Mass of bin i under the within-bin kernel of the bin.
491     *
492     * @param i index of the bin
493     * @return the difference in the within-bin kernel cdf between the
494     * upper and lower endpoints of bin i
495     */
496    private double kB(int i) {
497        final double[] binBounds = getUpperBounds();
498        final ContinuousDistribution kernel = getKernel(binStats.get(i));
499        return i == 0 ? kernel.probability(min, binBounds[0]) :
500            kernel.probability(binBounds[i - 1], binBounds[i]);
501    }
502
503    /**
504     * The within-bin kernel of the bin that x belongs to.
505     *
506     * @param x the value to locate within a bin
507     * @return the within-bin kernel of the bin containing x
508     */
509    private ContinuousDistribution k(double x) {
510        final int binIndex = findBin(x);
511        return getKernel(binStats.get(binIndex));
512    }
513
514    /**
515     * The combined probability of the bins up to and including binIndex.
516     *
517     * @param binIndex maximum bin index
518     * @return sum of the probabilities of bins through binIndex
519     */
520    private double cumBinP(int binIndex) {
521        return upperBounds[binIndex];
522    }
523
524    /**
525     * @param stats Bin statistics.
526     * @return the within-bin kernel.
527     */
528    private ContinuousDistribution getKernel(SummaryStatistics stats) {
529        return kernelFactory.apply(stats);
530    }
531
532    /**
533     * The within-bin smoothing kernel: A Gaussian distribution
534     * (unless the bin contains 0 or 1 observation, in which case
535     * a constant distribution is returned).
536     *
537     * @return the within-bin kernel factory.
538     */
539    private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() {
540        return stats -> {
541            if (stats.getN() <= 3 ||
542                stats.getVariance() == 0) {
543                return new ConstantContinuousDistribution(stats.getMean());
544            } else {
545                return NormalDistribution.of(stats.getMean(),
546                                             stats.getStandardDeviation());
547            }
548        };
549    }
550
551    /**
552     * Constant distribution.
553     */
554    private static final class ConstantContinuousDistribution implements ContinuousDistribution {
555        /** Constant value of the distribution. */
556        private final double value;
557
558        /**
559         * Create a constant real distribution with the given value.
560         *
561         * @param value Value of this distribution.
562         */
563        ConstantContinuousDistribution(double value) {
564            this.value = value;
565        }
566
567        /** {@inheritDoc} */
568        @Override
569        public double density(double x) {
570            return x == value ? 1 : 0;
571        }
572
573        /** {@inheritDoc} */
574        @Override
575        public double cumulativeProbability(double x)  {
576            return x < value ? 0 : 1;
577        }
578
579        /** {@inheritDoc} */
580        @Override
581        public double inverseCumulativeProbability(final double p) {
582            if (p < 0 ||
583                p > 1) {
584                // Should never happen.
585                throw new IllegalArgumentException("Internal error");
586            }
587            return value;
588        }
589
590        /** {@inheritDoc} */
591        @Override
592        public double getMean() {
593            return value;
594        }
595
596        /** {@inheritDoc} */
597        @Override
598        public double getVariance() {
599            return 0;
600        }
601
602        /**{@inheritDoc} */
603        @Override
604        public double getSupportLowerBound() {
605            return value;
606        }
607
608        /** {@inheritDoc} */
609        @Override
610        public double getSupportUpperBound() {
611            return value;
612        }
613
614        /**
615         * {@inheritDoc}
616         *
617         * @param rng Not used: distribution contains a single value.
618         * @return the value of the distribution.
619         */
620        @Override
621        public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) {
622            return this::getSupportLowerBound;
623        }
624    }
625}