View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math4.legacy.distribution;
19  
20  import java.util.ArrayList;
21  import java.util.List;
22  import java.util.function.Function;
23  
24  import org.apache.commons.statistics.distribution.NormalDistribution;
25  import org.apache.commons.statistics.distribution.ContinuousDistribution;
26  import org.apache.commons.numbers.core.Precision;
27  import org.apache.commons.rng.UniformRandomProvider;
28  import org.apache.commons.math4.legacy.exception.OutOfRangeException;
29  import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
30  import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary;
31  import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
32  import org.apache.commons.math4.core.jdkmath.JdkMath;
33  
34  /**
35   * <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
36   * empirical probability distribution</a>: Probability distribution derived
37   * from observed data without making any assumptions about the functional
38   * form of the population distribution that the data come from.</p>
39   *
40   * <p>An {@code EmpiricalDistribution} maintains data structures called
41   * <i>distribution digests</i> that describe empirical distributions and
42   * support the following operations:
43   * <ul>
44   *  <li>loading the distribution from "observed" data values</li>
45   *  <li>dividing the input data into "bin ranges" and reporting bin
46   *      frequency counts (data for histogram)</li>
47   *  <li>reporting univariate statistics describing the full set of data
48   *      values as well as the observations within each bin</li>
49   *  <li>generating random values from the distribution</li>
50   * </ul>
51   *
52   * Applications can use {@code EmpiricalDistribution} to build grouped
53   * frequency histograms representing the input data or to generate random
54   * values "like" those in the input, i.e. the values generated will follow
55   * the distribution of the values in the file.
56   *
57   * <p>The implementation uses what amounts to the
58   * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
59   * Variable Kernel Method</a> with Gaussian smoothing:<p>
60   * <strong>Digesting the input file</strong>
61   * <ol>
62   *  <li>Pass the file once to compute min and max.</li>
63   *  <li>Divide the range from min to max into {@code binCount} bins.</li>
64   *  <li>Pass the data file again, computing bin counts and univariate
65   *      statistics (mean and std dev.) for each bin.</li>
66   *  <li>Divide the interval (0,1) into subintervals associated with the bins,
67   *      with the length of a bin's subinterval proportional to its count.</li>
68   * </ol>
69   * <strong>Generating random values from the distribution</strong>
70   * <ol>
71   *  <li>Generate a uniformly distributed value in (0,1) </li>
72   *  <li>Select the subinterval to which the value belongs.
73   *  <li>Generate a random Gaussian value with mean = mean of the associated
74   *      bin and std dev = std dev of associated bin.</li>
75   * </ol>
76   *
77   * <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
78   * as follows.  Given x within the range of values in the dataset, let B
79   * be the bin containing x and let K be the within-bin kernel for B.  Let P(B-)
80   * be the sum of the probabilities of the bins below B and let K(B) be the
81   * mass of B under K (i.e., the integral of the kernel density over B).  Then
82   * set {@code P(X < x) = P(B-) + P(B) * K(x) / K(B)} where {@code K(x)} is the
83   * kernel distribution evaluated at x. This results in a cdf that matches the
84   * grouped frequency distribution at the bin endpoints and interpolates within
85   * bins using within-bin kernels.</p>
86   *
87   * <strong>CAVEAT</strong>: It is advised that the {@link #from(int,double[])
88   * bin count} is about one tenth of the size of the input array.
89   */
90  public final class EmpiricalDistribution extends AbstractRealDistribution
91      implements ContinuousDistribution {
92      /** Bins characteristics. */
93      private final List<SummaryStatistics> binStats;
94      /** Sample statistics. */
95      private final SummaryStatistics sampleStats;
96      /** Max loaded value. */
97      private final double max;
98      /** Min loaded value. */
99      private final double min;
100     /** Grid size. */
101     private final double delta;
102     /** Number of bins. */
103     private final int binCount;
104     /** Upper bounds of subintervals in (0, 1) belonging to the bins. */
105     private final double[] upperBounds;
106     /** Kernel factory. */
107     private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory;
108 
109     /**
110      * Creates a new instance with the specified data.
111      *
112      * @param binCount Number of bins.  Must be strictly positive.
113      * @param input Input data.  Cannot be {@code null}.
114      * @param kernelFactory Kernel factory.
115      * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
116      */
117     private EmpiricalDistribution(int binCount,
118                                   double[] input,
119                                   Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
120         if (binCount <= 0) {
121             throw new NotStrictlyPositiveException(binCount);
122         }
123         this.binCount = binCount;
124 
125         // First pass through the data.
126         sampleStats = new SummaryStatistics();
127         for (int i = 0; i < input.length; i++) {
128             sampleStats.addValue(input[i]);
129         }
130 
131         // Set up grid.
132         min = sampleStats.getMin();
133         max = sampleStats.getMax();
134         delta = (max - min) / binCount;
135 
136         // Second pass through the data.
137         binStats = createBinStats(input);
138 
139         // Assign upper bounds based on bin counts.
140         upperBounds = new double[binCount];
141         final double n = sampleStats.getN();
142         upperBounds[0] = binStats.get(0).getN() / n;
143         for (int i = 1; i < binCount - 1; i++) {
144             upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n;
145         }
146         upperBounds[binCount - 1] = 1d;
147 
148         this.kernelFactory = kernelFactory;
149      }
150 
151     /**
152      * Factory that creates a new instance from the specified data.
153      *
154      * @param binCount Number of bins.  Must be strictly positive.
155      * @param input Input data.  Cannot be {@code null}.
156      * @param kernelFactory Factory for creating within-bin kernels.
157      * @return a new instance.
158      * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
159      */
160     public static EmpiricalDistribution from(int binCount,
161                                              double[] input,
162                                              Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
163         return new EmpiricalDistribution(binCount,
164                                          input,
165                                          kernelFactory);
166     }
167 
168     /**
169      * Factory that creates a new instance from the specified data.
170      *
171      * @param binCount Number of bins.  Must be strictly positive.
172      * @param input Input data.  Cannot be {@code null}.
173      * @return a new instance.
174      * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
175      */
176     public static EmpiricalDistribution from(int binCount,
177                                              double[] input) {
178         return from(binCount, input, defaultKernel());
179     }
180 
181     /**
182      * Create statistics (second pass through the data).
183      *
184      * @param input Input data.
185      * @return bins statistics.
186      */
187     private List<SummaryStatistics> createBinStats(double[] input) {
188         final List<SummaryStatistics> stats = new ArrayList<>();
189 
190         for (int i = 0; i < binCount; i++) {
191             stats.add(i, new SummaryStatistics());
192         }
193 
194         // Second pass though the data.
195         for (int i = 0; i < input.length; i++) {
196             final double v = input[i];
197             stats.get(findBin(v)).addValue(v);
198         }
199 
200         return stats;
201     }
202 
203     /**
204      * Returns the index of the bin to which the given value belongs.
205      *
206      * @param value Value whose bin we are trying to find.
207      * @return the index of the bin containing the value.
208      */
209     private int findBin(double value) {
210         return Math.min(Math.max((int) JdkMath.ceil((value - min) / delta) - 1,
211                                  0),
212                         binCount - 1);
213     }
214 
215     /**
216      * Returns a {@link StatisticalSummary} describing this distribution.
217      * <strong>Preconditions:</strong><ul>
218      * <li>the distribution must be loaded before invoking this method</li></ul>
219      *
220      * @return the sample statistics
221      * @throws IllegalStateException if the distribution has not been loaded
222      */
223     public StatisticalSummary getSampleStats() {
224         return sampleStats.copy();
225     }
226 
227     /**
228      * Returns the number of bins.
229      *
230      * @return the number of bins.
231      */
232     public int getBinCount() {
233         return binCount;
234     }
235 
236     /**
237      * Returns a copy of the {@link SummaryStatistics} instances containing
238      * statistics describing the values in each of the bins.
239      * The list is indexed on the bin number.
240      *
241      * @return the bins statistics.
242      */
243     public List<SummaryStatistics> getBinStats() {
244         final List<SummaryStatistics> copy = new ArrayList<>();
245         for (SummaryStatistics s : binStats) {
246             copy.add(s.copy());
247         }
248         return copy;
249     }
250 
251     /**
252      * Returns the upper bounds of the bins.
253      *
254      * Assuming array {@code u} is returned by this method, the bins are:
255      * <ul>
256      *  <li>{@code (min, u[0])},</li>
257      *  <li>{@code (u[0], u[1])},</li>
258      *  <li>... ,</li>
259      *  <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li>
260      * </ul>
261      *
262      * @return the bins upper bounds.
263      *
264      * @since 2.1
265      */
266     public double[] getUpperBounds() {
267         double[] binUpperBounds = new double[binCount];
268         for (int i = 0; i < binCount - 1; i++) {
269             binUpperBounds[i] = min + delta * (i + 1);
270         }
271         binUpperBounds[binCount - 1] = max;
272         return binUpperBounds;
273     }
274 
275     /**
276      * Returns the upper bounds of the subintervals of [0, 1] used in generating
277      * data from the empirical distribution.
278      * Subintervals correspond to bins with lengths proportional to bin counts.
279      *
280      * <strong>Preconditions:</strong><ul>
281      * <li>the distribution must be loaded before invoking this method</li></ul>
282      *
283      * @return array of upper bounds of subintervals used in data generation
284      * @throws NullPointerException unless a {@code load} method has been
285      * called beforehand.
286      *
287      * @since 2.1
288      */
289     public double[] getGeneratorUpperBounds() {
290         int len = upperBounds.length;
291         double[] out = new double[len];
292         System.arraycopy(upperBounds, 0, out, 0, len);
293         return out;
294     }
295 
296     // Distribution methods.
297 
298     /**
299      * {@inheritDoc}
300      *
301      * Returns the kernel density normalized so that its integral over each bin
302      * equals the bin mass.
303      *
304      * Algorithm description:
305      * <ol>
306      *  <li>Find the bin B that x belongs to.</li>
307      *  <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
308      *   integral of the kernel density over B).</li>
309      *  <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
310      *   and P(B) is the mass of B.</li>
311      * </ol>
312      *
313      * @since 3.1
314      */
315     @Override
316     public double density(double x) {
317         if (x < min || x > max) {
318             return 0d;
319         }
320         final int binIndex = findBin(x);
321         final ContinuousDistribution kernel = getKernel(binStats.get(binIndex));
322         return kernel.density(x) * pB(binIndex) / kB(binIndex);
323     }
324 
325     /**
326      * {@inheritDoc}
327      *
328      * Algorithm description:
329      * <ol>
330      *  <li>Find the bin B that x belongs to.</li>
331      *  <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
332      *  <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
333      *   and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
334      *  <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
335      *   K(x) is the within-bin kernel distribution function evaluated at x.</li>
336      * </ol>
337      * If K is a constant distribution, we return P(B-) + P(B) (counting the full
338      * mass of B).
339      *
340      * @since 3.1
341      */
342     @Override
343     public double cumulativeProbability(double x) {
344         if (x < min) {
345             return 0d;
346         } else if (x >= max) {
347             return 1d;
348         }
349         final int binIndex = findBin(x);
350         final double pBminus = pBminus(binIndex);
351         final double pB = pB(binIndex);
352         final ContinuousDistribution kernel = k(x);
353         if (kernel instanceof ConstantContinuousDistribution) {
354             if (x < kernel.getMean()) {
355                 return pBminus;
356             } else {
357                 return pBminus + pB;
358             }
359         }
360         final double[] binBounds = getUpperBounds();
361         final double kB = kB(binIndex);
362         final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
363         final double withinBinCum =
364             (kernel.cumulativeProbability(x) -  kernel.cumulativeProbability(lower)) / kB;
365         return pBminus + pB * withinBinCum;
366     }
367 
368     /**
369      * {@inheritDoc}
370      *
371      * Algorithm description:
372      * <ol>
373      *  <li>Find the smallest i such that the sum of the masses of the bins
374      *   through i is at least p.</li>
375      *  <li>
376      *   <ol>
377      *    <li>Let K be the within-bin kernel distribution for bin i.</li>
378      *    <li>Let K(B) be the mass of B under K.</li>
379      *    <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined
380      *     mass of the bins below B under K).</li>
381      *    <li>Let P(B) be the probability of bin i.</li>
382      *    <li>Let P(B-) be the sum of the bin masses below bin i.</li>
383      *    <li>Let pCrit = p - P(B-)</li>
384      *   </ol>
385      *  </li>
386      *  <li>Return the inverse of K evaluated at
387      *    K(B-) + pCrit * K(B) / P(B) </li>
388      * </ol>
389      *
390      * @since 3.1
391      */
392     @Override
393     public double inverseCumulativeProbability(final double p) {
394         if (p < 0 ||
395             p > 1) {
396             throw new OutOfRangeException(p, 0, 1);
397         }
398 
399         if (p == 0) {
400             return getSupportLowerBound();
401         }
402 
403         if (p == 1) {
404             return getSupportUpperBound();
405         }
406 
407         int i = 0;
408         while (cumBinP(i) < p) {
409             ++i;
410         }
411 
412         final SummaryStatistics stats = binStats.get(i);
413         final ContinuousDistribution kernel = getKernel(stats);
414         final double kB = kB(i);
415         final double[] binBounds = getUpperBounds();
416         final double lower = i == 0 ? min : binBounds[i - 1];
417         final double kBminus = kernel.cumulativeProbability(lower);
418         final double pB = pB(i);
419         final double pBminus = pBminus(i);
420         final double pCrit = p - pBminus;
421         if (pCrit <= 0) {
422             return lower;
423         }
424 
425         final double cP = kBminus + pCrit * kB / pB;
426 
427         return Precision.equals(cP, 1d) ?
428             kernel.inverseCumulativeProbability(1d) :
429             kernel.inverseCumulativeProbability(cP);
430     }
431 
432     /**
433      * {@inheritDoc}
434      * @since 3.1
435      */
436     @Override
437     public double getMean() {
438        return sampleStats.getMean();
439     }
440 
441     /**
442      * {@inheritDoc}
443      * @since 3.1
444      */
445     @Override
446     public double getVariance() {
447         return sampleStats.getVariance();
448     }
449 
450     /**
451      * {@inheritDoc}
452      * @since 3.1
453      */
454     @Override
455     public double getSupportLowerBound() {
456        return min;
457     }
458 
459     /**
460      * {@inheritDoc}
461      * @since 3.1
462      */
463     @Override
464     public double getSupportUpperBound() {
465         return max;
466     }
467 
468     /**
469      * The probability of bin i.
470      *
471      * @param i the index of the bin
472      * @return the probability that selection begins in bin i
473      */
474     private double pB(int i) {
475         return i == 0 ? upperBounds[0] :
476             upperBounds[i] - upperBounds[i - 1];
477     }
478 
479     /**
480      * The combined probability of the bins up to but not including bin i.
481      *
482      * @param i the index of the bin
483      * @return the probability that selection begins in a bin below bin i.
484      */
485     private double pBminus(int i) {
486         return i == 0 ? 0 : upperBounds[i - 1];
487     }
488 
489     /**
490      * Mass of bin i under the within-bin kernel of the bin.
491      *
492      * @param i index of the bin
493      * @return the difference in the within-bin kernel cdf between the
494      * upper and lower endpoints of bin i
495      */
496     private double kB(int i) {
497         final double[] binBounds = getUpperBounds();
498         final ContinuousDistribution kernel = getKernel(binStats.get(i));
499         return i == 0 ? kernel.probability(min, binBounds[0]) :
500             kernel.probability(binBounds[i - 1], binBounds[i]);
501     }
502 
503     /**
504      * The within-bin kernel of the bin that x belongs to.
505      *
506      * @param x the value to locate within a bin
507      * @return the within-bin kernel of the bin containing x
508      */
509     private ContinuousDistribution k(double x) {
510         final int binIndex = findBin(x);
511         return getKernel(binStats.get(binIndex));
512     }
513 
514     /**
515      * The combined probability of the bins up to and including binIndex.
516      *
517      * @param binIndex maximum bin index
518      * @return sum of the probabilities of bins through binIndex
519      */
520     private double cumBinP(int binIndex) {
521         return upperBounds[binIndex];
522     }
523 
524     /**
525      * @param stats Bin statistics.
526      * @return the within-bin kernel.
527      */
528     private ContinuousDistribution getKernel(SummaryStatistics stats) {
529         return kernelFactory.apply(stats);
530     }
531 
532     /**
533      * The within-bin smoothing kernel: A Gaussian distribution
534      * (unless the bin contains 0 or 1 observation, in which case
535      * a constant distribution is returned).
536      *
537      * @return the within-bin kernel factory.
538      */
539     private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() {
540         return stats -> {
541             if (stats.getN() <= 3 ||
542                 stats.getVariance() == 0) {
543                 return new ConstantContinuousDistribution(stats.getMean());
544             } else {
545                 return NormalDistribution.of(stats.getMean(),
546                                              stats.getStandardDeviation());
547             }
548         };
549     }
550 
551     /**
552      * Constant distribution.
553      */
554     private static final class ConstantContinuousDistribution implements ContinuousDistribution {
555         /** Constant value of the distribution. */
556         private final double value;
557 
558         /**
559          * Create a constant real distribution with the given value.
560          *
561          * @param value Value of this distribution.
562          */
563         ConstantContinuousDistribution(double value) {
564             this.value = value;
565         }
566 
567         /** {@inheritDoc} */
568         @Override
569         public double density(double x) {
570             return x == value ? 1 : 0;
571         }
572 
573         /** {@inheritDoc} */
574         @Override
575         public double cumulativeProbability(double x)  {
576             return x < value ? 0 : 1;
577         }
578 
579         /** {@inheritDoc} */
580         @Override
581         public double inverseCumulativeProbability(final double p) {
582             if (p < 0 ||
583                 p > 1) {
584                 // Should never happen.
585                 throw new IllegalArgumentException("Internal error");
586             }
587             return value;
588         }
589 
590         /** {@inheritDoc} */
591         @Override
592         public double getMean() {
593             return value;
594         }
595 
596         /** {@inheritDoc} */
597         @Override
598         public double getVariance() {
599             return 0;
600         }
601 
602         /**{@inheritDoc} */
603         @Override
604         public double getSupportLowerBound() {
605             return value;
606         }
607 
608         /** {@inheritDoc} */
609         @Override
610         public double getSupportUpperBound() {
611             return value;
612         }
613 
614         /**
615          * {@inheritDoc}
616          *
617          * @param rng Not used: distribution contains a single value.
618          * @return the value of the distribution.
619          */
620         @Override
621         public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) {
622             return this::getSupportLowerBound;
623         }
624     }
625 }