1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.math4.legacy.distribution;
19
20 import java.util.ArrayList;
21 import java.util.List;
22 import java.util.function.Function;
23
24 import org.apache.commons.statistics.distribution.NormalDistribution;
25 import org.apache.commons.statistics.distribution.ContinuousDistribution;
26 import org.apache.commons.numbers.core.Precision;
27 import org.apache.commons.rng.UniformRandomProvider;
28 import org.apache.commons.math4.legacy.exception.OutOfRangeException;
29 import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
30 import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary;
31 import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
32 import org.apache.commons.math4.core.jdkmath.JdkMath;
33
34 /**
35 * <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
36 * empirical probability distribution</a>: Probability distribution derived
37 * from observed data without making any assumptions about the functional
38 * form of the population distribution that the data come from.</p>
39 *
40 * <p>An {@code EmpiricalDistribution} maintains data structures called
41 * <i>distribution digests</i> that describe empirical distributions and
42 * support the following operations:
43 * <ul>
44 * <li>loading the distribution from "observed" data values</li>
45 * <li>dividing the input data into "bin ranges" and reporting bin
46 * frequency counts (data for histogram)</li>
47 * <li>reporting univariate statistics describing the full set of data
48 * values as well as the observations within each bin</li>
49 * <li>generating random values from the distribution</li>
50 * </ul>
51 *
52 * Applications can use {@code EmpiricalDistribution} to build grouped
53 * frequency histograms representing the input data or to generate random
54 * values "like" those in the input, i.e. the values generated will follow
55 * the distribution of the values in the file.
56 *
57 * <p>The implementation uses what amounts to the
58 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
59 * Variable Kernel Method</a> with Gaussian smoothing:<p>
60 * <strong>Digesting the input file</strong>
61 * <ol>
62 * <li>Pass the file once to compute min and max.</li>
63 * <li>Divide the range from min to max into {@code binCount} bins.</li>
64 * <li>Pass the data file again, computing bin counts and univariate
65 * statistics (mean and std dev.) for each bin.</li>
66 * <li>Divide the interval (0,1) into subintervals associated with the bins,
67 * with the length of a bin's subinterval proportional to its count.</li>
68 * </ol>
69 * <strong>Generating random values from the distribution</strong>
70 * <ol>
71 * <li>Generate a uniformly distributed value in (0,1) </li>
72 * <li>Select the subinterval to which the value belongs.
73 * <li>Generate a random Gaussian value with mean = mean of the associated
74 * bin and std dev = std dev of associated bin.</li>
75 * </ol>
76 *
77 * <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
78 * as follows. Given x within the range of values in the dataset, let B
79 * be the bin containing x and let K be the within-bin kernel for B. Let P(B-)
80 * be the sum of the probabilities of the bins below B and let K(B) be the
81 * mass of B under K (i.e., the integral of the kernel density over B). Then
82 * set {@code P(X < x) = P(B-) + P(B) * K(x) / K(B)} where {@code K(x)} is the
83 * kernel distribution evaluated at x. This results in a cdf that matches the
84 * grouped frequency distribution at the bin endpoints and interpolates within
85 * bins using within-bin kernels.</p>
86 *
87 * <strong>CAVEAT</strong>: It is advised that the {@link #from(int,double[])
88 * bin count} is about one tenth of the size of the input array.
89 */
90 public final class EmpiricalDistribution extends AbstractRealDistribution
91 implements ContinuousDistribution {
92 /** Bins characteristics. */
93 private final List<SummaryStatistics> binStats;
94 /** Sample statistics. */
95 private final SummaryStatistics sampleStats;
96 /** Max loaded value. */
97 private final double max;
98 /** Min loaded value. */
99 private final double min;
100 /** Grid size. */
101 private final double delta;
102 /** Number of bins. */
103 private final int binCount;
104 /** Upper bounds of subintervals in (0, 1) belonging to the bins. */
105 private final double[] upperBounds;
106 /** Kernel factory. */
107 private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory;
108
109 /**
110 * Creates a new instance with the specified data.
111 *
112 * @param binCount Number of bins. Must be strictly positive.
113 * @param input Input data. Cannot be {@code null}.
114 * @param kernelFactory Kernel factory.
115 * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
116 */
117 private EmpiricalDistribution(int binCount,
118 double[] input,
119 Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
120 if (binCount <= 0) {
121 throw new NotStrictlyPositiveException(binCount);
122 }
123 this.binCount = binCount;
124
125 // First pass through the data.
126 sampleStats = new SummaryStatistics();
127 for (int i = 0; i < input.length; i++) {
128 sampleStats.addValue(input[i]);
129 }
130
131 // Set up grid.
132 min = sampleStats.getMin();
133 max = sampleStats.getMax();
134 delta = (max - min) / binCount;
135
136 // Second pass through the data.
137 binStats = createBinStats(input);
138
139 // Assign upper bounds based on bin counts.
140 upperBounds = new double[binCount];
141 final double n = sampleStats.getN();
142 upperBounds[0] = binStats.get(0).getN() / n;
143 for (int i = 1; i < binCount - 1; i++) {
144 upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n;
145 }
146 upperBounds[binCount - 1] = 1d;
147
148 this.kernelFactory = kernelFactory;
149 }
150
151 /**
152 * Factory that creates a new instance from the specified data.
153 *
154 * @param binCount Number of bins. Must be strictly positive.
155 * @param input Input data. Cannot be {@code null}.
156 * @param kernelFactory Factory for creating within-bin kernels.
157 * @return a new instance.
158 * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
159 */
160 public static EmpiricalDistribution from(int binCount,
161 double[] input,
162 Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
163 return new EmpiricalDistribution(binCount,
164 input,
165 kernelFactory);
166 }
167
168 /**
169 * Factory that creates a new instance from the specified data.
170 *
171 * @param binCount Number of bins. Must be strictly positive.
172 * @param input Input data. Cannot be {@code null}.
173 * @return a new instance.
174 * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
175 */
176 public static EmpiricalDistribution from(int binCount,
177 double[] input) {
178 return from(binCount, input, defaultKernel());
179 }
180
181 /**
182 * Create statistics (second pass through the data).
183 *
184 * @param input Input data.
185 * @return bins statistics.
186 */
187 private List<SummaryStatistics> createBinStats(double[] input) {
188 final List<SummaryStatistics> stats = new ArrayList<>();
189
190 for (int i = 0; i < binCount; i++) {
191 stats.add(i, new SummaryStatistics());
192 }
193
194 // Second pass though the data.
195 for (int i = 0; i < input.length; i++) {
196 final double v = input[i];
197 stats.get(findBin(v)).addValue(v);
198 }
199
200 return stats;
201 }
202
203 /**
204 * Returns the index of the bin to which the given value belongs.
205 *
206 * @param value Value whose bin we are trying to find.
207 * @return the index of the bin containing the value.
208 */
209 private int findBin(double value) {
210 return Math.min(Math.max((int) JdkMath.ceil((value - min) / delta) - 1,
211 0),
212 binCount - 1);
213 }
214
215 /**
216 * Returns a {@link StatisticalSummary} describing this distribution.
217 * <strong>Preconditions:</strong><ul>
218 * <li>the distribution must be loaded before invoking this method</li></ul>
219 *
220 * @return the sample statistics
221 * @throws IllegalStateException if the distribution has not been loaded
222 */
223 public StatisticalSummary getSampleStats() {
224 return sampleStats.copy();
225 }
226
227 /**
228 * Returns the number of bins.
229 *
230 * @return the number of bins.
231 */
232 public int getBinCount() {
233 return binCount;
234 }
235
236 /**
237 * Returns a copy of the {@link SummaryStatistics} instances containing
238 * statistics describing the values in each of the bins.
239 * The list is indexed on the bin number.
240 *
241 * @return the bins statistics.
242 */
243 public List<SummaryStatistics> getBinStats() {
244 final List<SummaryStatistics> copy = new ArrayList<>();
245 for (SummaryStatistics s : binStats) {
246 copy.add(s.copy());
247 }
248 return copy;
249 }
250
251 /**
252 * Returns the upper bounds of the bins.
253 *
254 * Assuming array {@code u} is returned by this method, the bins are:
255 * <ul>
256 * <li>{@code (min, u[0])},</li>
257 * <li>{@code (u[0], u[1])},</li>
258 * <li>... ,</li>
259 * <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li>
260 * </ul>
261 *
262 * @return the bins upper bounds.
263 *
264 * @since 2.1
265 */
266 public double[] getUpperBounds() {
267 double[] binUpperBounds = new double[binCount];
268 for (int i = 0; i < binCount - 1; i++) {
269 binUpperBounds[i] = min + delta * (i + 1);
270 }
271 binUpperBounds[binCount - 1] = max;
272 return binUpperBounds;
273 }
274
275 /**
276 * Returns the upper bounds of the subintervals of [0, 1] used in generating
277 * data from the empirical distribution.
278 * Subintervals correspond to bins with lengths proportional to bin counts.
279 *
280 * <strong>Preconditions:</strong><ul>
281 * <li>the distribution must be loaded before invoking this method</li></ul>
282 *
283 * @return array of upper bounds of subintervals used in data generation
284 * @throws NullPointerException unless a {@code load} method has been
285 * called beforehand.
286 *
287 * @since 2.1
288 */
289 public double[] getGeneratorUpperBounds() {
290 int len = upperBounds.length;
291 double[] out = new double[len];
292 System.arraycopy(upperBounds, 0, out, 0, len);
293 return out;
294 }
295
296 // Distribution methods.
297
298 /**
299 * {@inheritDoc}
300 *
301 * Returns the kernel density normalized so that its integral over each bin
302 * equals the bin mass.
303 *
304 * Algorithm description:
305 * <ol>
306 * <li>Find the bin B that x belongs to.</li>
307 * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
308 * integral of the kernel density over B).</li>
309 * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
310 * and P(B) is the mass of B.</li>
311 * </ol>
312 *
313 * @since 3.1
314 */
315 @Override
316 public double density(double x) {
317 if (x < min || x > max) {
318 return 0d;
319 }
320 final int binIndex = findBin(x);
321 final ContinuousDistribution kernel = getKernel(binStats.get(binIndex));
322 return kernel.density(x) * pB(binIndex) / kB(binIndex);
323 }
324
325 /**
326 * {@inheritDoc}
327 *
328 * Algorithm description:
329 * <ol>
330 * <li>Find the bin B that x belongs to.</li>
331 * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
332 * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
333 * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
334 * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
335 * K(x) is the within-bin kernel distribution function evaluated at x.</li>
336 * </ol>
337 * If K is a constant distribution, we return P(B-) + P(B) (counting the full
338 * mass of B).
339 *
340 * @since 3.1
341 */
342 @Override
343 public double cumulativeProbability(double x) {
344 if (x < min) {
345 return 0d;
346 } else if (x >= max) {
347 return 1d;
348 }
349 final int binIndex = findBin(x);
350 final double pBminus = pBminus(binIndex);
351 final double pB = pB(binIndex);
352 final ContinuousDistribution kernel = k(x);
353 if (kernel instanceof ConstantContinuousDistribution) {
354 if (x < kernel.getMean()) {
355 return pBminus;
356 } else {
357 return pBminus + pB;
358 }
359 }
360 final double[] binBounds = getUpperBounds();
361 final double kB = kB(binIndex);
362 final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
363 final double withinBinCum =
364 (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB;
365 return pBminus + pB * withinBinCum;
366 }
367
368 /**
369 * {@inheritDoc}
370 *
371 * Algorithm description:
372 * <ol>
373 * <li>Find the smallest i such that the sum of the masses of the bins
374 * through i is at least p.</li>
375 * <li>
376 * <ol>
377 * <li>Let K be the within-bin kernel distribution for bin i.</li>
378 * <li>Let K(B) be the mass of B under K.</li>
379 * <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined
380 * mass of the bins below B under K).</li>
381 * <li>Let P(B) be the probability of bin i.</li>
382 * <li>Let P(B-) be the sum of the bin masses below bin i.</li>
383 * <li>Let pCrit = p - P(B-)</li>
384 * </ol>
385 * </li>
386 * <li>Return the inverse of K evaluated at
387 * K(B-) + pCrit * K(B) / P(B) </li>
388 * </ol>
389 *
390 * @since 3.1
391 */
392 @Override
393 public double inverseCumulativeProbability(final double p) {
394 if (p < 0 ||
395 p > 1) {
396 throw new OutOfRangeException(p, 0, 1);
397 }
398
399 if (p == 0) {
400 return getSupportLowerBound();
401 }
402
403 if (p == 1) {
404 return getSupportUpperBound();
405 }
406
407 int i = 0;
408 while (cumBinP(i) < p) {
409 ++i;
410 }
411
412 final SummaryStatistics stats = binStats.get(i);
413 final ContinuousDistribution kernel = getKernel(stats);
414 final double kB = kB(i);
415 final double[] binBounds = getUpperBounds();
416 final double lower = i == 0 ? min : binBounds[i - 1];
417 final double kBminus = kernel.cumulativeProbability(lower);
418 final double pB = pB(i);
419 final double pBminus = pBminus(i);
420 final double pCrit = p - pBminus;
421 if (pCrit <= 0) {
422 return lower;
423 }
424
425 final double cP = kBminus + pCrit * kB / pB;
426
427 return Precision.equals(cP, 1d) ?
428 kernel.inverseCumulativeProbability(1d) :
429 kernel.inverseCumulativeProbability(cP);
430 }
431
432 /**
433 * {@inheritDoc}
434 * @since 3.1
435 */
436 @Override
437 public double getMean() {
438 return sampleStats.getMean();
439 }
440
441 /**
442 * {@inheritDoc}
443 * @since 3.1
444 */
445 @Override
446 public double getVariance() {
447 return sampleStats.getVariance();
448 }
449
450 /**
451 * {@inheritDoc}
452 * @since 3.1
453 */
454 @Override
455 public double getSupportLowerBound() {
456 return min;
457 }
458
459 /**
460 * {@inheritDoc}
461 * @since 3.1
462 */
463 @Override
464 public double getSupportUpperBound() {
465 return max;
466 }
467
468 /**
469 * The probability of bin i.
470 *
471 * @param i the index of the bin
472 * @return the probability that selection begins in bin i
473 */
474 private double pB(int i) {
475 return i == 0 ? upperBounds[0] :
476 upperBounds[i] - upperBounds[i - 1];
477 }
478
479 /**
480 * The combined probability of the bins up to but not including bin i.
481 *
482 * @param i the index of the bin
483 * @return the probability that selection begins in a bin below bin i.
484 */
485 private double pBminus(int i) {
486 return i == 0 ? 0 : upperBounds[i - 1];
487 }
488
489 /**
490 * Mass of bin i under the within-bin kernel of the bin.
491 *
492 * @param i index of the bin
493 * @return the difference in the within-bin kernel cdf between the
494 * upper and lower endpoints of bin i
495 */
496 private double kB(int i) {
497 final double[] binBounds = getUpperBounds();
498 final ContinuousDistribution kernel = getKernel(binStats.get(i));
499 return i == 0 ? kernel.probability(min, binBounds[0]) :
500 kernel.probability(binBounds[i - 1], binBounds[i]);
501 }
502
503 /**
504 * The within-bin kernel of the bin that x belongs to.
505 *
506 * @param x the value to locate within a bin
507 * @return the within-bin kernel of the bin containing x
508 */
509 private ContinuousDistribution k(double x) {
510 final int binIndex = findBin(x);
511 return getKernel(binStats.get(binIndex));
512 }
513
514 /**
515 * The combined probability of the bins up to and including binIndex.
516 *
517 * @param binIndex maximum bin index
518 * @return sum of the probabilities of bins through binIndex
519 */
520 private double cumBinP(int binIndex) {
521 return upperBounds[binIndex];
522 }
523
524 /**
525 * @param stats Bin statistics.
526 * @return the within-bin kernel.
527 */
528 private ContinuousDistribution getKernel(SummaryStatistics stats) {
529 return kernelFactory.apply(stats);
530 }
531
532 /**
533 * The within-bin smoothing kernel: A Gaussian distribution
534 * (unless the bin contains 0 or 1 observation, in which case
535 * a constant distribution is returned).
536 *
537 * @return the within-bin kernel factory.
538 */
539 private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() {
540 return stats -> {
541 if (stats.getN() <= 3 ||
542 stats.getVariance() == 0) {
543 return new ConstantContinuousDistribution(stats.getMean());
544 } else {
545 return NormalDistribution.of(stats.getMean(),
546 stats.getStandardDeviation());
547 }
548 };
549 }
550
551 /**
552 * Constant distribution.
553 */
554 private static final class ConstantContinuousDistribution implements ContinuousDistribution {
555 /** Constant value of the distribution. */
556 private final double value;
557
558 /**
559 * Create a constant real distribution with the given value.
560 *
561 * @param value Value of this distribution.
562 */
563 ConstantContinuousDistribution(double value) {
564 this.value = value;
565 }
566
567 /** {@inheritDoc} */
568 @Override
569 public double density(double x) {
570 return x == value ? 1 : 0;
571 }
572
573 /** {@inheritDoc} */
574 @Override
575 public double cumulativeProbability(double x) {
576 return x < value ? 0 : 1;
577 }
578
579 /** {@inheritDoc} */
580 @Override
581 public double inverseCumulativeProbability(final double p) {
582 if (p < 0 ||
583 p > 1) {
584 // Should never happen.
585 throw new IllegalArgumentException("Internal error");
586 }
587 return value;
588 }
589
590 /** {@inheritDoc} */
591 @Override
592 public double getMean() {
593 return value;
594 }
595
596 /** {@inheritDoc} */
597 @Override
598 public double getVariance() {
599 return 0;
600 }
601
602 /**{@inheritDoc} */
603 @Override
604 public double getSupportLowerBound() {
605 return value;
606 }
607
608 /** {@inheritDoc} */
609 @Override
610 public double getSupportUpperBound() {
611 return value;
612 }
613
614 /**
615 * {@inheritDoc}
616 *
617 * @param rng Not used: distribution contains a single value.
618 * @return the value of the distribution.
619 */
620 @Override
621 public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) {
622 return this::getSupportLowerBound;
623 }
624 }
625 }