001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math4.legacy.distribution; 019 020import java.util.ArrayList; 021import java.util.List; 022import java.util.function.Function; 023 024import org.apache.commons.statistics.distribution.NormalDistribution; 025import org.apache.commons.statistics.distribution.ContinuousDistribution; 026import org.apache.commons.numbers.core.Precision; 027import org.apache.commons.rng.UniformRandomProvider; 028import org.apache.commons.math4.legacy.exception.OutOfRangeException; 029import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException; 030import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary; 031import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics; 032import org.apache.commons.math4.core.jdkmath.JdkMath; 033 034/** 035 * <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function"> 036 * empirical probability distribution</a>: Probability distribution derived 037 * from observed data without making any assumptions about the functional 038 * form of the population distribution that the data come from.</p> 039 * 040 * <p>An {@code EmpiricalDistribution} maintains data structures called 041 * <i>distribution digests</i> that describe empirical distributions and 042 * support the following operations: 043 * <ul> 044 * <li>loading the distribution from "observed" data values</li> 045 * <li>dividing the input data into "bin ranges" and reporting bin 046 * frequency counts (data for histogram)</li> 047 * <li>reporting univariate statistics describing the full set of data 048 * values as well as the observations within each bin</li> 049 * <li>generating random values from the distribution</li> 050 * </ul> 051 * 052 * Applications can use {@code EmpiricalDistribution} to build grouped 053 * frequency histograms representing the input data or to generate random 054 * values "like" those in the input, i.e. the values generated will follow 055 * the distribution of the values in the file. 056 * 057 * <p>The implementation uses what amounts to the 058 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html"> 059 * Variable Kernel Method</a> with Gaussian smoothing:<p> 060 * <strong>Digesting the input file</strong> 061 * <ol> 062 * <li>Pass the file once to compute min and max.</li> 063 * <li>Divide the range from min to max into {@code binCount} bins.</li> 064 * <li>Pass the data file again, computing bin counts and univariate 065 * statistics (mean and std dev.) for each bin.</li> 066 * <li>Divide the interval (0,1) into subintervals associated with the bins, 067 * with the length of a bin's subinterval proportional to its count.</li> 068 * </ol> 069 * <strong>Generating random values from the distribution</strong> 070 * <ol> 071 * <li>Generate a uniformly distributed value in (0,1) </li> 072 * <li>Select the subinterval to which the value belongs. 073 * <li>Generate a random Gaussian value with mean = mean of the associated 074 * bin and std dev = std dev of associated bin.</li> 075 * </ol> 076 * 077 * <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface 078 * as follows. Given x within the range of values in the dataset, let B 079 * be the bin containing x and let K be the within-bin kernel for B. Let P(B-) 080 * be the sum of the probabilities of the bins below B and let K(B) be the 081 * mass of B under K (i.e., the integral of the kernel density over B). Then 082 * set {@code P(X < x) = P(B-) + P(B) * K(x) / K(B)} where {@code K(x)} is the 083 * kernel distribution evaluated at x. This results in a cdf that matches the 084 * grouped frequency distribution at the bin endpoints and interpolates within 085 * bins using within-bin kernels.</p> 086 * 087 * <strong>CAVEAT</strong>: It is advised that the {@link #from(int,double[]) 088 * bin count} is about one tenth of the size of the input array. 089 */ 090public final class EmpiricalDistribution extends AbstractRealDistribution 091 implements ContinuousDistribution { 092 /** Bins characteristics. */ 093 private final List<SummaryStatistics> binStats; 094 /** Sample statistics. */ 095 private final SummaryStatistics sampleStats; 096 /** Max loaded value. */ 097 private final double max; 098 /** Min loaded value. */ 099 private final double min; 100 /** Grid size. */ 101 private final double delta; 102 /** Number of bins. */ 103 private final int binCount; 104 /** Upper bounds of subintervals in (0, 1) belonging to the bins. */ 105 private final double[] upperBounds; 106 /** Kernel factory. */ 107 private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory; 108 109 /** 110 * Creates a new instance with the specified data. 111 * 112 * @param binCount Number of bins. Must be strictly positive. 113 * @param input Input data. Cannot be {@code null}. 114 * @param kernelFactory Kernel factory. 115 * @throws NotStrictlyPositiveException if {@code binCount <= 0}. 116 */ 117 private EmpiricalDistribution(int binCount, 118 double[] input, 119 Function<SummaryStatistics, ContinuousDistribution> kernelFactory) { 120 if (binCount <= 0) { 121 throw new NotStrictlyPositiveException(binCount); 122 } 123 this.binCount = binCount; 124 125 // First pass through the data. 126 sampleStats = new SummaryStatistics(); 127 for (int i = 0; i < input.length; i++) { 128 sampleStats.addValue(input[i]); 129 } 130 131 // Set up grid. 132 min = sampleStats.getMin(); 133 max = sampleStats.getMax(); 134 delta = (max - min) / binCount; 135 136 // Second pass through the data. 137 binStats = createBinStats(input); 138 139 // Assign upper bounds based on bin counts. 140 upperBounds = new double[binCount]; 141 final double n = sampleStats.getN(); 142 upperBounds[0] = binStats.get(0).getN() / n; 143 for (int i = 1; i < binCount - 1; i++) { 144 upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n; 145 } 146 upperBounds[binCount - 1] = 1d; 147 148 this.kernelFactory = kernelFactory; 149 } 150 151 /** 152 * Factory that creates a new instance from the specified data. 153 * 154 * @param binCount Number of bins. Must be strictly positive. 155 * @param input Input data. Cannot be {@code null}. 156 * @param kernelFactory Factory for creating within-bin kernels. 157 * @return a new instance. 158 * @throws NotStrictlyPositiveException if {@code binCount <= 0}. 159 */ 160 public static EmpiricalDistribution from(int binCount, 161 double[] input, 162 Function<SummaryStatistics, ContinuousDistribution> kernelFactory) { 163 return new EmpiricalDistribution(binCount, 164 input, 165 kernelFactory); 166 } 167 168 /** 169 * Factory that creates a new instance from the specified data. 170 * 171 * @param binCount Number of bins. Must be strictly positive. 172 * @param input Input data. Cannot be {@code null}. 173 * @return a new instance. 174 * @throws NotStrictlyPositiveException if {@code binCount <= 0}. 175 */ 176 public static EmpiricalDistribution from(int binCount, 177 double[] input) { 178 return from(binCount, input, defaultKernel()); 179 } 180 181 /** 182 * Create statistics (second pass through the data). 183 * 184 * @param input Input data. 185 * @return bins statistics. 186 */ 187 private List<SummaryStatistics> createBinStats(double[] input) { 188 final List<SummaryStatistics> stats = new ArrayList<>(); 189 190 for (int i = 0; i < binCount; i++) { 191 stats.add(i, new SummaryStatistics()); 192 } 193 194 // Second pass though the data. 195 for (int i = 0; i < input.length; i++) { 196 final double v = input[i]; 197 stats.get(findBin(v)).addValue(v); 198 } 199 200 return stats; 201 } 202 203 /** 204 * Returns the index of the bin to which the given value belongs. 205 * 206 * @param value Value whose bin we are trying to find. 207 * @return the index of the bin containing the value. 208 */ 209 private int findBin(double value) { 210 return Math.min(Math.max((int) JdkMath.ceil((value - min) / delta) - 1, 211 0), 212 binCount - 1); 213 } 214 215 /** 216 * Returns a {@link StatisticalSummary} describing this distribution. 217 * <strong>Preconditions:</strong><ul> 218 * <li>the distribution must be loaded before invoking this method</li></ul> 219 * 220 * @return the sample statistics 221 * @throws IllegalStateException if the distribution has not been loaded 222 */ 223 public StatisticalSummary getSampleStats() { 224 return sampleStats.copy(); 225 } 226 227 /** 228 * Returns the number of bins. 229 * 230 * @return the number of bins. 231 */ 232 public int getBinCount() { 233 return binCount; 234 } 235 236 /** 237 * Returns a copy of the {@link SummaryStatistics} instances containing 238 * statistics describing the values in each of the bins. 239 * The list is indexed on the bin number. 240 * 241 * @return the bins statistics. 242 */ 243 public List<SummaryStatistics> getBinStats() { 244 final List<SummaryStatistics> copy = new ArrayList<>(); 245 for (SummaryStatistics s : binStats) { 246 copy.add(s.copy()); 247 } 248 return copy; 249 } 250 251 /** 252 * Returns the upper bounds of the bins. 253 * 254 * Assuming array {@code u} is returned by this method, the bins are: 255 * <ul> 256 * <li>{@code (min, u[0])},</li> 257 * <li>{@code (u[0], u[1])},</li> 258 * <li>... ,</li> 259 * <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li> 260 * </ul> 261 * 262 * @return the bins upper bounds. 263 * 264 * @since 2.1 265 */ 266 public double[] getUpperBounds() { 267 double[] binUpperBounds = new double[binCount]; 268 for (int i = 0; i < binCount - 1; i++) { 269 binUpperBounds[i] = min + delta * (i + 1); 270 } 271 binUpperBounds[binCount - 1] = max; 272 return binUpperBounds; 273 } 274 275 /** 276 * Returns the upper bounds of the subintervals of [0, 1] used in generating 277 * data from the empirical distribution. 278 * Subintervals correspond to bins with lengths proportional to bin counts. 279 * 280 * <strong>Preconditions:</strong><ul> 281 * <li>the distribution must be loaded before invoking this method</li></ul> 282 * 283 * @return array of upper bounds of subintervals used in data generation 284 * @throws NullPointerException unless a {@code load} method has been 285 * called beforehand. 286 * 287 * @since 2.1 288 */ 289 public double[] getGeneratorUpperBounds() { 290 int len = upperBounds.length; 291 double[] out = new double[len]; 292 System.arraycopy(upperBounds, 0, out, 0, len); 293 return out; 294 } 295 296 // Distribution methods. 297 298 /** 299 * {@inheritDoc} 300 * 301 * Returns the kernel density normalized so that its integral over each bin 302 * equals the bin mass. 303 * 304 * Algorithm description: 305 * <ol> 306 * <li>Find the bin B that x belongs to.</li> 307 * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the 308 * integral of the kernel density over B).</li> 309 * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density 310 * and P(B) is the mass of B.</li> 311 * </ol> 312 * 313 * @since 3.1 314 */ 315 @Override 316 public double density(double x) { 317 if (x < min || x > max) { 318 return 0d; 319 } 320 final int binIndex = findBin(x); 321 final ContinuousDistribution kernel = getKernel(binStats.get(binIndex)); 322 return kernel.density(x) * pB(binIndex) / kB(binIndex); 323 } 324 325 /** 326 * {@inheritDoc} 327 * 328 * Algorithm description: 329 * <ol> 330 * <li>Find the bin B that x belongs to.</li> 331 * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li> 332 * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel 333 * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li> 334 * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where 335 * K(x) is the within-bin kernel distribution function evaluated at x.</li> 336 * </ol> 337 * If K is a constant distribution, we return P(B-) + P(B) (counting the full 338 * mass of B). 339 * 340 * @since 3.1 341 */ 342 @Override 343 public double cumulativeProbability(double x) { 344 if (x < min) { 345 return 0d; 346 } else if (x >= max) { 347 return 1d; 348 } 349 final int binIndex = findBin(x); 350 final double pBminus = pBminus(binIndex); 351 final double pB = pB(binIndex); 352 final ContinuousDistribution kernel = k(x); 353 if (kernel instanceof ConstantContinuousDistribution) { 354 if (x < kernel.getMean()) { 355 return pBminus; 356 } else { 357 return pBminus + pB; 358 } 359 } 360 final double[] binBounds = getUpperBounds(); 361 final double kB = kB(binIndex); 362 final double lower = binIndex == 0 ? min : binBounds[binIndex - 1]; 363 final double withinBinCum = 364 (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB; 365 return pBminus + pB * withinBinCum; 366 } 367 368 /** 369 * {@inheritDoc} 370 * 371 * Algorithm description: 372 * <ol> 373 * <li>Find the smallest i such that the sum of the masses of the bins 374 * through i is at least p.</li> 375 * <li> 376 * <ol> 377 * <li>Let K be the within-bin kernel distribution for bin i.</li> 378 * <li>Let K(B) be the mass of B under K.</li> 379 * <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined 380 * mass of the bins below B under K).</li> 381 * <li>Let P(B) be the probability of bin i.</li> 382 * <li>Let P(B-) be the sum of the bin masses below bin i.</li> 383 * <li>Let pCrit = p - P(B-)</li> 384 * </ol> 385 * </li> 386 * <li>Return the inverse of K evaluated at 387 * K(B-) + pCrit * K(B) / P(B) </li> 388 * </ol> 389 * 390 * @since 3.1 391 */ 392 @Override 393 public double inverseCumulativeProbability(final double p) { 394 if (p < 0 || 395 p > 1) { 396 throw new OutOfRangeException(p, 0, 1); 397 } 398 399 if (p == 0) { 400 return getSupportLowerBound(); 401 } 402 403 if (p == 1) { 404 return getSupportUpperBound(); 405 } 406 407 int i = 0; 408 while (cumBinP(i) < p) { 409 ++i; 410 } 411 412 final SummaryStatistics stats = binStats.get(i); 413 final ContinuousDistribution kernel = getKernel(stats); 414 final double kB = kB(i); 415 final double[] binBounds = getUpperBounds(); 416 final double lower = i == 0 ? min : binBounds[i - 1]; 417 final double kBminus = kernel.cumulativeProbability(lower); 418 final double pB = pB(i); 419 final double pBminus = pBminus(i); 420 final double pCrit = p - pBminus; 421 if (pCrit <= 0) { 422 return lower; 423 } 424 425 final double cP = kBminus + pCrit * kB / pB; 426 427 return Precision.equals(cP, 1d) ? 428 kernel.inverseCumulativeProbability(1d) : 429 kernel.inverseCumulativeProbability(cP); 430 } 431 432 /** 433 * {@inheritDoc} 434 * @since 3.1 435 */ 436 @Override 437 public double getMean() { 438 return sampleStats.getMean(); 439 } 440 441 /** 442 * {@inheritDoc} 443 * @since 3.1 444 */ 445 @Override 446 public double getVariance() { 447 return sampleStats.getVariance(); 448 } 449 450 /** 451 * {@inheritDoc} 452 * @since 3.1 453 */ 454 @Override 455 public double getSupportLowerBound() { 456 return min; 457 } 458 459 /** 460 * {@inheritDoc} 461 * @since 3.1 462 */ 463 @Override 464 public double getSupportUpperBound() { 465 return max; 466 } 467 468 /** 469 * The probability of bin i. 470 * 471 * @param i the index of the bin 472 * @return the probability that selection begins in bin i 473 */ 474 private double pB(int i) { 475 return i == 0 ? upperBounds[0] : 476 upperBounds[i] - upperBounds[i - 1]; 477 } 478 479 /** 480 * The combined probability of the bins up to but not including bin i. 481 * 482 * @param i the index of the bin 483 * @return the probability that selection begins in a bin below bin i. 484 */ 485 private double pBminus(int i) { 486 return i == 0 ? 0 : upperBounds[i - 1]; 487 } 488 489 /** 490 * Mass of bin i under the within-bin kernel of the bin. 491 * 492 * @param i index of the bin 493 * @return the difference in the within-bin kernel cdf between the 494 * upper and lower endpoints of bin i 495 */ 496 private double kB(int i) { 497 final double[] binBounds = getUpperBounds(); 498 final ContinuousDistribution kernel = getKernel(binStats.get(i)); 499 return i == 0 ? kernel.probability(min, binBounds[0]) : 500 kernel.probability(binBounds[i - 1], binBounds[i]); 501 } 502 503 /** 504 * The within-bin kernel of the bin that x belongs to. 505 * 506 * @param x the value to locate within a bin 507 * @return the within-bin kernel of the bin containing x 508 */ 509 private ContinuousDistribution k(double x) { 510 final int binIndex = findBin(x); 511 return getKernel(binStats.get(binIndex)); 512 } 513 514 /** 515 * The combined probability of the bins up to and including binIndex. 516 * 517 * @param binIndex maximum bin index 518 * @return sum of the probabilities of bins through binIndex 519 */ 520 private double cumBinP(int binIndex) { 521 return upperBounds[binIndex]; 522 } 523 524 /** 525 * @param stats Bin statistics. 526 * @return the within-bin kernel. 527 */ 528 private ContinuousDistribution getKernel(SummaryStatistics stats) { 529 return kernelFactory.apply(stats); 530 } 531 532 /** 533 * The within-bin smoothing kernel: A Gaussian distribution 534 * (unless the bin contains 0 or 1 observation, in which case 535 * a constant distribution is returned). 536 * 537 * @return the within-bin kernel factory. 538 */ 539 private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() { 540 return stats -> { 541 if (stats.getN() <= 3 || 542 stats.getVariance() == 0) { 543 return new ConstantContinuousDistribution(stats.getMean()); 544 } else { 545 return NormalDistribution.of(stats.getMean(), 546 stats.getStandardDeviation()); 547 } 548 }; 549 } 550 551 /** 552 * Constant distribution. 553 */ 554 private static class ConstantContinuousDistribution implements ContinuousDistribution { 555 /** Constant value of the distribution. */ 556 private final double value; 557 558 /** 559 * Create a constant real distribution with the given value. 560 * 561 * @param value Value of this distribution. 562 */ 563 ConstantContinuousDistribution(double value) { 564 this.value = value; 565 } 566 567 /** {@inheritDoc} */ 568 @Override 569 public double density(double x) { 570 return x == value ? 1 : 0; 571 } 572 573 /** {@inheritDoc} */ 574 @Override 575 public double cumulativeProbability(double x) { 576 return x < value ? 0 : 1; 577 } 578 579 /** {@inheritDoc} */ 580 @Override 581 public double inverseCumulativeProbability(final double p) { 582 if (p < 0 || 583 p > 1) { 584 // Should never happen. 585 throw new IllegalArgumentException("Internal error"); 586 } 587 return value; 588 } 589 590 /** {@inheritDoc} */ 591 @Override 592 public double getMean() { 593 return value; 594 } 595 596 /** {@inheritDoc} */ 597 @Override 598 public double getVariance() { 599 return 0; 600 } 601 602 /**{@inheritDoc} */ 603 @Override 604 public double getSupportLowerBound() { 605 return value; 606 } 607 608 /** {@inheritDoc} */ 609 @Override 610 public double getSupportUpperBound() { 611 return value; 612 } 613 614 /** 615 * {@inheritDoc} 616 * 617 * @param rng Not used: distribution contains a single value. 618 * @return the value of the distribution. 619 */ 620 @Override 621 public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) { 622 return this::getSupportLowerBound; 623 } 624 } 625}