EmpiricalDistribution.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.math4.legacy.distribution;

  18. import java.util.ArrayList;
  19. import java.util.List;
  20. import java.util.function.Function;

  21. import org.apache.commons.statistics.distribution.NormalDistribution;
  22. import org.apache.commons.statistics.distribution.ContinuousDistribution;
  23. import org.apache.commons.numbers.core.Precision;
  24. import org.apache.commons.rng.UniformRandomProvider;
  25. import org.apache.commons.math4.legacy.exception.OutOfRangeException;
  26. import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
  27. import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary;
  28. import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
  29. import org.apache.commons.math4.core.jdkmath.JdkMath;

  30. /**
  31.  * <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
  32.  * empirical probability distribution</a>: Probability distribution derived
  33.  * from observed data without making any assumptions about the functional
  34.  * form of the population distribution that the data come from.</p>
  35.  *
  36.  * <p>An {@code EmpiricalDistribution} maintains data structures called
  37.  * <i>distribution digests</i> that describe empirical distributions and
  38.  * support the following operations:
  39.  * <ul>
  40.  *  <li>loading the distribution from "observed" data values</li>
  41.  *  <li>dividing the input data into "bin ranges" and reporting bin
  42.  *      frequency counts (data for histogram)</li>
  43.  *  <li>reporting univariate statistics describing the full set of data
  44.  *      values as well as the observations within each bin</li>
  45.  *  <li>generating random values from the distribution</li>
  46.  * </ul>
  47.  *
  48.  * Applications can use {@code EmpiricalDistribution} to build grouped
  49.  * frequency histograms representing the input data or to generate random
  50.  * values "like" those in the input, i.e. the values generated will follow
  51.  * the distribution of the values in the file.
  52.  *
  53.  * <p>The implementation uses what amounts to the
  54.  * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
  55.  * Variable Kernel Method</a> with Gaussian smoothing:<p>
  56.  * <strong>Digesting the input file</strong>
  57.  * <ol>
  58.  *  <li>Pass the file once to compute min and max.</li>
  59.  *  <li>Divide the range from min to max into {@code binCount} bins.</li>
  60.  *  <li>Pass the data file again, computing bin counts and univariate
  61.  *      statistics (mean and std dev.) for each bin.</li>
  62.  *  <li>Divide the interval (0,1) into subintervals associated with the bins,
  63.  *      with the length of a bin's subinterval proportional to its count.</li>
  64.  * </ol>
  65.  * <strong>Generating random values from the distribution</strong>
  66.  * <ol>
  67.  *  <li>Generate a uniformly distributed value in (0,1) </li>
  68.  *  <li>Select the subinterval to which the value belongs.
  69.  *  <li>Generate a random Gaussian value with mean = mean of the associated
  70.  *      bin and std dev = std dev of associated bin.</li>
  71.  * </ol>
  72.  *
  73.  * <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
  74.  * as follows.  Given x within the range of values in the dataset, let B
  75.  * be the bin containing x and let K be the within-bin kernel for B.  Let P(B-)
  76.  * be the sum of the probabilities of the bins below B and let K(B) be the
  77.  * mass of B under K (i.e., the integral of the kernel density over B).  Then
  78.  * set {@code P(X < x) = P(B-) + P(B) * K(x) / K(B)} where {@code K(x)} is the
  79.  * kernel distribution evaluated at x. This results in a cdf that matches the
  80.  * grouped frequency distribution at the bin endpoints and interpolates within
  81.  * bins using within-bin kernels.</p>
  82.  *
  83.  * <strong>CAVEAT</strong>: It is advised that the {@link #from(int,double[])
  84.  * bin count} is about one tenth of the size of the input array.
  85.  */
  86. public final class EmpiricalDistribution extends AbstractRealDistribution
  87.     implements ContinuousDistribution {
  88.     /** Bins characteristics. */
  89.     private final List<SummaryStatistics> binStats;
  90.     /** Sample statistics. */
  91.     private final SummaryStatistics sampleStats;
  92.     /** Max loaded value. */
  93.     private final double max;
  94.     /** Min loaded value. */
  95.     private final double min;
  96.     /** Grid size. */
  97.     private final double delta;
  98.     /** Number of bins. */
  99.     private final int binCount;
  100.     /** Upper bounds of subintervals in (0, 1) belonging to the bins. */
  101.     private final double[] upperBounds;
  102.     /** Kernel factory. */
  103.     private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory;

  104.     /**
  105.      * Creates a new instance with the specified data.
  106.      *
  107.      * @param binCount Number of bins.  Must be strictly positive.
  108.      * @param input Input data.  Cannot be {@code null}.
  109.      * @param kernelFactory Kernel factory.
  110.      * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
  111.      */
  112.     private EmpiricalDistribution(int binCount,
  113.                                   double[] input,
  114.                                   Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
  115.         if (binCount <= 0) {
  116.             throw new NotStrictlyPositiveException(binCount);
  117.         }
  118.         this.binCount = binCount;

  119.         // First pass through the data.
  120.         sampleStats = new SummaryStatistics();
  121.         for (int i = 0; i < input.length; i++) {
  122.             sampleStats.addValue(input[i]);
  123.         }

  124.         // Set up grid.
  125.         min = sampleStats.getMin();
  126.         max = sampleStats.getMax();
  127.         delta = (max - min) / binCount;

  128.         // Second pass through the data.
  129.         binStats = createBinStats(input);

  130.         // Assign upper bounds based on bin counts.
  131.         upperBounds = new double[binCount];
  132.         final double n = sampleStats.getN();
  133.         upperBounds[0] = binStats.get(0).getN() / n;
  134.         for (int i = 1; i < binCount - 1; i++) {
  135.             upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n;
  136.         }
  137.         upperBounds[binCount - 1] = 1d;

  138.         this.kernelFactory = kernelFactory;
  139.      }

  140.     /**
  141.      * Factory that creates a new instance from the specified data.
  142.      *
  143.      * @param binCount Number of bins.  Must be strictly positive.
  144.      * @param input Input data.  Cannot be {@code null}.
  145.      * @param kernelFactory Factory for creating within-bin kernels.
  146.      * @return a new instance.
  147.      * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
  148.      */
  149.     public static EmpiricalDistribution from(int binCount,
  150.                                              double[] input,
  151.                                              Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
  152.         return new EmpiricalDistribution(binCount,
  153.                                          input,
  154.                                          kernelFactory);
  155.     }

  156.     /**
  157.      * Factory that creates a new instance from the specified data.
  158.      *
  159.      * @param binCount Number of bins.  Must be strictly positive.
  160.      * @param input Input data.  Cannot be {@code null}.
  161.      * @return a new instance.
  162.      * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
  163.      */
  164.     public static EmpiricalDistribution from(int binCount,
  165.                                              double[] input) {
  166.         return from(binCount, input, defaultKernel());
  167.     }

  168.     /**
  169.      * Create statistics (second pass through the data).
  170.      *
  171.      * @param input Input data.
  172.      * @return bins statistics.
  173.      */
  174.     private List<SummaryStatistics> createBinStats(double[] input) {
  175.         final List<SummaryStatistics> stats = new ArrayList<>();

  176.         for (int i = 0; i < binCount; i++) {
  177.             stats.add(i, new SummaryStatistics());
  178.         }

  179.         // Second pass though the data.
  180.         for (int i = 0; i < input.length; i++) {
  181.             final double v = input[i];
  182.             stats.get(findBin(v)).addValue(v);
  183.         }

  184.         return stats;
  185.     }

  186.     /**
  187.      * Returns the index of the bin to which the given value belongs.
  188.      *
  189.      * @param value Value whose bin we are trying to find.
  190.      * @return the index of the bin containing the value.
  191.      */
  192.     private int findBin(double value) {
  193.         return Math.min(Math.max((int) JdkMath.ceil((value - min) / delta) - 1,
  194.                                  0),
  195.                         binCount - 1);
  196.     }

  197.     /**
  198.      * Returns a {@link StatisticalSummary} describing this distribution.
  199.      * <strong>Preconditions:</strong><ul>
  200.      * <li>the distribution must be loaded before invoking this method</li></ul>
  201.      *
  202.      * @return the sample statistics
  203.      * @throws IllegalStateException if the distribution has not been loaded
  204.      */
  205.     public StatisticalSummary getSampleStats() {
  206.         return sampleStats.copy();
  207.     }

  208.     /**
  209.      * Returns the number of bins.
  210.      *
  211.      * @return the number of bins.
  212.      */
  213.     public int getBinCount() {
  214.         return binCount;
  215.     }

  216.     /**
  217.      * Returns a copy of the {@link SummaryStatistics} instances containing
  218.      * statistics describing the values in each of the bins.
  219.      * The list is indexed on the bin number.
  220.      *
  221.      * @return the bins statistics.
  222.      */
  223.     public List<SummaryStatistics> getBinStats() {
  224.         final List<SummaryStatistics> copy = new ArrayList<>();
  225.         for (SummaryStatistics s : binStats) {
  226.             copy.add(s.copy());
  227.         }
  228.         return copy;
  229.     }

  230.     /**
  231.      * Returns the upper bounds of the bins.
  232.      *
  233.      * Assuming array {@code u} is returned by this method, the bins are:
  234.      * <ul>
  235.      *  <li>{@code (min, u[0])},</li>
  236.      *  <li>{@code (u[0], u[1])},</li>
  237.      *  <li>... ,</li>
  238.      *  <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li>
  239.      * </ul>
  240.      *
  241.      * @return the bins upper bounds.
  242.      *
  243.      * @since 2.1
  244.      */
  245.     public double[] getUpperBounds() {
  246.         double[] binUpperBounds = new double[binCount];
  247.         for (int i = 0; i < binCount - 1; i++) {
  248.             binUpperBounds[i] = min + delta * (i + 1);
  249.         }
  250.         binUpperBounds[binCount - 1] = max;
  251.         return binUpperBounds;
  252.     }

  253.     /**
  254.      * Returns the upper bounds of the subintervals of [0, 1] used in generating
  255.      * data from the empirical distribution.
  256.      * Subintervals correspond to bins with lengths proportional to bin counts.
  257.      *
  258.      * <strong>Preconditions:</strong><ul>
  259.      * <li>the distribution must be loaded before invoking this method</li></ul>
  260.      *
  261.      * @return array of upper bounds of subintervals used in data generation
  262.      * @throws NullPointerException unless a {@code load} method has been
  263.      * called beforehand.
  264.      *
  265.      * @since 2.1
  266.      */
  267.     public double[] getGeneratorUpperBounds() {
  268.         int len = upperBounds.length;
  269.         double[] out = new double[len];
  270.         System.arraycopy(upperBounds, 0, out, 0, len);
  271.         return out;
  272.     }

  273.     // Distribution methods.

  274.     /**
  275.      * {@inheritDoc}
  276.      *
  277.      * Returns the kernel density normalized so that its integral over each bin
  278.      * equals the bin mass.
  279.      *
  280.      * Algorithm description:
  281.      * <ol>
  282.      *  <li>Find the bin B that x belongs to.</li>
  283.      *  <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
  284.      *   integral of the kernel density over B).</li>
  285.      *  <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
  286.      *   and P(B) is the mass of B.</li>
  287.      * </ol>
  288.      *
  289.      * @since 3.1
  290.      */
  291.     @Override
  292.     public double density(double x) {
  293.         if (x < min || x > max) {
  294.             return 0d;
  295.         }
  296.         final int binIndex = findBin(x);
  297.         final ContinuousDistribution kernel = getKernel(binStats.get(binIndex));
  298.         return kernel.density(x) * pB(binIndex) / kB(binIndex);
  299.     }

  300.     /**
  301.      * {@inheritDoc}
  302.      *
  303.      * Algorithm description:
  304.      * <ol>
  305.      *  <li>Find the bin B that x belongs to.</li>
  306.      *  <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
  307.      *  <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
  308.      *   and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
  309.      *  <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
  310.      *   K(x) is the within-bin kernel distribution function evaluated at x.</li>
  311.      * </ol>
  312.      * If K is a constant distribution, we return P(B-) + P(B) (counting the full
  313.      * mass of B).
  314.      *
  315.      * @since 3.1
  316.      */
  317.     @Override
  318.     public double cumulativeProbability(double x) {
  319.         if (x < min) {
  320.             return 0d;
  321.         } else if (x >= max) {
  322.             return 1d;
  323.         }
  324.         final int binIndex = findBin(x);
  325.         final double pBminus = pBminus(binIndex);
  326.         final double pB = pB(binIndex);
  327.         final ContinuousDistribution kernel = k(x);
  328.         if (kernel instanceof ConstantContinuousDistribution) {
  329.             if (x < kernel.getMean()) {
  330.                 return pBminus;
  331.             } else {
  332.                 return pBminus + pB;
  333.             }
  334.         }
  335.         final double[] binBounds = getUpperBounds();
  336.         final double kB = kB(binIndex);
  337.         final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
  338.         final double withinBinCum =
  339.             (kernel.cumulativeProbability(x) -  kernel.cumulativeProbability(lower)) / kB;
  340.         return pBminus + pB * withinBinCum;
  341.     }

  342.     /**
  343.      * {@inheritDoc}
  344.      *
  345.      * Algorithm description:
  346.      * <ol>
  347.      *  <li>Find the smallest i such that the sum of the masses of the bins
  348.      *   through i is at least p.</li>
  349.      *  <li>
  350.      *   <ol>
  351.      *    <li>Let K be the within-bin kernel distribution for bin i.</li>
  352.      *    <li>Let K(B) be the mass of B under K.</li>
  353.      *    <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined
  354.      *     mass of the bins below B under K).</li>
  355.      *    <li>Let P(B) be the probability of bin i.</li>
  356.      *    <li>Let P(B-) be the sum of the bin masses below bin i.</li>
  357.      *    <li>Let pCrit = p - P(B-)</li>
  358.      *   </ol>
  359.      *  </li>
  360.      *  <li>Return the inverse of K evaluated at
  361.      *    K(B-) + pCrit * K(B) / P(B) </li>
  362.      * </ol>
  363.      *
  364.      * @since 3.1
  365.      */
  366.     @Override
  367.     public double inverseCumulativeProbability(final double p) {
  368.         if (p < 0 ||
  369.             p > 1) {
  370.             throw new OutOfRangeException(p, 0, 1);
  371.         }

  372.         if (p == 0) {
  373.             return getSupportLowerBound();
  374.         }

  375.         if (p == 1) {
  376.             return getSupportUpperBound();
  377.         }

  378.         int i = 0;
  379.         while (cumBinP(i) < p) {
  380.             ++i;
  381.         }

  382.         final SummaryStatistics stats = binStats.get(i);
  383.         final ContinuousDistribution kernel = getKernel(stats);
  384.         final double kB = kB(i);
  385.         final double[] binBounds = getUpperBounds();
  386.         final double lower = i == 0 ? min : binBounds[i - 1];
  387.         final double kBminus = kernel.cumulativeProbability(lower);
  388.         final double pB = pB(i);
  389.         final double pBminus = pBminus(i);
  390.         final double pCrit = p - pBminus;
  391.         if (pCrit <= 0) {
  392.             return lower;
  393.         }

  394.         final double cP = kBminus + pCrit * kB / pB;

  395.         return Precision.equals(cP, 1d) ?
  396.             kernel.inverseCumulativeProbability(1d) :
  397.             kernel.inverseCumulativeProbability(cP);
  398.     }

  399.     /**
  400.      * {@inheritDoc}
  401.      * @since 3.1
  402.      */
  403.     @Override
  404.     public double getMean() {
  405.        return sampleStats.getMean();
  406.     }

  407.     /**
  408.      * {@inheritDoc}
  409.      * @since 3.1
  410.      */
  411.     @Override
  412.     public double getVariance() {
  413.         return sampleStats.getVariance();
  414.     }

  415.     /**
  416.      * {@inheritDoc}
  417.      * @since 3.1
  418.      */
  419.     @Override
  420.     public double getSupportLowerBound() {
  421.        return min;
  422.     }

  423.     /**
  424.      * {@inheritDoc}
  425.      * @since 3.1
  426.      */
  427.     @Override
  428.     public double getSupportUpperBound() {
  429.         return max;
  430.     }

  431.     /**
  432.      * The probability of bin i.
  433.      *
  434.      * @param i the index of the bin
  435.      * @return the probability that selection begins in bin i
  436.      */
  437.     private double pB(int i) {
  438.         return i == 0 ? upperBounds[0] :
  439.             upperBounds[i] - upperBounds[i - 1];
  440.     }

  441.     /**
  442.      * The combined probability of the bins up to but not including bin i.
  443.      *
  444.      * @param i the index of the bin
  445.      * @return the probability that selection begins in a bin below bin i.
  446.      */
  447.     private double pBminus(int i) {
  448.         return i == 0 ? 0 : upperBounds[i - 1];
  449.     }

  450.     /**
  451.      * Mass of bin i under the within-bin kernel of the bin.
  452.      *
  453.      * @param i index of the bin
  454.      * @return the difference in the within-bin kernel cdf between the
  455.      * upper and lower endpoints of bin i
  456.      */
  457.     private double kB(int i) {
  458.         final double[] binBounds = getUpperBounds();
  459.         final ContinuousDistribution kernel = getKernel(binStats.get(i));
  460.         return i == 0 ? kernel.probability(min, binBounds[0]) :
  461.             kernel.probability(binBounds[i - 1], binBounds[i]);
  462.     }

  463.     /**
  464.      * The within-bin kernel of the bin that x belongs to.
  465.      *
  466.      * @param x the value to locate within a bin
  467.      * @return the within-bin kernel of the bin containing x
  468.      */
  469.     private ContinuousDistribution k(double x) {
  470.         final int binIndex = findBin(x);
  471.         return getKernel(binStats.get(binIndex));
  472.     }

  473.     /**
  474.      * The combined probability of the bins up to and including binIndex.
  475.      *
  476.      * @param binIndex maximum bin index
  477.      * @return sum of the probabilities of bins through binIndex
  478.      */
  479.     private double cumBinP(int binIndex) {
  480.         return upperBounds[binIndex];
  481.     }

  482.     /**
  483.      * @param stats Bin statistics.
  484.      * @return the within-bin kernel.
  485.      */
  486.     private ContinuousDistribution getKernel(SummaryStatistics stats) {
  487.         return kernelFactory.apply(stats);
  488.     }

  489.     /**
  490.      * The within-bin smoothing kernel: A Gaussian distribution
  491.      * (unless the bin contains 0 or 1 observation, in which case
  492.      * a constant distribution is returned).
  493.      *
  494.      * @return the within-bin kernel factory.
  495.      */
  496.     private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() {
  497.         return stats -> {
  498.             if (stats.getN() <= 3 ||
  499.                 stats.getVariance() == 0) {
  500.                 return new ConstantContinuousDistribution(stats.getMean());
  501.             } else {
  502.                 return NormalDistribution.of(stats.getMean(),
  503.                                              stats.getStandardDeviation());
  504.             }
  505.         };
  506.     }

  507.     /**
  508.      * Constant distribution.
  509.      */
  510.     private static final class ConstantContinuousDistribution implements ContinuousDistribution {
  511.         /** Constant value of the distribution. */
  512.         private final double value;

  513.         /**
  514.          * Create a constant real distribution with the given value.
  515.          *
  516.          * @param value Value of this distribution.
  517.          */
  518.         ConstantContinuousDistribution(double value) {
  519.             this.value = value;
  520.         }

  521.         /** {@inheritDoc} */
  522.         @Override
  523.         public double density(double x) {
  524.             return x == value ? 1 : 0;
  525.         }

  526.         /** {@inheritDoc} */
  527.         @Override
  528.         public double cumulativeProbability(double x)  {
  529.             return x < value ? 0 : 1;
  530.         }

  531.         /** {@inheritDoc} */
  532.         @Override
  533.         public double inverseCumulativeProbability(final double p) {
  534.             if (p < 0 ||
  535.                 p > 1) {
  536.                 // Should never happen.
  537.                 throw new IllegalArgumentException("Internal error");
  538.             }
  539.             return value;
  540.         }

  541.         /** {@inheritDoc} */
  542.         @Override
  543.         public double getMean() {
  544.             return value;
  545.         }

  546.         /** {@inheritDoc} */
  547.         @Override
  548.         public double getVariance() {
  549.             return 0;
  550.         }

  551.         /**{@inheritDoc} */
  552.         @Override
  553.         public double getSupportLowerBound() {
  554.             return value;
  555.         }

  556.         /** {@inheritDoc} */
  557.         @Override
  558.         public double getSupportUpperBound() {
  559.             return value;
  560.         }

  561.         /**
  562.          * {@inheritDoc}
  563.          *
  564.          * @param rng Not used: distribution contains a single value.
  565.          * @return the value of the distribution.
  566.          */
  567.         @Override
  568.         public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) {
  569.             return this::getSupportLowerBound;
  570.         }
  571.     }
  572. }