View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math4.legacy.distribution;
18  
19  import java.io.BufferedReader;
20  import java.io.IOException;
21  import java.io.InputStreamReader;
22  import java.net.URL;
23  import java.util.ArrayList;
24  
25  import org.apache.commons.rng.UniformRandomProvider;
26  import org.apache.commons.rng.simple.RandomSource;
27  import org.apache.commons.statistics.distribution.ContinuousDistribution;
28  import org.apache.commons.statistics.distribution.UniformContinuousDistribution;
29  import org.apache.commons.statistics.distribution.NormalDistribution;
30  import org.apache.commons.statistics.distribution.ExponentialDistribution;
31  import org.apache.commons.math4.legacy.TestUtils;
32  import org.apache.commons.math4.legacy.analysis.UnivariateFunction;
33  import org.apache.commons.math4.legacy.analysis.integration.BaseAbstractUnivariateIntegrator;
34  import org.apache.commons.math4.legacy.analysis.integration.IterativeLegendreGaussIntegrator;
35  import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
36  import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
37  import org.apache.commons.math4.core.jdkmath.JdkMath;
38  import org.junit.Assert;
39  import org.junit.Before;
40  import org.junit.Test;
41  
42  /**
43   * Test cases for the {@link EmpiricalDistribution} class.
44   */
45  public final class EmpiricalDistributionTest extends RealDistributionAbstractTest {
46      private EmpiricalDistribution empiricalDistribution = null;
47      private double[] dataArray = null;
48      private final int n = 10000;
49      /** Uniform bin mass = 10/10001 == mass of all but the first bin */
50      private final double binMass = 10d / (n + 1);
51      /** Mass of first bin = 11/10001 */
52      private final double firstBinMass = 11d / (n + 1);
53  
54      @Override
55      @Before
56      public void setUp() {
57          super.setUp();
58  
59          final URL url = getClass().getResource("testData.txt");
60          final ArrayList<Double> list = new ArrayList<>();
61          try {
62              final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
63              String str = null;
64              while ((str = in.readLine()) != null) {
65                  list.add(Double.valueOf(str));
66              }
67              in.close();
68          } catch (IOException ex) {
69              Assert.fail("IOException " + ex);
70          }
71  
72          dataArray = new double[list.size()];
73          int i = 0;
74          for (Double data : list) {
75              dataArray[i] = data.doubleValue();
76              i++;
77          }
78  
79          empiricalDistribution = EmpiricalDistribution.from(100, dataArray);
80      }
81  
82      // MATH-1279
83      @Test(expected=NotStrictlyPositiveException.class)
84      public void testPrecondition1() {
85          EmpiricalDistribution.from(0, new double[] {1,2,3});
86      }
87  
88      /**
89       * Test using data taken from sample data file.
90       * Check that the sampleCount, mu and sigma match data in the sample data file.
91       */
92      @Test
93      public void testDoubleLoad() {
94          // testData File has 10000 values, with mean ~ 5.0, std dev ~ 1
95          // Make sure that loaded distribution matches this
96          Assert.assertEquals(empiricalDistribution.getSampleStats().getN(),
97                              1000, 1e-7);
98          //TODO: replace with statistical tests
99          Assert.assertEquals(empiricalDistribution.getSampleStats().getMean(),
100                             5.069831575018909, 1e-7);
101         Assert.assertEquals(empiricalDistribution.getSampleStats().getStandardDeviation(),
102                             1.0173699343977738, 1e-7);
103 
104         double[] bounds = empiricalDistribution.getGeneratorUpperBounds();
105         Assert.assertEquals(bounds.length, 100);
106         Assert.assertEquals(bounds[99], 1.0, 10e-12);
107     }
108 
109     // MATH-1531
110     @Test
111     public void testMath1531() {
112         final double[] data = new double[] {
113             50.993456376721454, 49.455345691918055, 49.527276095295804, 50.017183448668845, 49.10508147470046,
114             49.813998274118696, 50.87195348756139, 50.419474110037, 50.63614906979689, 49.49694777179407,
115             50.71799078406067, 50.03192853759164, 49.915092423165994, 49.56895392597687, 51.034638001064934,
116             50.681227971275945, 50.43749845081759, 49.86513120270245, 50.21475262482965, 49.99202971042547,
117             50.02382189838519, 49.386888585302884, 49.45585010202781, 49.988009479855435, 49.8136712206123,
118             49.6715197127997, 50.1981278397565, 49.842297508010276, 49.62491227740015, 50.05101916097176,
119             48.834912763303926, 49.806787657848574, 49.478236106374695, 49.56648347371614, 49.95069238081982,
120             49.71845132077346, 50.6097468705947, 49.80724637775541, 49.90448813086025, 49.39641861662603,
121             50.434295712893714, 49.227176959566734, 49.541126466050905, 49.03416593170446, 49.11584328494423,
122             49.61387482435674, 49.92877857995328, 50.70638552955101, 50.60078208448842, 49.39326233277838,
123             49.21488424364095, 49.69503351015096, 50.13733214001718, 50.22084761458942, 51.09804435604931,
124             49.18559131120419, 49.52286371605357, 49.34804374996689, 49.6901827776375, 50.01316351359638,
125             48.7751460520373, 50.12961836291053, 49.9978419772511, 49.885658399408584, 49.673438879979834,
126             49.45565980965606, 50.429747484906564, 49.40129274804164, 50.13034614008073, 49.87685735146651,
127             50.12967905393557, 50.323560376181696, 49.83519233651367, 49.37333369733053, 49.70074301611427,
128             50.11626105774947, 50.28249500380083, 50.543354367136466, 50.05866241335002, 50.39516515672527,
129             49.4838561463057, 50.451757089234796, 50.31370674203726, 49.79063762614284, 50.19652349768548,
130             49.75881420748814, 49.98371855036422, 49.82171344472916, 48.810793204162415, 49.37040569084592,
131             50.050641186203976, 50.48360952263646, 50.86666450358076, 50.463268776129844, 50.137489751888666,
132             50.23823061444118, 49.881460479468004, 50.641174398764356, 49.09314136851421, 48.80877928574451,
133             50.46197084844826, 49.97691704141741, 49.99933997561926, 50.25692254481885, 49.52973451252715,
134             49.81229858420664, 48.996112655915994, 48.740531054814674, 50.026642633066416, 49.98696633604899,
135             49.61307159972952, 50.5115278979726, 50.75245152442404, 50.51807785445929, 49.60929671768147,
136             49.1079533564074, 49.65347196551866, 49.31684818724059, 50.4906368627049, 50.37483603684714
137         };
138 
139         EmpiricalDistribution.from(120, data).inverseCumulativeProbability(0.7166666666666669);
140     }
141 
142     /**
143       * Generate 1000 random values and make sure they look OK.<br>
144       * Note that there is a non-zero (but very small) probability that
145       * these tests will fail even if the code is working as designed.
146       */
147     @Test
148     public void testNext() {
149         tstGen(empiricalDistribution,
150                0.1);
151     }
152 
153     /**
154      * Make sure we can handle a grid size that is too fine
155      */
156     @Test
157     public void testGridTooFine() {
158         tstGen(EmpiricalDistribution.from(1001, dataArray),
159                0.1);
160     }
161 
162     /**
163      * How about too fat?
164      */
165     @Test
166     public void testGridTooFat() {
167         tstGen(EmpiricalDistribution.from(1, dataArray),
168                5); // ridiculous tolerance; but ridiculous grid size
169                    // really just checking to make sure we do not bomb
170     }
171 
172     /**
173      * Test bin index overflow problem (BZ 36450)
174      */
175     @Test
176     public void testBinIndexOverflow() {
177         double[] x = new double[] {9474.94326071674, 2080107.8865462579};
178         EmpiricalDistribution.from(1000, x);
179     }
180 
181     @Test(expected=NullPointerException.class)
182     public void testLoadNullDoubleArray() {
183         EmpiricalDistribution.from(1000, null);
184     }
185 
186     /**
187      * MATH-298
188      */
189     @Test
190     public void testGetBinUpperBounds() {
191         double[] testData = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10};
192         EmpiricalDistribution dist = EmpiricalDistribution.from(5, testData);
193         double[] expectedBinUpperBounds = {2, 4, 6, 8, 10};
194         double[] expectedGeneratorUpperBounds = {4d/13d, 7d/13d, 9d/13d, 11d/13d, 1};
195         double tol = 10E-12;
196         TestUtils.assertEquals(expectedBinUpperBounds, dist.getUpperBounds(), tol);
197         TestUtils.assertEquals(expectedGeneratorUpperBounds, dist.getGeneratorUpperBounds(), tol);
198     }
199 
200     private void verifySame(EmpiricalDistribution d1,
201                             EmpiricalDistribution d2) {
202         Assert.assertEquals(d1.getBinCount(), d2.getBinCount());
203         Assert.assertEquals(d1.getSampleStats(), d2.getSampleStats());
204 
205         for (int i = 0;  i < d1.getUpperBounds().length; i++) {
206             Assert.assertEquals(d1.getUpperBounds()[i], d2.getUpperBounds()[i], 0);
207         }
208         Assert.assertEquals(d1.getBinStats(), d2.getBinStats());
209     }
210 
211     private void tstGen(EmpiricalDistribution dist,
212                         double tolerance) {
213         final ContinuousDistribution.Sampler sampler
214             = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
215         final SummaryStatistics stats = new SummaryStatistics();
216         for (int i = 1; i < 1000; i++) {
217             stats.addValue(sampler.sample());
218         }
219         Assert.assertEquals("mean", 5.069831575018909, stats.getMean(),tolerance);
220         Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(),tolerance);
221     }
222 
223     //  Setup for distribution tests
224 
225     @Override
226     public ContinuousDistribution makeDistribution() {
227         // Create a uniform distribution on [0, 10,000].
228         final double[] sourceData = new double[n + 1];
229         for (int i = 0; i < n + 1; i++) {
230             sourceData[i] = i;
231         }
232         EmpiricalDistribution dist = EmpiricalDistribution.from(1000, sourceData);
233         return dist;
234     }
235 
236     @Override
237     public double[] makeCumulativeTestPoints() {
238        final double[] testPoints = new double[] {9, 10, 15, 1000, 5004, 9999};
239        return testPoints;
240     }
241 
242 
243     @Override
244     public double[] makeCumulativeTestValues() {
245         /*
246          * Bins should be [0, 10], (10, 20], ..., (9990, 10000]
247          * Kernels should be N(4.5, 3.02765), N(14.5, 3.02765)...
248          * Each bin should have mass 10/10000 = .001
249          */
250         final double[] testPoints = getCumulativeTestPoints();
251         final double[] cumValues = new double[testPoints.length];
252         final EmpiricalDistribution empiricalDistribution = (EmpiricalDistribution) makeDistribution();
253         final double[] binBounds = empiricalDistribution.getUpperBounds();
254         for (int i = 0; i < testPoints.length; i++) {
255             final int bin = findBin(testPoints[i]);
256             final double lower = bin == 0 ? empiricalDistribution.getSupportLowerBound() :
257                 binBounds[bin - 1];
258             final double upper = binBounds[bin];
259             // Compute bMinus = sum or mass of bins below the bin containing the point
260             // First bin has mass 11 / 10000, the rest have mass 10 / 10000.
261             final double bMinus = bin == 0 ? 0 : (bin - 1) * binMass + firstBinMass;
262             final ContinuousDistribution kernel = findKernel(lower, upper);
263             final double withinBinKernelMass = kernel.probability(lower, upper);
264             final double kernelCum = kernel.probability(lower, testPoints[i]);
265             cumValues[i] = bMinus + (bin == 0 ? firstBinMass : binMass) * kernelCum/withinBinKernelMass;
266         }
267         return cumValues;
268     }
269 
270     @Override
271     public double[] makeDensityTestValues() {
272         final double[] testPoints = getCumulativeTestPoints();
273         final double[] densityValues = new double[testPoints.length];
274         final EmpiricalDistribution empiricalDistribution = (EmpiricalDistribution) makeDistribution();
275         final double[] binBounds = empiricalDistribution.getUpperBounds();
276         for (int i = 0; i < testPoints.length; i++) {
277             final int bin = findBin(testPoints[i]);
278             final double lower = bin == 0 ? empiricalDistribution.getSupportLowerBound() :
279                 binBounds[bin - 1];
280             final double upper = binBounds[bin];
281             final ContinuousDistribution kernel = findKernel(lower, upper);
282             final double withinBinKernelMass = kernel.probability(lower, upper);
283             final double density = kernel.density(testPoints[i]);
284             densityValues[i] = density * (bin == 0 ? firstBinMass : binMass) / withinBinKernelMass;
285         }
286         return densityValues;
287     }
288 
289     /**
290      * Modify test integration bounds from the default. Because the distribution
291      * has discontinuities at bin boundaries, integrals spanning multiple bins
292      * will face convergence problems.  Only test within-bin integrals and spans
293      * across no more than 3 bin boundaries.
294      */
295     @Override
296     @Test
297     public void testDensityIntegrals() {
298         final ContinuousDistribution distribution = makeDistribution();
299         final double tol = 1.0e-9;
300         final BaseAbstractUnivariateIntegrator integrator =
301             new IterativeLegendreGaussIntegrator(5, 1.0e-12, 1.0e-10);
302         final UnivariateFunction d = new UnivariateFunction() {
303             @Override
304             public double value(double x) {
305                 return distribution.density(x);
306             }
307         };
308         final double[] lower = {0, 5, 1000, 5001, 9995};
309         final double[] upper = {5, 12, 1030, 5010, 10000};
310         for (int i = 1; i < 5; i++) {
311             Assert.assertEquals(
312                     distribution.probability(
313                             lower[i], upper[i]),
314                             integrator.integrate(
315                                     1000000, // Triangle integrals are very slow to converge
316                                     d, lower[i], upper[i]), tol);
317         }
318     }
319 
320     /**
321      * MATH-984
322      * Verify that sampled values do not go outside of the range of the data.
323      */
324     @Test
325     public void testSampleValuesRange() {
326         // Concentrate values near the endpoints of (0, 1).
327         // Unconstrained Gaussian kernel would generate values outside the interval.
328         final double[] data = new double[100];
329         for (int i = 0; i < 50; i++) {
330             data[i] = 1 / ((double) i + 1);
331         }
332         for (int i = 51; i < 100; i++) {
333             data[i] = 1 - 1 / (100 - (double) i + 2);
334         }
335         EmpiricalDistribution dist = EmpiricalDistribution.from(10, data);
336         ContinuousDistribution.Sampler sampler
337             = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
338         for (int i = 0; i < 1000; i++) {
339             final double dev = sampler.sample();
340             Assert.assertTrue(dev < 1);
341             Assert.assertTrue(dev > 0);
342         }
343     }
344 
345     /**
346      * MATH-1203, MATH-1208
347      */
348     @Test
349     public void testNoBinVariance() {
350         final double[] data = {0, 0, 1, 1};
351         EmpiricalDistribution dist = EmpiricalDistribution.from(2, data);
352         ContinuousDistribution.Sampler sampler
353             = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
354         for (int i = 0; i < 1000; i++) {
355             final double dev = sampler.sample();
356             Assert.assertTrue(dev == 0 || dev == 1);
357         }
358         Assert.assertEquals(0.5, dist.cumulativeProbability(0), Double.MIN_VALUE);
359         Assert.assertEquals(1.0, dist.cumulativeProbability(1), Double.MIN_VALUE);
360         Assert.assertEquals(0.5, dist.cumulativeProbability(0.5), Double.MIN_VALUE);
361         Assert.assertEquals(0.5, dist.cumulativeProbability(0.7), Double.MIN_VALUE);
362     }
363 
364     /**
365      * Find the bin that x belongs (relative to {@link #makeDistribution()}).
366      */
367     private int findBin(double x) {
368         // Number of bins below x should be trunc(x/10)
369         final double nMinus = JdkMath.floor(x / 10);
370         final int bin =  (int) JdkMath.round(nMinus);
371         // If x falls on a bin boundary, it is in the lower bin
372         return JdkMath.floor(x / 10) == x / 10 ? bin - 1 : bin;
373     }
374 
375     /**
376      * Find the within-bin kernel for the bin with lower bound lower
377      * and upper bound upper. All bins other than the first contain 10 points
378      * exclusive of the lower bound and are centered at (lower + upper + 1) / 2.
379      * The first bin includes its lower bound, 0, so has different mean and
380      * standard deviation.
381      */
382     private ContinuousDistribution findKernel(double lower, double upper) {
383         if (lower < 1) {
384             return NormalDistribution.of(5d, 3.3166247903554);
385         } else {
386             return NormalDistribution.of((upper + lower + 1) / 2d, 3.0276503540974917);
387         }
388     }
389 
390     @Test
391     public void testKernelOverrideUniform() {
392         final double[] data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
393         final EmpiricalDistribution dist =
394             EmpiricalDistribution.from(5, data,
395                                        s -> UniformContinuousDistribution.of(s.getMin(), s.getMax()));
396         final ContinuousDistribution.Sampler sampler
397             = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
398         // Kernels are uniform distributions on [1,3], [4,6], [7,9], [10,12], [13,15]
399         final double bounds[] = {3d, 6d, 9d, 12d};
400         final double tol = 10E-12;
401         for (int i = 0; i < 20; i++) {
402             final double v = sampler.sample();
403             // Make sure v is not in the excluded range between bins - that is (bounds[i], bounds[i] + 1)
404             for (int j = 0; j < bounds.length; j++) {
405                 Assert.assertFalse(v > bounds[j] + tol && v < bounds[j] + 1 - tol);
406             }
407         }
408         Assert.assertEquals(0.0, dist.cumulativeProbability(1), tol);
409         Assert.assertEquals(0.1, dist.cumulativeProbability(2), tol);
410         Assert.assertEquals(0.6, dist.cumulativeProbability(10), tol);
411         Assert.assertEquals(0.8, dist.cumulativeProbability(12), tol);
412         Assert.assertEquals(0.8, dist.cumulativeProbability(13), tol);
413         Assert.assertEquals(1.0, dist.cumulativeProbability(15), tol);
414 
415         Assert.assertEquals(2.0, dist.inverseCumulativeProbability(0.1), tol);
416         Assert.assertEquals(3.0, dist.inverseCumulativeProbability(0.2), tol);
417         Assert.assertEquals(5.0, dist.inverseCumulativeProbability(0.3), tol);
418         Assert.assertEquals(6.0, dist.inverseCumulativeProbability(0.4), tol);
419         Assert.assertEquals(8.0, dist.inverseCumulativeProbability(0.5), tol);
420         Assert.assertEquals(9.0, dist.inverseCumulativeProbability(0.6), tol);
421     }
422 
423     @Test
424     public void testMath1431() {
425         final UniformRandomProvider rng = RandomSource.WELL_19937_C.create(1000);
426         final ContinuousDistribution.Sampler exponentialDistributionSampler
427             = ExponentialDistribution.of(0.05).createSampler(rng);
428         final double[] empiricalDataPoints = new double[3000];
429         for (int i = 0; i < empiricalDataPoints.length; i++) {
430             empiricalDataPoints[i] = exponentialDistributionSampler.sample();
431         }
432 
433         final EmpiricalDistribution testDistribution = EmpiricalDistribution.from(100, empiricalDataPoints);
434 
435         for (int i = 0; i < 1000; i++) {
436             final double point = rng.nextDouble();
437             final double cdf = testDistribution.cumulativeProbability(point);
438             Assert.assertFalse("point: " + point, Double.isNaN(cdf));
439         }
440     }
441 
442     @Test
443     public void testMath1462() {
444         final double[] data = {
445             6464.0205, 6449.1328, 6489.4569, 6497.5533, 6251.6487,
446             6252.6513, 6339.7883, 6356.2622, 6222.1251, 6157.3813,
447             6242.4741, 6332.5347, 6468.0633, 6471.2319, 6473.9929,
448             6589.1322, 6511.2191, 6339.4349, 6307.7735, 6288.0915,
449             6354.0572, 6385.8283, 6325.3756, 6433.1699, 6433.6507,
450             6424.6806, 6380.5268, 6407.6705, 6241.2198, 6230.3681,
451             6367.5943, 6358.4817, 6272.8039, 6269.0211, 6312.9027,
452             6349.5926, 6404.0775, 6326.986, 6283.8685, 6309.9021,
453             6336.8554, 6389.1598, 6281.0372, 6304.8852, 6359.2651,
454             6426.519, 6400.3926, 6440.6798, 6292.5812, 6398.4911,
455             6307.0002, 6284.2111, 6271.371, 6368.6377, 6323.3372,
456             6276.2155, 6335.0117, 6319.2466, 6252.9969, 6445.2074,
457             6461.3944, 6384.1345
458         };
459 
460         final EmpiricalDistribution ed = EmpiricalDistribution.from(data.length / 10, data);
461 
462         double v;
463         double p;
464 
465         p = 0.49999;
466         v = ed.inverseCumulativeProbability(p);
467         Assert.assertTrue("p=" + p + " => v=" + v, v < 6344);
468 
469         p = 0.5;
470         v = ed.inverseCumulativeProbability(p);
471         Assert.assertTrue("p=" + p + " => v=" + v, v < 7000);
472 
473         p = 0.51111;
474         v = ed.inverseCumulativeProbability(p);
475         Assert.assertTrue("p=" + p + " => v=" + v, v < 6350);
476     }
477 
478     @Test
479     public void testMath1462InfiniteQuantile() {
480         final double[] data = {
481             18054, 17548, 17350, 17860, 17827, 17653, 18113, 18405, 17746,
482             17647, 18160, 17955, 17705, 17890, 17974, 17857, 13287, 18645,
483             17775, 17730, 17996, 18263, 17861, 17161, 17717, 18134, 18669,
484             18340, 17221, 18292, 18146, 17520, 18207, 17829, 18206, 13301,
485             18257, 17626, 18358, 18340, 18320, 17852, 17804, 17577, 17718,
486             18099, 13395, 17763, 17911, 17978, 12935, 17519, 17550, 18728,
487             18518, 17698, 18739, 18553, 17982, 18113, 17974, 17961, 17645,
488             17867, 17890, 17498, 18718, 18191, 18177, 17923, 18164, 18155,
489             6212, 5961, 711
490         };
491 
492         final double p = 0.32;
493         for (int i = 745; i <= 1100; i++) {
494             final EmpiricalDistribution ed = EmpiricalDistribution.from(i, data);
495             final double v = ed.inverseCumulativeProbability(p);
496 
497             Assert.assertTrue("p=" + p + " => v=" + v, Double.isFinite(v));
498         }
499     }
500 }