View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math4.legacy.stat.descriptive;
19  
20  import java.util.ArrayList;
21  import java.util.Collection;
22  
23  import org.apache.commons.math4.legacy.TestUtils;
24  import org.apache.commons.statistics.distribution.DiscreteDistribution;
25  import org.apache.commons.statistics.distribution.ContinuousDistribution;
26  import org.apache.commons.math4.legacy.distribution.AbstractRealDistribution;
27  import org.apache.commons.statistics.distribution.UniformDiscreteDistribution;
28  import org.apache.commons.statistics.distribution.UniformContinuousDistribution;
29  import org.apache.commons.numbers.core.Precision;
30  import org.apache.commons.rng.simple.RandomSource;
31  import org.junit.Assert;
32  import org.junit.Test;
33  
34  
35  /**
36   * Test cases for {@link AggregateSummaryStatistics}
37   */
38  public class AggregateSummaryStatisticsTest {
39  
40      /**
41       * Tests the standard aggregation behavior
42       */
43      @Test
44      public void testAggregation() {
45          AggregateSummaryStatistics aggregate = new AggregateSummaryStatistics();
46          SummaryStatistics setOneStats = aggregate.createContributingStatistics();
47          SummaryStatistics setTwoStats = aggregate.createContributingStatistics();
48  
49          Assert.assertNotNull("The set one contributing stats are null", setOneStats);
50          Assert.assertNotNull("The set two contributing stats are null", setTwoStats);
51          Assert.assertNotSame("Contributing stats objects are the same", setOneStats, setTwoStats);
52  
53          setOneStats.addValue(2);
54          setOneStats.addValue(3);
55          setOneStats.addValue(5);
56          setOneStats.addValue(7);
57          setOneStats.addValue(11);
58          Assert.assertEquals("Wrong number of set one values", 5, setOneStats.getN());
59          Assert.assertTrue("Wrong sum of set one values", Precision.equals(28.0, setOneStats.getSum(), 1));
60  
61          setTwoStats.addValue(2);
62          setTwoStats.addValue(4);
63          setTwoStats.addValue(8);
64          Assert.assertEquals("Wrong number of set two values", 3, setTwoStats.getN());
65          Assert.assertTrue("Wrong sum of set two values", Precision.equals(14.0, setTwoStats.getSum(), 1));
66  
67          Assert.assertEquals("Wrong number of aggregate values", 8, aggregate.getN());
68          Assert.assertTrue("Wrong aggregate sum", Precision.equals(42.0, aggregate.getSum(), 1));
69      }
70  
71      /**
72       * Verify that aggregating over a partition gives the same results
73       * as direct computation.
74       *
75       *  1) Randomly generate a dataset of 10-100 values
76       *     from [-100, 100]
77       *  2) Divide the dataset it into 2-5 partitions
78       *  3) Create an AggregateSummaryStatistic and ContributingStatistics
79       *     for each partition
80       *  4) Compare results from the AggregateSummaryStatistic with values
81       *     returned by a single SummaryStatistics instance that is provided
82       *     the full dataset
83       */
84      @Test
85      public void testAggregationConsistency() {
86  
87          // Generate a random sample and random partition
88          double[] totalSample = generateSample();
89          double[][] subSamples = generatePartition(totalSample);
90          int nSamples = subSamples.length;
91  
92          // Create aggregator and total stats for comparison
93          AggregateSummaryStatistics aggregate = new AggregateSummaryStatistics();
94          SummaryStatistics totalStats = new SummaryStatistics();
95  
96          // Create array of component stats
97          SummaryStatistics componentStats[] = new SummaryStatistics[nSamples];
98  
99          for (int i = 0; i < nSamples; i++) {
100 
101             // Make componentStats[i] a contributing statistic to aggregate
102             componentStats[i] = aggregate.createContributingStatistics();
103 
104             // Add values from subsample
105             for (int j = 0; j < subSamples[i].length; j++) {
106                 componentStats[i].addValue(subSamples[i][j]);
107             }
108         }
109 
110         // Compute totalStats directly
111         for (int i = 0; i < totalSample.length; i++) {
112             totalStats.addValue(totalSample[i]);
113         }
114 
115         /*
116          * Compare statistics in totalStats with aggregate.
117          * Note that guaranteed success of this comparison depends on the
118          * fact that <aggregate> gets values in exactly the same order
119          * as <totalStats>.
120          *
121          */
122         Assert.assertEquals(totalStats.getSummary(), aggregate.getSummary());
123     }
124 
125     /**
126      * Test aggregate function by randomly generating a dataset of 10-100 values
127      * from [-100, 100], dividing it into 2-5 partitions, computing stats for each
128      * partition and comparing the result of aggregate(...) applied to the collection
129      * of per-partition SummaryStatistics with a single SummaryStatistics computed
130      * over the full sample.
131      */
132     @Test
133     public void testAggregate() {
134 
135         // Generate a random sample and random partition
136         double[] totalSample = generateSample();
137         double[][] subSamples = generatePartition(totalSample);
138         int nSamples = subSamples.length;
139 
140         // Compute combined stats directly
141         SummaryStatistics totalStats = new SummaryStatistics();
142         for (int i = 0; i < totalSample.length; i++) {
143             totalStats.addValue(totalSample[i]);
144         }
145 
146         // Now compute subsample stats individually and aggregate
147         SummaryStatistics[] subSampleStats = new SummaryStatistics[nSamples];
148         for (int i = 0; i < nSamples; i++) {
149             subSampleStats[i] = new SummaryStatistics();
150         }
151         Collection<SummaryStatistics> aggregate = new ArrayList<>();
152         for (int i = 0; i < nSamples; i++) {
153             for (int j = 0; j < subSamples[i].length; j++) {
154                 subSampleStats[i].addValue(subSamples[i][j]);
155             }
156             aggregate.add(subSampleStats[i]);
157         }
158 
159         // Compare values
160         StatisticalSummary aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
161         assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
162     }
163 
164     /**
165      * Similar to {@link #testAggregate()} but operating on
166      * {@link StatisticalSummary} instead.
167      */
168     @Test
169     public void testAggregateStatisticalSummary() {
170 
171         // Generate a random sample and random partition
172         double[] totalSample = generateSample();
173         double[][] subSamples = generatePartition(totalSample);
174         int nSamples = subSamples.length;
175 
176         // Compute combined stats directly
177         SummaryStatistics totalStats = new SummaryStatistics();
178         for (int i = 0; i < totalSample.length; i++) {
179             totalStats.addValue(totalSample[i]);
180         }
181 
182         // Now compute subsample stats individually and aggregate
183         SummaryStatistics[] subSampleStats = new SummaryStatistics[nSamples];
184         for (int i = 0; i < nSamples; i++) {
185             subSampleStats[i] = new SummaryStatistics();
186         }
187         Collection<StatisticalSummary> aggregate = new ArrayList<>();
188         for (int i = 0; i < nSamples; i++) {
189             for (int j = 0; j < subSamples[i].length; j++) {
190                 subSampleStats[i].addValue(subSamples[i][j]);
191             }
192             aggregate.add(subSampleStats[i].getSummary());
193         }
194 
195         // Compare values
196         StatisticalSummary aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
197         assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
198     }
199 
200 
201     @Test
202     public void testAggregateDegenerate() {
203         double[] totalSample = {1, 2, 3, 4, 5};
204         double[][] subSamples = {{1}, {2}, {3}, {4}, {5}};
205 
206         // Compute combined stats directly
207         SummaryStatistics totalStats = new SummaryStatistics();
208         for (int i = 0; i < totalSample.length; i++) {
209             totalStats.addValue(totalSample[i]);
210         }
211 
212         // Now compute subsample stats individually and aggregate
213         SummaryStatistics[] subSampleStats = new SummaryStatistics[5];
214         for (int i = 0; i < 5; i++) {
215             subSampleStats[i] = new SummaryStatistics();
216         }
217         Collection<SummaryStatistics> aggregate = new ArrayList<>();
218         for (int i = 0; i < 5; i++) {
219             for (int j = 0; j < subSamples[i].length; j++) {
220                 subSampleStats[i].addValue(subSamples[i][j]);
221             }
222             aggregate.add(subSampleStats[i]);
223         }
224 
225         // Compare values
226         StatisticalSummaryValues aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
227         assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
228     }
229 
230     @Test
231     public void testAggregateSpecialValues() {
232         double[] totalSample = {Double.POSITIVE_INFINITY, 2, 3, Double.NaN, 5};
233         double[][] subSamples = {{Double.POSITIVE_INFINITY, 2}, {3}, {Double.NaN}, {5}};
234 
235         // Compute combined stats directly
236         SummaryStatistics totalStats = new SummaryStatistics();
237         for (int i = 0; i < totalSample.length; i++) {
238             totalStats.addValue(totalSample[i]);
239         }
240 
241         // Now compute subsample stats individually and aggregate
242         SummaryStatistics[] subSampleStats = new SummaryStatistics[5];
243         for (int i = 0; i < 4; i++) {
244             subSampleStats[i] = new SummaryStatistics();
245         }
246         Collection<SummaryStatistics> aggregate = new ArrayList<>();
247         for (int i = 0; i < 4; i++) {
248             for (int j = 0; j < subSamples[i].length; j++) {
249                 subSampleStats[i].addValue(subSamples[i][j]);
250             }
251             aggregate.add(subSampleStats[i]);
252         }
253 
254         // Compare values
255         StatisticalSummaryValues aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
256         assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
257     }
258 
259     /**
260      * Verifies that a StatisticalSummary and a StatisticalSummaryValues are equal up
261      * to delta, with NaNs, infinities returned in the same spots. For max, min, n, values
262      * have to agree exactly, delta is used only for sum, mean, variance, std dev.
263      */
264     protected static void assertEquals(StatisticalSummary expected, StatisticalSummary observed, double delta) {
265         TestUtils.assertEquals(expected.getMax(), observed.getMax(), 0);
266         TestUtils.assertEquals(expected.getMin(), observed.getMin(), 0);
267         Assert.assertEquals(expected.getN(), observed.getN());
268         TestUtils.assertEquals(expected.getSum(), observed.getSum(), delta);
269         TestUtils.assertEquals(expected.getMean(), observed.getMean(), delta);
270         TestUtils.assertEquals(expected.getStandardDeviation(), observed.getStandardDeviation(), delta);
271         TestUtils.assertEquals(expected.getVariance(), observed.getVariance(), delta);
272     }
273 
274 
275     /**
276      * Generates a random sample of double values.
277      * Sample size is random, between 10 and 100 and values are
278      * uniformly distributed over [-100, 100].
279      *
280      * @return array of random double values
281      */
282     private double[] generateSample() {
283         final DiscreteDistribution.Sampler size =
284             UniformDiscreteDistribution.of(10, 100).createSampler(RandomSource.WELL_512_A.create(327652));
285         final ContinuousDistribution.Sampler randomData
286             = UniformContinuousDistribution.of(-100, 100).createSampler(RandomSource.WELL_512_A.create(64925784252L));
287         final int sampleSize = size.sample();
288         final double[] out = AbstractRealDistribution.sample(sampleSize, randomData);
289         return out;
290     }
291 
292     /**
293      * Generates a partition of <sample> into up to 5 sequentially selected
294      * subsamples with randomly selected partition points.
295      *
296      * @param sample array to partition
297      * @return rectangular array with rows = subsamples
298      */
299     private double[][] generatePartition(double[] sample) {
300         final int length = sample.length;
301         final double[][] out = new double[5][];
302         int cur = 0;          // beginning of current partition segment
303         int offset = 0;       // end of current partition segment
304         int sampleCount = 0;  // number of segments defined
305         for (int i = 0; i < 5; i++) {
306             if (cur == length || offset == length) {
307                 break;
308             }
309             final int next;
310             if (i == 4 || cur == length - 1) {
311                 next = length - 1;
312             } else {
313                 final DiscreteDistribution.Sampler sampler =
314                     UniformDiscreteDistribution.of(cur, length - 1).createSampler(RandomSource.WELL_512_A.create());
315                 next = sampler.sample();
316             }
317             final int subLength = next - cur + 1;
318             out[i] = new double[subLength];
319             System.arraycopy(sample, offset, out[i], 0, subLength);
320             cur = next + 1;
321             sampleCount++;
322             offset += subLength;
323         }
324         if (sampleCount < 5) {
325             double[][] out2 = new double[sampleCount][];
326             for (int j = 0; j < sampleCount; j++) {
327                 final int curSize = out[j].length;
328                 out2[j] = new double[curSize];
329                 System.arraycopy(out[j], 0, out2[j], 0, curSize);
330             }
331             return out2;
332         } else {
333             return out;
334         }
335     }
336 }