001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math3.stat.descriptive;
019    
020    import java.util.ArrayList;
021    import java.util.Collection;
022    
023    
024    import org.apache.commons.math3.TestUtils;
025    import org.apache.commons.math3.distribution.RealDistribution;
026    import org.apache.commons.math3.distribution.UniformRealDistribution;
027    import org.apache.commons.math3.distribution.IntegerDistribution;
028    import org.apache.commons.math3.distribution.UniformIntegerDistribution;
029    import org.apache.commons.math3.util.Precision;
030    import org.junit.Assert;
031    import org.junit.Test;
032    
033    
034    /**
035     * Test cases for {@link AggregateSummaryStatistics}
036     *
037     */
038    public class AggregateSummaryStatisticsTest {
039    
040        /**
041         * Tests the standard aggregation behavior
042         */
043        @Test
044        public void testAggregation() {
045            AggregateSummaryStatistics aggregate = new AggregateSummaryStatistics();
046            SummaryStatistics setOneStats = aggregate.createContributingStatistics();
047            SummaryStatistics setTwoStats = aggregate.createContributingStatistics();
048    
049            Assert.assertNotNull("The set one contributing stats are null", setOneStats);
050            Assert.assertNotNull("The set two contributing stats are null", setTwoStats);
051            Assert.assertNotSame("Contributing stats objects are the same", setOneStats, setTwoStats);
052    
053            setOneStats.addValue(2);
054            setOneStats.addValue(3);
055            setOneStats.addValue(5);
056            setOneStats.addValue(7);
057            setOneStats.addValue(11);
058            Assert.assertEquals("Wrong number of set one values", 5, setOneStats.getN());
059            Assert.assertTrue("Wrong sum of set one values", Precision.equals(28.0, setOneStats.getSum(), 1));
060    
061            setTwoStats.addValue(2);
062            setTwoStats.addValue(4);
063            setTwoStats.addValue(8);
064            Assert.assertEquals("Wrong number of set two values", 3, setTwoStats.getN());
065            Assert.assertTrue("Wrong sum of set two values", Precision.equals(14.0, setTwoStats.getSum(), 1));
066    
067            Assert.assertEquals("Wrong number of aggregate values", 8, aggregate.getN());
068            Assert.assertTrue("Wrong aggregate sum", Precision.equals(42.0, aggregate.getSum(), 1));
069        }
070    
071        /**
072         * Verify that aggregating over a partition gives the same results
073         * as direct computation.
074         *
075         *  1) Randomly generate a dataset of 10-100 values
076         *     from [-100, 100]
077         *  2) Divide the dataset it into 2-5 partitions
078         *  3) Create an AggregateSummaryStatistic and ContributingStatistics
079         *     for each partition
080         *  4) Compare results from the AggregateSummaryStatistic with values
081         *     returned by a single SummaryStatistics instance that is provided
082         *     the full dataset
083         */
084        @Test
085        public void testAggregationConsistency() {
086    
087            // Generate a random sample and random partition
088            double[] totalSample = generateSample();
089            double[][] subSamples = generatePartition(totalSample);
090            int nSamples = subSamples.length;
091    
092            // Create aggregator and total stats for comparison
093            AggregateSummaryStatistics aggregate = new AggregateSummaryStatistics();
094            SummaryStatistics totalStats = new SummaryStatistics();
095    
096            // Create array of component stats
097            SummaryStatistics componentStats[] = new SummaryStatistics[nSamples];
098    
099            for (int i = 0; i < nSamples; i++) {
100    
101                // Make componentStats[i] a contributing statistic to aggregate
102                componentStats[i] = aggregate.createContributingStatistics();
103    
104                // Add values from subsample
105                for (int j = 0; j < subSamples[i].length; j++) {
106                    componentStats[i].addValue(subSamples[i][j]);
107                }
108            }
109    
110            // Compute totalStats directly
111            for (int i = 0; i < totalSample.length; i++) {
112                totalStats.addValue(totalSample[i]);
113            }
114    
115            /*
116             * Compare statistics in totalStats with aggregate.
117             * Note that guaranteed success of this comparison depends on the
118             * fact that <aggregate> gets values in exactly the same order
119             * as <totalStats>.
120             *
121             */
122            Assert.assertEquals(totalStats.getSummary(), aggregate.getSummary());
123    
124        }
125    
126        /**
127         * Test aggregate function by randomly generating a dataset of 10-100 values
128         * from [-100, 100], dividing it into 2-5 partitions, computing stats for each
129         * partition and comparing the result of aggregate(...) applied to the collection
130         * of per-partition SummaryStatistics with a single SummaryStatistics computed
131         * over the full sample.
132         *
133         */
134        @Test
135        public void testAggregate() {
136    
137            // Generate a random sample and random partition
138            double[] totalSample = generateSample();
139            double[][] subSamples = generatePartition(totalSample);
140            int nSamples = subSamples.length;
141    
142            // Compute combined stats directly
143            SummaryStatistics totalStats = new SummaryStatistics();
144            for (int i = 0; i < totalSample.length; i++) {
145                totalStats.addValue(totalSample[i]);
146            }
147    
148            // Now compute subsample stats individually and aggregate
149            SummaryStatistics[] subSampleStats = new SummaryStatistics[nSamples];
150            for (int i = 0; i < nSamples; i++) {
151                subSampleStats[i] = new SummaryStatistics();
152            }
153            Collection<SummaryStatistics> aggregate = new ArrayList<SummaryStatistics>();
154            for (int i = 0; i < nSamples; i++) {
155                for (int j = 0; j < subSamples[i].length; j++) {
156                    subSampleStats[i].addValue(subSamples[i][j]);
157                }
158                aggregate.add(subSampleStats[i]);
159            }
160    
161            // Compare values
162            StatisticalSummary aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
163            assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
164        }
165    
166    
167        @Test
168        public void testAggregateDegenerate() {
169            double[] totalSample = {1, 2, 3, 4, 5};
170            double[][] subSamples = {{1}, {2}, {3}, {4}, {5}};
171    
172            // Compute combined stats directly
173            SummaryStatistics totalStats = new SummaryStatistics();
174            for (int i = 0; i < totalSample.length; i++) {
175                totalStats.addValue(totalSample[i]);
176            }
177    
178            // Now compute subsample stats individually and aggregate
179            SummaryStatistics[] subSampleStats = new SummaryStatistics[5];
180            for (int i = 0; i < 5; i++) {
181                subSampleStats[i] = new SummaryStatistics();
182            }
183            Collection<SummaryStatistics> aggregate = new ArrayList<SummaryStatistics>();
184            for (int i = 0; i < 5; i++) {
185                for (int j = 0; j < subSamples[i].length; j++) {
186                    subSampleStats[i].addValue(subSamples[i][j]);
187                }
188                aggregate.add(subSampleStats[i]);
189            }
190    
191            // Compare values
192            StatisticalSummaryValues aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
193            assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
194        }
195    
196        @Test
197        public void testAggregateSpecialValues() {
198            double[] totalSample = {Double.POSITIVE_INFINITY, 2, 3, Double.NaN, 5};
199            double[][] subSamples = {{Double.POSITIVE_INFINITY, 2}, {3}, {Double.NaN}, {5}};
200    
201            // Compute combined stats directly
202            SummaryStatistics totalStats = new SummaryStatistics();
203            for (int i = 0; i < totalSample.length; i++) {
204                totalStats.addValue(totalSample[i]);
205            }
206    
207            // Now compute subsample stats individually and aggregate
208            SummaryStatistics[] subSampleStats = new SummaryStatistics[5];
209            for (int i = 0; i < 4; i++) {
210                subSampleStats[i] = new SummaryStatistics();
211            }
212            Collection<SummaryStatistics> aggregate = new ArrayList<SummaryStatistics>();
213            for (int i = 0; i < 4; i++) {
214                for (int j = 0; j < subSamples[i].length; j++) {
215                    subSampleStats[i].addValue(subSamples[i][j]);
216                }
217                aggregate.add(subSampleStats[i]);
218            }
219    
220            // Compare values
221            StatisticalSummaryValues aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
222            assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
223    
224        }
225    
226        /**
227         * Verifies that a StatisticalSummary and a StatisticalSummaryValues are equal up
228         * to delta, with NaNs, infinities returned in the same spots. For max, min, n, values
229         * have to agree exactly, delta is used only for sum, mean, variance, std dev.
230         */
231        protected static void assertEquals(StatisticalSummary expected, StatisticalSummary observed, double delta) {
232            TestUtils.assertEquals(expected.getMax(), observed.getMax(), 0);
233            TestUtils.assertEquals(expected.getMin(), observed.getMin(), 0);
234            Assert.assertEquals(expected.getN(), observed.getN());
235            TestUtils.assertEquals(expected.getSum(), observed.getSum(), delta);
236            TestUtils.assertEquals(expected.getMean(), observed.getMean(), delta);
237            TestUtils.assertEquals(expected.getStandardDeviation(), observed.getStandardDeviation(), delta);
238            TestUtils.assertEquals(expected.getVariance(), observed.getVariance(), delta);
239        }
240    
241    
242        /**
243         * Generates a random sample of double values.
244         * Sample size is random, between 10 and 100 and values are
245         * uniformly distributed over [-100, 100].
246         *
247         * @return array of random double values
248         */
249        private double[] generateSample() {
250            final IntegerDistribution size = new UniformIntegerDistribution(10, 100);
251            final RealDistribution randomData = new UniformRealDistribution(-100, 100);
252            final int sampleSize = size.sample();
253            final double[] out = randomData.sample(sampleSize);
254            return out;
255        }
256    
257        /**
258         * Generates a partition of <sample> into up to 5 sequentially selected
259         * subsamples with randomly selected partition points.
260         *
261         * @param sample array to partition
262         * @return rectangular array with rows = subsamples
263         */
264        private double[][] generatePartition(double[] sample) {
265            final int length = sample.length;
266            final double[][] out = new double[5][];
267            int cur = 0;          // beginning of current partition segment
268            int offset = 0;       // end of current partition segment
269            int sampleCount = 0;  // number of segments defined 
270            for (int i = 0; i < 5; i++) {
271                if (cur == length || offset == length) {
272                    break;
273                }
274                final int next;
275                if (i == 4 || cur == length - 1) {
276                    next = length - 1;
277                } else {
278                    next = (new UniformIntegerDistribution(cur, length - 1)).sample();
279                }
280                final int subLength = next - cur + 1;
281                out[i] = new double[subLength];
282                System.arraycopy(sample, offset, out[i], 0, subLength);
283                cur = next + 1;
284                sampleCount++;
285                offset += subLength;
286            }
287            if (sampleCount < 5) {
288                double[][] out2 = new double[sampleCount][];
289                for (int j = 0; j < sampleCount; j++) {
290                    final int curSize = out[j].length;
291                    out2[j] = new double[curSize];
292                    System.arraycopy(out[j], 0, out2[j], 0, curSize);
293                }
294                return out2;
295            } else {
296                return out;
297            }
298        }
299    
300    }