View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math4.legacy.stat.inference;
18  
19  import java.util.ArrayList;
20  import java.util.Collection;
21  
22  import org.apache.commons.statistics.distribution.FDistribution;
23  import org.apache.commons.math4.legacy.exception.ConvergenceException;
24  import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
25  import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
26  import org.apache.commons.math4.legacy.exception.NullArgumentException;
27  import org.apache.commons.math4.legacy.exception.OutOfRangeException;
28  import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
29  import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
30  
31  /**
32   * Implements one-way ANOVA (analysis of variance) statistics.
33   *
34   * <p> Tests for differences between two or more categories of univariate data
35   * (for example, the body mass index of accountants, lawyers, doctors and
36   * computer programmers).  When two categories are given, this is equivalent to
37   * the {@link org.apache.commons.math4.legacy.stat.inference.TTest}.
38   * </p><p>
39   * Uses the {@link org.apache.commons.statistics.distribution.FDistribution
40   * commons-math F Distribution implementation} to estimate exact p-values.</p>
41   * <p>This implementation is based on a description at
42   * http://faculty.vassar.edu/lowry/ch13pt1.html</p>;
43   * <pre>
44   * Abbreviations: bg = between groups,
45   *                wg = within groups,
46   *                ss = sum squared deviations
47   * </pre>
48   *
49   * @since 1.2
50   */
51  public class OneWayAnova {
52  
53      /**
54       * Default constructor.
55       */
56      public OneWayAnova() {
57      }
58  
59      /**
60       * Computes the ANOVA F-value for a collection of <code>double[]</code>
61       * arrays.
62       *
63       * <p><strong>Preconditions</strong>: <ul>
64       * <li>The categoryData <code>Collection</code> must contain
65       * <code>double[]</code> arrays.</li>
66       * <li> There must be at least two <code>double[]</code> arrays in the
67       * <code>categoryData</code> collection and each of these arrays must
68       * contain at least two values.</li></ul><p>
69       * This implementation computes the F statistic using the definitional
70       * formula<pre>
71       *   F = msbg/mswg</pre>
72       * where<pre>
73       *  msbg = between group mean square
74       *  mswg = within group mean square</pre>
75       * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
76       * here</a>
77       *
78       * @param categoryData <code>Collection</code> of <code>double[]</code>
79       * arrays each containing data for one category
80       * @return Fvalue
81       * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
82       * @throws DimensionMismatchException if the length of the <code>categoryData</code>
83       * array is less than 2 or a contained <code>double[]</code> array does not have
84       * at least two values
85       */
86      public double anovaFValue(final Collection<double[]> categoryData)
87          throws NullArgumentException, DimensionMismatchException {
88  
89          AnovaStats a = anovaStats(categoryData);
90          return a.f;
91      }
92  
93      /**
94       * Computes the ANOVA P-value for a collection of <code>double[]</code>
95       * arrays.
96       *
97       * <p><strong>Preconditions</strong>: <ul>
98       * <li>The categoryData <code>Collection</code> must contain
99       * <code>double[]</code> arrays.</li>
100      * <li> There must be at least two <code>double[]</code> arrays in the
101      * <code>categoryData</code> collection and each of these arrays must
102      * contain at least two values.</li></ul><p>
103      * This implementation uses the
104      * {@link org.apache.commons.statistics.distribution.FDistribution
105      * commons-math F Distribution implementation} to estimate the exact
106      * p-value, using the formula<pre>
107      *   p = survivalProbability(F)</pre>
108      * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
109      * is the commons-statistics implementation of the F distribution.
110      *
111      * @param categoryData <code>Collection</code> of <code>double[]</code>
112      * arrays each containing data for one category
113      * @return Pvalue
114      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
115      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
116      * array is less than 2 or a contained <code>double[]</code> array does not have
117      * at least two values
118      * @throws ConvergenceException if the p-value can not be computed due to a convergence error
119      * @throws MaxCountExceededException if the maximum number of iterations is exceeded
120      */
121     public double anovaPValue(final Collection<double[]> categoryData)
122         throws NullArgumentException, DimensionMismatchException,
123         ConvergenceException, MaxCountExceededException {
124 
125         final AnovaStats a = anovaStats(categoryData);
126         // No try-catch or advertised exception because args are valid
127         // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
128         final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
129         return fdist.survivalProbability(a.f);
130     }
131 
132     /**
133      * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
134      *
135      * <p><strong>Preconditions</strong>: <ul>
136      * <li>The categoryData <code>Collection</code> must contain
137      * {@link SummaryStatistics}.</li>
138      * <li> There must be at least two {@link SummaryStatistics} in the
139      * <code>categoryData</code> collection and each of these statistics must
140      * contain at least two values.</li></ul><p>
141      * This implementation uses the
142      * {@link org.apache.commons.statistics.distribution.FDistribution
143      * commons-math F Distribution implementation} to estimate the exact
144      * p-value, using the formula<pre>
145      *   p = survivalProbability(F)</pre>
146      * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
147      * is the commons-statistics implementation of the F distribution.
148      *
149      * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
150      * each containing data for one category
151      * @param allowOneElementData if true, allow computation for one catagory
152      * only or for one data element per category
153      * @return Pvalue
154      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
155      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
156      * array is less than 2 or a contained {@link SummaryStatistics} does not have
157      * at least two values
158      * @throws ConvergenceException if the p-value can not be computed due to a convergence error
159      * @throws MaxCountExceededException if the maximum number of iterations is exceeded
160      * @since 3.2
161      */
162     public double anovaPValue(final Collection<SummaryStatistics> categoryData,
163                               final boolean allowOneElementData)
164         throws NullArgumentException, DimensionMismatchException,
165                ConvergenceException, MaxCountExceededException {
166 
167         final AnovaStats a = anovaStats(categoryData, allowOneElementData);
168         // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
169         final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
170         return fdist.survivalProbability(a.f);
171     }
172 
173     /**
174      * This method calls the method that actually does the calculations (except
175      * P-value).
176      *
177      * @param categoryData
178      *            <code>Collection</code> of <code>double[]</code> arrays each
179      *            containing data for one category
180      * @return computed AnovaStats
181      * @throws NullArgumentException
182      *             if <code>categoryData</code> is <code>null</code>
183      * @throws DimensionMismatchException
184      *             if the length of the <code>categoryData</code> array is less
185      *             than 2 or a contained <code>double[]</code> array does not
186      *             contain at least two values
187      */
188     private AnovaStats anovaStats(final Collection<double[]> categoryData)
189         throws NullArgumentException, DimensionMismatchException {
190 
191         NullArgumentException.check(categoryData);
192 
193         final Collection<SummaryStatistics> categoryDataSummaryStatistics =
194                 new ArrayList<>(categoryData.size());
195 
196         // convert arrays to SummaryStatistics
197         for (final double[] data : categoryData) {
198             final SummaryStatistics dataSummaryStatistics = new SummaryStatistics();
199             categoryDataSummaryStatistics.add(dataSummaryStatistics);
200             for (final double val : data) {
201                 dataSummaryStatistics.addValue(val);
202             }
203         }
204 
205         return anovaStats(categoryDataSummaryStatistics, false);
206     }
207 
208     /**
209      * Performs an ANOVA test, evaluating the null hypothesis that there
210      * is no difference among the means of the data categories.
211      *
212      * <p><strong>Preconditions</strong>: <ul>
213      * <li>The categoryData <code>Collection</code> must contain
214      * <code>double[]</code> arrays.</li>
215      * <li> There must be at least two <code>double[]</code> arrays in the
216      * <code>categoryData</code> collection and each of these arrays must
217      * contain at least two values.</li>
218      * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
219      * </li></ul><p>
220      * This implementation uses the
221      * {@link org.apache.commons.statistics.distribution.FDistribution
222      * commons-math F Distribution implementation} to estimate the exact
223      * p-value, using the formula<pre>
224      *   p = survivalProbability(F)</pre>
225      * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
226      * is the commons-statistics implementation of the F distribution.
227      * <p>True is returned iff the estimated p-value is less than alpha.</p>
228      *
229      * @param categoryData <code>Collection</code> of <code>double[]</code>
230      * arrays each containing data for one category
231      * @param alpha significance level of the test
232      * @return true if the null hypothesis can be rejected with
233      * confidence 1 - alpha
234      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
235      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
236      * array is less than 2 or a contained <code>double[]</code> array does not have
237      * at least two values
238      * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
239      * @throws ConvergenceException if the p-value can not be computed due to a convergence error
240      * @throws MaxCountExceededException if the maximum number of iterations is exceeded
241      */
242     public boolean anovaTest(final Collection<double[]> categoryData,
243                              final double alpha)
244         throws NullArgumentException, DimensionMismatchException,
245         OutOfRangeException, ConvergenceException, MaxCountExceededException {
246 
247         if (alpha <= 0 || alpha > 0.5) {
248             throw new OutOfRangeException(
249                     LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
250                     alpha, 0, 0.5);
251         }
252         return anovaPValue(categoryData) < alpha;
253     }
254 
255     /**
256      * This method actually does the calculations (except P-value).
257      *
258      * @param categoryData <code>Collection</code> of <code>double[]</code>
259      * arrays each containing data for one category
260      * @param allowOneElementData if true, allow computation for one catagory
261      * only or for one data element per category
262      * @return computed AnovaStats
263      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
264      * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
265      * categories is less than 2 or a contained SummaryStatistics does not contain
266      * at least two values
267      */
268     private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
269                                   final boolean allowOneElementData)
270         throws NullArgumentException, DimensionMismatchException {
271 
272         NullArgumentException.check(categoryData);
273 
274         if (!allowOneElementData) {
275             // check if we have enough categories
276             if (categoryData.size() < 2) {
277                 throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
278                                                      categoryData.size(), 2);
279             }
280 
281             // check if each category has enough data
282             for (final SummaryStatistics array : categoryData) {
283                 if (array.getN() <= 1) {
284                     throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
285                                                          (int) array.getN(), 2);
286                 }
287             }
288         }
289 
290         int dfwg = 0;
291         double sswg = 0;
292         double totsum = 0;
293         double totsumsq = 0;
294         int totnum = 0;
295 
296         for (final SummaryStatistics data : categoryData) {
297 
298             final double sum = data.getSum();
299             final double sumsq = data.getSumsq();
300             final int num = (int) data.getN();
301             totnum += num;
302             totsum += sum;
303             totsumsq += sumsq;
304 
305             dfwg += num - 1;
306             final double ss = sumsq - ((sum * sum) / num);
307             sswg += ss;
308         }
309 
310         final double sst = totsumsq - ((totsum * totsum) / totnum);
311         final double ssbg = sst - sswg;
312         final int dfbg = categoryData.size() - 1;
313         final double msbg = ssbg / dfbg;
314         final double mswg = sswg / dfwg;
315         final double f = msbg / mswg;
316 
317         return new AnovaStats(dfbg, dfwg, f);
318     }
319 
320     /**
321         Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
322         No get/set methods provided.
323     */
324     private static final class AnovaStats {
325 
326         /** Degrees of freedom in numerator (between groups). */
327         private final int dfbg;
328 
329         /** Degrees of freedom in denominator (within groups). */
330         private final int dfwg;
331 
332         /** Statistic. */
333         private final double f;
334 
335         /**
336          * Constructor.
337          * @param dfbg degrees of freedom in numerator (between groups)
338          * @param dfwg degrees of freedom in denominator (within groups)
339          * @param f statistic
340          */
341         private AnovaStats(int dfbg, int dfwg, double f) {
342             this.dfbg = dfbg;
343             this.dfwg = dfwg;
344             this.f = f;
345         }
346     }
347 }