1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.math4.legacy.stat.inference;
18
19 import java.util.ArrayList;
20 import java.util.Collection;
21
22 import org.apache.commons.statistics.distribution.FDistribution;
23 import org.apache.commons.math4.legacy.exception.ConvergenceException;
24 import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
25 import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
26 import org.apache.commons.math4.legacy.exception.NullArgumentException;
27 import org.apache.commons.math4.legacy.exception.OutOfRangeException;
28 import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
29 import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
30
31 /**
32 * Implements one-way ANOVA (analysis of variance) statistics.
33 *
34 * <p> Tests for differences between two or more categories of univariate data
35 * (for example, the body mass index of accountants, lawyers, doctors and
36 * computer programmers). When two categories are given, this is equivalent to
37 * the {@link org.apache.commons.math4.legacy.stat.inference.TTest}.
38 * </p><p>
39 * Uses the {@link org.apache.commons.statistics.distribution.FDistribution
40 * commons-math F Distribution implementation} to estimate exact p-values.</p>
41 * <p>This implementation is based on a description at
42 * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
43 * <pre>
44 * Abbreviations: bg = between groups,
45 * wg = within groups,
46 * ss = sum squared deviations
47 * </pre>
48 *
49 * @since 1.2
50 */
51 public class OneWayAnova {
52
53 /**
54 * Default constructor.
55 */
56 public OneWayAnova() {
57 }
58
59 /**
60 * Computes the ANOVA F-value for a collection of <code>double[]</code>
61 * arrays.
62 *
63 * <p><strong>Preconditions</strong>: <ul>
64 * <li>The categoryData <code>Collection</code> must contain
65 * <code>double[]</code> arrays.</li>
66 * <li> There must be at least two <code>double[]</code> arrays in the
67 * <code>categoryData</code> collection and each of these arrays must
68 * contain at least two values.</li></ul><p>
69 * This implementation computes the F statistic using the definitional
70 * formula<pre>
71 * F = msbg/mswg</pre>
72 * where<pre>
73 * msbg = between group mean square
74 * mswg = within group mean square</pre>
75 * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
76 * here</a>
77 *
78 * @param categoryData <code>Collection</code> of <code>double[]</code>
79 * arrays each containing data for one category
80 * @return Fvalue
81 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
82 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
83 * array is less than 2 or a contained <code>double[]</code> array does not have
84 * at least two values
85 */
86 public double anovaFValue(final Collection<double[]> categoryData)
87 throws NullArgumentException, DimensionMismatchException {
88
89 AnovaStats a = anovaStats(categoryData);
90 return a.f;
91 }
92
93 /**
94 * Computes the ANOVA P-value for a collection of <code>double[]</code>
95 * arrays.
96 *
97 * <p><strong>Preconditions</strong>: <ul>
98 * <li>The categoryData <code>Collection</code> must contain
99 * <code>double[]</code> arrays.</li>
100 * <li> There must be at least two <code>double[]</code> arrays in the
101 * <code>categoryData</code> collection and each of these arrays must
102 * contain at least two values.</li></ul><p>
103 * This implementation uses the
104 * {@link org.apache.commons.statistics.distribution.FDistribution
105 * commons-math F Distribution implementation} to estimate the exact
106 * p-value, using the formula<pre>
107 * p = survivalProbability(F)</pre>
108 * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
109 * is the commons-statistics implementation of the F distribution.
110 *
111 * @param categoryData <code>Collection</code> of <code>double[]</code>
112 * arrays each containing data for one category
113 * @return Pvalue
114 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
115 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
116 * array is less than 2 or a contained <code>double[]</code> array does not have
117 * at least two values
118 * @throws ConvergenceException if the p-value can not be computed due to a convergence error
119 * @throws MaxCountExceededException if the maximum number of iterations is exceeded
120 */
121 public double anovaPValue(final Collection<double[]> categoryData)
122 throws NullArgumentException, DimensionMismatchException,
123 ConvergenceException, MaxCountExceededException {
124
125 final AnovaStats a = anovaStats(categoryData);
126 // No try-catch or advertised exception because args are valid
127 // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
128 final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
129 return fdist.survivalProbability(a.f);
130 }
131
132 /**
133 * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
134 *
135 * <p><strong>Preconditions</strong>: <ul>
136 * <li>The categoryData <code>Collection</code> must contain
137 * {@link SummaryStatistics}.</li>
138 * <li> There must be at least two {@link SummaryStatistics} in the
139 * <code>categoryData</code> collection and each of these statistics must
140 * contain at least two values.</li></ul><p>
141 * This implementation uses the
142 * {@link org.apache.commons.statistics.distribution.FDistribution
143 * commons-math F Distribution implementation} to estimate the exact
144 * p-value, using the formula<pre>
145 * p = survivalProbability(F)</pre>
146 * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
147 * is the commons-statistics implementation of the F distribution.
148 *
149 * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
150 * each containing data for one category
151 * @param allowOneElementData if true, allow computation for one catagory
152 * only or for one data element per category
153 * @return Pvalue
154 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
155 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
156 * array is less than 2 or a contained {@link SummaryStatistics} does not have
157 * at least two values
158 * @throws ConvergenceException if the p-value can not be computed due to a convergence error
159 * @throws MaxCountExceededException if the maximum number of iterations is exceeded
160 * @since 3.2
161 */
162 public double anovaPValue(final Collection<SummaryStatistics> categoryData,
163 final boolean allowOneElementData)
164 throws NullArgumentException, DimensionMismatchException,
165 ConvergenceException, MaxCountExceededException {
166
167 final AnovaStats a = anovaStats(categoryData, allowOneElementData);
168 // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
169 final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
170 return fdist.survivalProbability(a.f);
171 }
172
173 /**
174 * This method calls the method that actually does the calculations (except
175 * P-value).
176 *
177 * @param categoryData
178 * <code>Collection</code> of <code>double[]</code> arrays each
179 * containing data for one category
180 * @return computed AnovaStats
181 * @throws NullArgumentException
182 * if <code>categoryData</code> is <code>null</code>
183 * @throws DimensionMismatchException
184 * if the length of the <code>categoryData</code> array is less
185 * than 2 or a contained <code>double[]</code> array does not
186 * contain at least two values
187 */
188 private AnovaStats anovaStats(final Collection<double[]> categoryData)
189 throws NullArgumentException, DimensionMismatchException {
190
191 NullArgumentException.check(categoryData);
192
193 final Collection<SummaryStatistics> categoryDataSummaryStatistics =
194 new ArrayList<>(categoryData.size());
195
196 // convert arrays to SummaryStatistics
197 for (final double[] data : categoryData) {
198 final SummaryStatistics dataSummaryStatistics = new SummaryStatistics();
199 categoryDataSummaryStatistics.add(dataSummaryStatistics);
200 for (final double val : data) {
201 dataSummaryStatistics.addValue(val);
202 }
203 }
204
205 return anovaStats(categoryDataSummaryStatistics, false);
206 }
207
208 /**
209 * Performs an ANOVA test, evaluating the null hypothesis that there
210 * is no difference among the means of the data categories.
211 *
212 * <p><strong>Preconditions</strong>: <ul>
213 * <li>The categoryData <code>Collection</code> must contain
214 * <code>double[]</code> arrays.</li>
215 * <li> There must be at least two <code>double[]</code> arrays in the
216 * <code>categoryData</code> collection and each of these arrays must
217 * contain at least two values.</li>
218 * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
219 * </li></ul><p>
220 * This implementation uses the
221 * {@link org.apache.commons.statistics.distribution.FDistribution
222 * commons-math F Distribution implementation} to estimate the exact
223 * p-value, using the formula<pre>
224 * p = survivalProbability(F)</pre>
225 * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
226 * is the commons-statistics implementation of the F distribution.
227 * <p>True is returned iff the estimated p-value is less than alpha.</p>
228 *
229 * @param categoryData <code>Collection</code> of <code>double[]</code>
230 * arrays each containing data for one category
231 * @param alpha significance level of the test
232 * @return true if the null hypothesis can be rejected with
233 * confidence 1 - alpha
234 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
235 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
236 * array is less than 2 or a contained <code>double[]</code> array does not have
237 * at least two values
238 * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
239 * @throws ConvergenceException if the p-value can not be computed due to a convergence error
240 * @throws MaxCountExceededException if the maximum number of iterations is exceeded
241 */
242 public boolean anovaTest(final Collection<double[]> categoryData,
243 final double alpha)
244 throws NullArgumentException, DimensionMismatchException,
245 OutOfRangeException, ConvergenceException, MaxCountExceededException {
246
247 if (alpha <= 0 || alpha > 0.5) {
248 throw new OutOfRangeException(
249 LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
250 alpha, 0, 0.5);
251 }
252 return anovaPValue(categoryData) < alpha;
253 }
254
255 /**
256 * This method actually does the calculations (except P-value).
257 *
258 * @param categoryData <code>Collection</code> of <code>double[]</code>
259 * arrays each containing data for one category
260 * @param allowOneElementData if true, allow computation for one catagory
261 * only or for one data element per category
262 * @return computed AnovaStats
263 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
264 * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
265 * categories is less than 2 or a contained SummaryStatistics does not contain
266 * at least two values
267 */
268 private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
269 final boolean allowOneElementData)
270 throws NullArgumentException, DimensionMismatchException {
271
272 NullArgumentException.check(categoryData);
273
274 if (!allowOneElementData) {
275 // check if we have enough categories
276 if (categoryData.size() < 2) {
277 throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
278 categoryData.size(), 2);
279 }
280
281 // check if each category has enough data
282 for (final SummaryStatistics array : categoryData) {
283 if (array.getN() <= 1) {
284 throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
285 (int) array.getN(), 2);
286 }
287 }
288 }
289
290 int dfwg = 0;
291 double sswg = 0;
292 double totsum = 0;
293 double totsumsq = 0;
294 int totnum = 0;
295
296 for (final SummaryStatistics data : categoryData) {
297
298 final double sum = data.getSum();
299 final double sumsq = data.getSumsq();
300 final int num = (int) data.getN();
301 totnum += num;
302 totsum += sum;
303 totsumsq += sumsq;
304
305 dfwg += num - 1;
306 final double ss = sumsq - ((sum * sum) / num);
307 sswg += ss;
308 }
309
310 final double sst = totsumsq - ((totsum * totsum) / totnum);
311 final double ssbg = sst - sswg;
312 final int dfbg = categoryData.size() - 1;
313 final double msbg = ssbg / dfbg;
314 final double mswg = sswg / dfwg;
315 final double f = msbg / mswg;
316
317 return new AnovaStats(dfbg, dfwg, f);
318 }
319
320 /**
321 Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
322 No get/set methods provided.
323 */
324 private static final class AnovaStats {
325
326 /** Degrees of freedom in numerator (between groups). */
327 private final int dfbg;
328
329 /** Degrees of freedom in denominator (within groups). */
330 private final int dfwg;
331
332 /** Statistic. */
333 private final double f;
334
335 /**
336 * Constructor.
337 * @param dfbg degrees of freedom in numerator (between groups)
338 * @param dfwg degrees of freedom in denominator (within groups)
339 * @param f statistic
340 */
341 private AnovaStats(int dfbg, int dfwg, double f) {
342 this.dfbg = dfbg;
343 this.dfwg = dfwg;
344 this.f = f;
345 }
346 }
347 }