001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.math4.legacy.stat.inference; 018 019import java.util.ArrayList; 020import java.util.Collection; 021 022import org.apache.commons.statistics.distribution.FDistribution; 023import org.apache.commons.math4.legacy.exception.ConvergenceException; 024import org.apache.commons.math4.legacy.exception.DimensionMismatchException; 025import org.apache.commons.math4.legacy.exception.MaxCountExceededException; 026import org.apache.commons.math4.legacy.exception.NullArgumentException; 027import org.apache.commons.math4.legacy.exception.OutOfRangeException; 028import org.apache.commons.math4.legacy.exception.util.LocalizedFormats; 029import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics; 030 031/** 032 * Implements one-way ANOVA (analysis of variance) statistics. 033 * 034 * <p> Tests for differences between two or more categories of univariate data 035 * (for example, the body mass index of accountants, lawyers, doctors and 036 * computer programmers). When two categories are given, this is equivalent to 037 * the {@link org.apache.commons.math4.legacy.stat.inference.TTest}. 038 * </p><p> 039 * Uses the {@link org.apache.commons.statistics.distribution.FDistribution 040 * commons-math F Distribution implementation} to estimate exact p-values.</p> 041 * <p>This implementation is based on a description at 042 * http://faculty.vassar.edu/lowry/ch13pt1.html</p> 043 * <pre> 044 * Abbreviations: bg = between groups, 045 * wg = within groups, 046 * ss = sum squared deviations 047 * </pre> 048 * 049 * @since 1.2 050 */ 051public class OneWayAnova { 052 053 /** 054 * Default constructor. 055 */ 056 public OneWayAnova() { 057 } 058 059 /** 060 * Computes the ANOVA F-value for a collection of <code>double[]</code> 061 * arrays. 062 * 063 * <p><strong>Preconditions</strong>: <ul> 064 * <li>The categoryData <code>Collection</code> must contain 065 * <code>double[]</code> arrays.</li> 066 * <li> There must be at least two <code>double[]</code> arrays in the 067 * <code>categoryData</code> collection and each of these arrays must 068 * contain at least two values.</li></ul><p> 069 * This implementation computes the F statistic using the definitional 070 * formula<pre> 071 * F = msbg/mswg</pre> 072 * where<pre> 073 * msbg = between group mean square 074 * mswg = within group mean square</pre> 075 * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html"> 076 * here</a> 077 * 078 * @param categoryData <code>Collection</code> of <code>double[]</code> 079 * arrays each containing data for one category 080 * @return Fvalue 081 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 082 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 083 * array is less than 2 or a contained <code>double[]</code> array does not have 084 * at least two values 085 */ 086 public double anovaFValue(final Collection<double[]> categoryData) 087 throws NullArgumentException, DimensionMismatchException { 088 089 AnovaStats a = anovaStats(categoryData); 090 return a.f; 091 } 092 093 /** 094 * Computes the ANOVA P-value for a collection of <code>double[]</code> 095 * arrays. 096 * 097 * <p><strong>Preconditions</strong>: <ul> 098 * <li>The categoryData <code>Collection</code> must contain 099 * <code>double[]</code> arrays.</li> 100 * <li> There must be at least two <code>double[]</code> arrays in the 101 * <code>categoryData</code> collection and each of these arrays must 102 * contain at least two values.</li></ul><p> 103 * This implementation uses the 104 * {@link org.apache.commons.statistics.distribution.FDistribution 105 * commons-math F Distribution implementation} to estimate the exact 106 * p-value, using the formula<pre> 107 * p = survivalProbability(F)</pre> 108 * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code> 109 * is the commons-statistics implementation of the F distribution. 110 * 111 * @param categoryData <code>Collection</code> of <code>double[]</code> 112 * arrays each containing data for one category 113 * @return Pvalue 114 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 115 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 116 * array is less than 2 or a contained <code>double[]</code> array does not have 117 * at least two values 118 * @throws ConvergenceException if the p-value can not be computed due to a convergence error 119 * @throws MaxCountExceededException if the maximum number of iterations is exceeded 120 */ 121 public double anovaPValue(final Collection<double[]> categoryData) 122 throws NullArgumentException, DimensionMismatchException, 123 ConvergenceException, MaxCountExceededException { 124 125 final AnovaStats a = anovaStats(categoryData); 126 // No try-catch or advertised exception because args are valid 127 // pass a null rng to avoid unneeded overhead as we will not sample from this distribution 128 final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg); 129 return fdist.survivalProbability(a.f); 130 } 131 132 /** 133 * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}. 134 * 135 * <p><strong>Preconditions</strong>: <ul> 136 * <li>The categoryData <code>Collection</code> must contain 137 * {@link SummaryStatistics}.</li> 138 * <li> There must be at least two {@link SummaryStatistics} in the 139 * <code>categoryData</code> collection and each of these statistics must 140 * contain at least two values.</li></ul><p> 141 * This implementation uses the 142 * {@link org.apache.commons.statistics.distribution.FDistribution 143 * commons-math F Distribution implementation} to estimate the exact 144 * p-value, using the formula<pre> 145 * p = survivalProbability(F)</pre> 146 * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code> 147 * is the commons-statistics implementation of the F distribution. 148 * 149 * @param categoryData <code>Collection</code> of {@link SummaryStatistics} 150 * each containing data for one category 151 * @param allowOneElementData if true, allow computation for one catagory 152 * only or for one data element per category 153 * @return Pvalue 154 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 155 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 156 * array is less than 2 or a contained {@link SummaryStatistics} does not have 157 * at least two values 158 * @throws ConvergenceException if the p-value can not be computed due to a convergence error 159 * @throws MaxCountExceededException if the maximum number of iterations is exceeded 160 * @since 3.2 161 */ 162 public double anovaPValue(final Collection<SummaryStatistics> categoryData, 163 final boolean allowOneElementData) 164 throws NullArgumentException, DimensionMismatchException, 165 ConvergenceException, MaxCountExceededException { 166 167 final AnovaStats a = anovaStats(categoryData, allowOneElementData); 168 // pass a null rng to avoid unneeded overhead as we will not sample from this distribution 169 final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg); 170 return fdist.survivalProbability(a.f); 171 } 172 173 /** 174 * This method calls the method that actually does the calculations (except 175 * P-value). 176 * 177 * @param categoryData 178 * <code>Collection</code> of <code>double[]</code> arrays each 179 * containing data for one category 180 * @return computed AnovaStats 181 * @throws NullArgumentException 182 * if <code>categoryData</code> is <code>null</code> 183 * @throws DimensionMismatchException 184 * if the length of the <code>categoryData</code> array is less 185 * than 2 or a contained <code>double[]</code> array does not 186 * contain at least two values 187 */ 188 private AnovaStats anovaStats(final Collection<double[]> categoryData) 189 throws NullArgumentException, DimensionMismatchException { 190 191 NullArgumentException.check(categoryData); 192 193 final Collection<SummaryStatistics> categoryDataSummaryStatistics = 194 new ArrayList<>(categoryData.size()); 195 196 // convert arrays to SummaryStatistics 197 for (final double[] data : categoryData) { 198 final SummaryStatistics dataSummaryStatistics = new SummaryStatistics(); 199 categoryDataSummaryStatistics.add(dataSummaryStatistics); 200 for (final double val : data) { 201 dataSummaryStatistics.addValue(val); 202 } 203 } 204 205 return anovaStats(categoryDataSummaryStatistics, false); 206 } 207 208 /** 209 * Performs an ANOVA test, evaluating the null hypothesis that there 210 * is no difference among the means of the data categories. 211 * 212 * <p><strong>Preconditions</strong>: <ul> 213 * <li>The categoryData <code>Collection</code> must contain 214 * <code>double[]</code> arrays.</li> 215 * <li> There must be at least two <code>double[]</code> arrays in the 216 * <code>categoryData</code> collection and each of these arrays must 217 * contain at least two values.</li> 218 * <li>alpha must be strictly greater than 0 and less than or equal to 0.5. 219 * </li></ul><p> 220 * This implementation uses the 221 * {@link org.apache.commons.statistics.distribution.FDistribution 222 * commons-math F Distribution implementation} to estimate the exact 223 * p-value, using the formula<pre> 224 * p = survivalProbability(F)</pre> 225 * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code> 226 * is the commons-statistics implementation of the F distribution. 227 * <p>True is returned iff the estimated p-value is less than alpha.</p> 228 * 229 * @param categoryData <code>Collection</code> of <code>double[]</code> 230 * arrays each containing data for one category 231 * @param alpha significance level of the test 232 * @return true if the null hypothesis can be rejected with 233 * confidence 1 - alpha 234 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 235 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 236 * array is less than 2 or a contained <code>double[]</code> array does not have 237 * at least two values 238 * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5] 239 * @throws ConvergenceException if the p-value can not be computed due to a convergence error 240 * @throws MaxCountExceededException if the maximum number of iterations is exceeded 241 */ 242 public boolean anovaTest(final Collection<double[]> categoryData, 243 final double alpha) 244 throws NullArgumentException, DimensionMismatchException, 245 OutOfRangeException, ConvergenceException, MaxCountExceededException { 246 247 if (alpha <= 0 || alpha > 0.5) { 248 throw new OutOfRangeException( 249 LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, 250 alpha, 0, 0.5); 251 } 252 return anovaPValue(categoryData) < alpha; 253 } 254 255 /** 256 * This method actually does the calculations (except P-value). 257 * 258 * @param categoryData <code>Collection</code> of <code>double[]</code> 259 * arrays each containing data for one category 260 * @param allowOneElementData if true, allow computation for one catagory 261 * only or for one data element per category 262 * @return computed AnovaStats 263 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 264 * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of 265 * categories is less than 2 or a contained SummaryStatistics does not contain 266 * at least two values 267 */ 268 private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData, 269 final boolean allowOneElementData) 270 throws NullArgumentException, DimensionMismatchException { 271 272 NullArgumentException.check(categoryData); 273 274 if (!allowOneElementData) { 275 // check if we have enough categories 276 if (categoryData.size() < 2) { 277 throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED, 278 categoryData.size(), 2); 279 } 280 281 // check if each category has enough data 282 for (final SummaryStatistics array : categoryData) { 283 if (array.getN() <= 1) { 284 throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED, 285 (int) array.getN(), 2); 286 } 287 } 288 } 289 290 int dfwg = 0; 291 double sswg = 0; 292 double totsum = 0; 293 double totsumsq = 0; 294 int totnum = 0; 295 296 for (final SummaryStatistics data : categoryData) { 297 298 final double sum = data.getSum(); 299 final double sumsq = data.getSumsq(); 300 final int num = (int) data.getN(); 301 totnum += num; 302 totsum += sum; 303 totsumsq += sumsq; 304 305 dfwg += num - 1; 306 final double ss = sumsq - ((sum * sum) / num); 307 sswg += ss; 308 } 309 310 final double sst = totsumsq - ((totsum * totsum) / totnum); 311 final double ssbg = sst - sswg; 312 final int dfbg = categoryData.size() - 1; 313 final double msbg = ssbg / dfbg; 314 final double mswg = sswg / dfwg; 315 final double f = msbg / mswg; 316 317 return new AnovaStats(dfbg, dfwg, f); 318 } 319 320 /** 321 Convenience class to pass dfbg,dfwg,F values around within OneWayAnova. 322 No get/set methods provided. 323 */ 324 private static final class AnovaStats { 325 326 /** Degrees of freedom in numerator (between groups). */ 327 private final int dfbg; 328 329 /** Degrees of freedom in denominator (within groups). */ 330 private final int dfwg; 331 332 /** Statistic. */ 333 private final double f; 334 335 /** 336 * Constructor. 337 * @param dfbg degrees of freedom in numerator (between groups) 338 * @param dfwg degrees of freedom in denominator (within groups) 339 * @param f statistic 340 */ 341 private AnovaStats(int dfbg, int dfwg, double f) { 342 this.dfbg = dfbg; 343 this.dfwg = dfwg; 344 this.f = f; 345 } 346 } 347}