001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math3.stat.inference;
018
019import java.util.ArrayList;
020import java.util.Collection;
021
022import org.apache.commons.math3.distribution.FDistribution;
023import org.apache.commons.math3.exception.ConvergenceException;
024import org.apache.commons.math3.exception.DimensionMismatchException;
025import org.apache.commons.math3.exception.MaxCountExceededException;
026import org.apache.commons.math3.exception.NullArgumentException;
027import org.apache.commons.math3.exception.OutOfRangeException;
028import org.apache.commons.math3.exception.util.LocalizedFormats;
029import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
030import org.apache.commons.math3.util.MathUtils;
031
032/**
033 * Implements one-way ANOVA (analysis of variance) statistics.
034 *
035 * <p> Tests for differences between two or more categories of univariate data
036 * (for example, the body mass index of accountants, lawyers, doctors and
037 * computer programmers).  When two categories are given, this is equivalent to
038 * the {@link org.apache.commons.math3.stat.inference.TTest}.
039 * </p><p>
040 * Uses the {@link org.apache.commons.math3.distribution.FDistribution
041 * commons-math F Distribution implementation} to estimate exact p-values.</p>
042 * <p>This implementation is based on a description at
043 * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
044 * <pre>
045 * Abbreviations: bg = between groups,
046 *                wg = within groups,
047 *                ss = sum squared deviations
048 * </pre>
049 *
050 * @since 1.2
051 * @version $Id: OneWayAnova.java 1462423 2013-03-29 07:25:18Z luc $
052 */
053public class OneWayAnova {
054
055    /**
056     * Default constructor.
057     */
058    public OneWayAnova() {
059    }
060
061    /**
062     * Computes the ANOVA F-value for a collection of <code>double[]</code>
063     * arrays.
064     *
065     * <p><strong>Preconditions</strong>: <ul>
066     * <li>The categoryData <code>Collection</code> must contain
067     * <code>double[]</code> arrays.</li>
068     * <li> There must be at least two <code>double[]</code> arrays in the
069     * <code>categoryData</code> collection and each of these arrays must
070     * contain at least two values.</li></ul></p><p>
071     * This implementation computes the F statistic using the definitional
072     * formula<pre>
073     *   F = msbg/mswg</pre>
074     * where<pre>
075     *  msbg = between group mean square
076     *  mswg = within group mean square</pre>
077     * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
078     * here</a></p>
079     *
080     * @param categoryData <code>Collection</code> of <code>double[]</code>
081     * arrays each containing data for one category
082     * @return Fvalue
083     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
084     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
085     * array is less than 2 or a contained <code>double[]</code> array does not have
086     * at least two values
087     */
088    public double anovaFValue(final Collection<double[]> categoryData)
089        throws NullArgumentException, DimensionMismatchException {
090
091        AnovaStats a = anovaStats(categoryData);
092        return a.F;
093
094    }
095
096    /**
097     * Computes the ANOVA P-value for a collection of <code>double[]</code>
098     * arrays.
099     *
100     * <p><strong>Preconditions</strong>: <ul>
101     * <li>The categoryData <code>Collection</code> must contain
102     * <code>double[]</code> arrays.</li>
103     * <li> There must be at least two <code>double[]</code> arrays in the
104     * <code>categoryData</code> collection and each of these arrays must
105     * contain at least two values.</li></ul></p><p>
106     * This implementation uses the
107     * {@link org.apache.commons.math3.distribution.FDistribution
108     * commons-math F Distribution implementation} to estimate the exact
109     * p-value, using the formula<pre>
110     *   p = 1 - cumulativeProbability(F)</pre>
111     * where <code>F</code> is the F value and <code>cumulativeProbability</code>
112     * is the commons-math implementation of the F distribution.</p>
113     *
114     * @param categoryData <code>Collection</code> of <code>double[]</code>
115     * arrays each containing data for one category
116     * @return Pvalue
117     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
118     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
119     * array is less than 2 or a contained <code>double[]</code> array does not have
120     * at least two values
121     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
122     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
123     */
124    public double anovaPValue(final Collection<double[]> categoryData)
125        throws NullArgumentException, DimensionMismatchException,
126        ConvergenceException, MaxCountExceededException {
127
128        AnovaStats a = anovaStats(categoryData);
129        // No try-catch or advertised exception because args are valid
130        FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
131        return 1.0 - fdist.cumulativeProbability(a.F);
132
133    }
134
135    /**
136     * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
137     *
138     * <p><strong>Preconditions</strong>: <ul>
139     * <li>The categoryData <code>Collection</code> must contain
140     * {@link SummaryStatistics}.</li>
141     * <li> There must be at least two {@link SummaryStatistics} in the
142     * <code>categoryData</code> collection and each of these statistics must
143     * contain at least two values.</li></ul></p><p>
144     * This implementation uses the
145     * {@link org.apache.commons.math3.distribution.FDistribution
146     * commons-math F Distribution implementation} to estimate the exact
147     * p-value, using the formula<pre>
148     *   p = 1 - cumulativeProbability(F)</pre>
149     * where <code>F</code> is the F value and <code>cumulativeProbability</code>
150     * is the commons-math implementation of the F distribution.</p>
151     *
152     * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
153     * each containing data for one category
154     * @param allowOneElementData if true, allow computation for one catagory
155     * only or for one data element per category
156     * @return Pvalue
157     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
158     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
159     * array is less than 2 or a contained {@link SummaryStatistics} does not have
160     * at least two values
161     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
162     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
163     * @since 3.2
164     */
165    public double anovaPValue(final Collection<SummaryStatistics> categoryData,
166                              final boolean allowOneElementData)
167        throws NullArgumentException, DimensionMismatchException,
168               ConvergenceException, MaxCountExceededException {
169
170        final AnovaStats a = anovaStats(categoryData, allowOneElementData);
171        final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
172        return 1.0 - fdist.cumulativeProbability(a.F);
173
174    }
175
176    /**
177     * This method calls the method that actually does the calculations (except
178     * P-value).
179     *
180     * @param categoryData
181     *            <code>Collection</code> of <code>double[]</code> arrays each
182     *            containing data for one category
183     * @return computed AnovaStats
184     * @throws NullArgumentException
185     *             if <code>categoryData</code> is <code>null</code>
186     * @throws DimensionMismatchException
187     *             if the length of the <code>categoryData</code> array is less
188     *             than 2 or a contained <code>double[]</code> array does not
189     *             contain at least two values
190     */
191    private AnovaStats anovaStats(final Collection<double[]> categoryData)
192        throws NullArgumentException, DimensionMismatchException {
193
194        MathUtils.checkNotNull(categoryData);
195
196        final Collection<SummaryStatistics> categoryDataSummaryStatistics =
197                new ArrayList<SummaryStatistics>(categoryData.size());
198
199        // convert arrays to SummaryStatistics
200        for (final double[] data : categoryData) {
201            final SummaryStatistics dataSummaryStatistics = new SummaryStatistics();
202            categoryDataSummaryStatistics.add(dataSummaryStatistics);
203            for (final double val : data) {
204                dataSummaryStatistics.addValue(val);
205            }
206        }
207
208        return anovaStats(categoryDataSummaryStatistics, false);
209
210    }
211
212    /**
213     * Performs an ANOVA test, evaluating the null hypothesis that there
214     * is no difference among the means of the data categories.
215     *
216     * <p><strong>Preconditions</strong>: <ul>
217     * <li>The categoryData <code>Collection</code> must contain
218     * <code>double[]</code> arrays.</li>
219     * <li> There must be at least two <code>double[]</code> arrays in the
220     * <code>categoryData</code> collection and each of these arrays must
221     * contain at least two values.</li>
222     * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
223     * </li></ul></p><p>
224     * This implementation uses the
225     * {@link org.apache.commons.math3.distribution.FDistribution
226     * commons-math F Distribution implementation} to estimate the exact
227     * p-value, using the formula<pre>
228     *   p = 1 - cumulativeProbability(F)</pre>
229     * where <code>F</code> is the F value and <code>cumulativeProbability</code>
230     * is the commons-math implementation of the F distribution.</p>
231     * <p>True is returned iff the estimated p-value is less than alpha.</p>
232     *
233     * @param categoryData <code>Collection</code> of <code>double[]</code>
234     * arrays each containing data for one category
235     * @param alpha significance level of the test
236     * @return true if the null hypothesis can be rejected with
237     * confidence 1 - alpha
238     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
239     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
240     * array is less than 2 or a contained <code>double[]</code> array does not have
241     * at least two values
242     * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
243     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
244     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
245     */
246    public boolean anovaTest(final Collection<double[]> categoryData,
247                             final double alpha)
248        throws NullArgumentException, DimensionMismatchException,
249        OutOfRangeException, ConvergenceException, MaxCountExceededException {
250
251        if ((alpha <= 0) || (alpha > 0.5)) {
252            throw new OutOfRangeException(
253                    LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
254                    alpha, 0, 0.5);
255        }
256        return anovaPValue(categoryData) < alpha;
257
258    }
259
260    /**
261     * This method actually does the calculations (except P-value).
262     *
263     * @param categoryData <code>Collection</code> of <code>double[]</code>
264     * arrays each containing data for one category
265     * @param allowOneElementData if true, allow computation for one catagory
266     * only or for one data element per category
267     * @return computed AnovaStats
268     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
269     * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
270     * categories is less than 2 or a contained SummaryStatistics does not contain
271     * at least two values
272     */
273    private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
274                                  final boolean allowOneElementData)
275        throws NullArgumentException, DimensionMismatchException {
276
277        MathUtils.checkNotNull(categoryData);
278
279        if (!allowOneElementData) {
280            // check if we have enough categories
281            if (categoryData.size() < 2) {
282                throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
283                                                     categoryData.size(), 2);
284            }
285
286            // check if each category has enough data
287            for (final SummaryStatistics array : categoryData) {
288                if (array.getN() <= 1) {
289                    throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
290                                                         (int) array.getN(), 2);
291                }
292            }
293        }
294
295        int dfwg = 0;
296        double sswg = 0;
297        double totsum = 0;
298        double totsumsq = 0;
299        int totnum = 0;
300
301        for (final SummaryStatistics data : categoryData) {
302
303            final double sum = data.getSum();
304            final double sumsq = data.getSumsq();
305            final int num = (int) data.getN();
306            totnum += num;
307            totsum += sum;
308            totsumsq += sumsq;
309
310            dfwg += num - 1;
311            final double ss = sumsq - ((sum * sum) / num);
312            sswg += ss;
313        }
314
315        final double sst = totsumsq - ((totsum * totsum) / totnum);
316        final double ssbg = sst - sswg;
317        final int dfbg = categoryData.size() - 1;
318        final double msbg = ssbg / dfbg;
319        final double mswg = sswg / dfwg;
320        final double F = msbg / mswg;
321
322        return new AnovaStats(dfbg, dfwg, F);
323
324    }
325
326    /**
327        Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
328        No get/set methods provided.
329    */
330    private static class AnovaStats {
331
332        /** Degrees of freedom in numerator (between groups). */
333        private final int dfbg;
334
335        /** Degrees of freedom in denominator (within groups). */
336        private final int dfwg;
337
338        /** Statistic. */
339        private final double F;
340
341        /**
342         * Constructor
343         * @param dfbg degrees of freedom in numerator (between groups)
344         * @param dfwg degrees of freedom in denominator (within groups)
345         * @param F statistic
346         */
347        private AnovaStats(int dfbg, int dfwg, double F) {
348            this.dfbg = dfbg;
349            this.dfwg = dfwg;
350            this.F = F;
351        }
352    }
353
354}