001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math4.stat.inference;
018
019import java.util.ArrayList;
020import java.util.Collection;
021
022import org.apache.commons.statistics.distribution.FDistribution;
023import org.apache.commons.math4.exception.ConvergenceException;
024import org.apache.commons.math4.exception.DimensionMismatchException;
025import org.apache.commons.math4.exception.MaxCountExceededException;
026import org.apache.commons.math4.exception.NullArgumentException;
027import org.apache.commons.math4.exception.OutOfRangeException;
028import org.apache.commons.math4.exception.util.LocalizedFormats;
029import org.apache.commons.math4.stat.descriptive.SummaryStatistics;
030import org.apache.commons.math4.util.MathUtils;
031
032/**
033 * Implements one-way ANOVA (analysis of variance) statistics.
034 *
035 * <p> Tests for differences between two or more categories of univariate data
036 * (for example, the body mass index of accountants, lawyers, doctors and
037 * computer programmers).  When two categories are given, this is equivalent to
038 * the {@link org.apache.commons.math4.stat.inference.TTest}.
039 * </p><p>
040 * Uses the {@link org.apache.commons.statistics.distribution.FDistribution
041 * commons-math F Distribution implementation} to estimate exact p-values.</p>
042 * <p>This implementation is based on a description at
043 * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
044 * <pre>
045 * Abbreviations: bg = between groups,
046 *                wg = within groups,
047 *                ss = sum squared deviations
048 * </pre>
049 *
050 * @since 1.2
051 */
052public class OneWayAnova {
053
054    /**
055     * Default constructor.
056     */
057    public OneWayAnova() {
058    }
059
060    /**
061     * Computes the ANOVA F-value for a collection of <code>double[]</code>
062     * arrays.
063     *
064     * <p><strong>Preconditions</strong>: <ul>
065     * <li>The categoryData <code>Collection</code> must contain
066     * <code>double[]</code> arrays.</li>
067     * <li> There must be at least two <code>double[]</code> arrays in the
068     * <code>categoryData</code> collection and each of these arrays must
069     * contain at least two values.</li></ul><p>
070     * This implementation computes the F statistic using the definitional
071     * formula<pre>
072     *   F = msbg/mswg</pre>
073     * where<pre>
074     *  msbg = between group mean square
075     *  mswg = within group mean square</pre>
076     * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
077     * here</a>
078     *
079     * @param categoryData <code>Collection</code> of <code>double[]</code>
080     * arrays each containing data for one category
081     * @return Fvalue
082     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
083     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
084     * array is less than 2 or a contained <code>double[]</code> array does not have
085     * at least two values
086     */
087    public double anovaFValue(final Collection<double[]> categoryData)
088        throws NullArgumentException, DimensionMismatchException {
089
090        AnovaStats a = anovaStats(categoryData);
091        return a.F;
092
093    }
094
095    /**
096     * Computes the ANOVA P-value for a collection of <code>double[]</code>
097     * arrays.
098     *
099     * <p><strong>Preconditions</strong>: <ul>
100     * <li>The categoryData <code>Collection</code> must contain
101     * <code>double[]</code> arrays.</li>
102     * <li> There must be at least two <code>double[]</code> arrays in the
103     * <code>categoryData</code> collection and each of these arrays must
104     * contain at least two values.</li></ul><p>
105     * This implementation uses the
106     * {@link org.apache.commons.statistics.distribution.FDistribution
107     * commons-math F Distribution implementation} to estimate the exact
108     * p-value, using the formula<pre>
109     *   p = 1 - cumulativeProbability(F)</pre>
110     * where <code>F</code> is the F value and <code>cumulativeProbability</code>
111     * is the commons-math implementation of the F distribution.
112     *
113     * @param categoryData <code>Collection</code> of <code>double[]</code>
114     * arrays each containing data for one category
115     * @return Pvalue
116     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
117     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
118     * array is less than 2 or a contained <code>double[]</code> array does not have
119     * at least two values
120     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
121     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
122     */
123    public double anovaPValue(final Collection<double[]> categoryData)
124        throws NullArgumentException, DimensionMismatchException,
125        ConvergenceException, MaxCountExceededException {
126
127        final AnovaStats a = anovaStats(categoryData);
128        // No try-catch or advertised exception because args are valid
129        // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
130        final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
131        return 1.0 - fdist.cumulativeProbability(a.F);
132
133    }
134
135    /**
136     * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
137     *
138     * <p><strong>Preconditions</strong>: <ul>
139     * <li>The categoryData <code>Collection</code> must contain
140     * {@link SummaryStatistics}.</li>
141     * <li> There must be at least two {@link SummaryStatistics} in the
142     * <code>categoryData</code> collection and each of these statistics must
143     * contain at least two values.</li></ul><p>
144     * This implementation uses the
145     * {@link org.apache.commons.statistics.distribution.FDistribution
146     * commons-math F Distribution implementation} to estimate the exact
147     * p-value, using the formula<pre>
148     *   p = 1 - cumulativeProbability(F)</pre>
149     * where <code>F</code> is the F value and <code>cumulativeProbability</code>
150     * is the commons-math implementation of the F distribution.
151     *
152     * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
153     * each containing data for one category
154     * @param allowOneElementData if true, allow computation for one catagory
155     * only or for one data element per category
156     * @return Pvalue
157     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
158     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
159     * array is less than 2 or a contained {@link SummaryStatistics} does not have
160     * at least two values
161     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
162     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
163     * @since 3.2
164     */
165    public double anovaPValue(final Collection<SummaryStatistics> categoryData,
166                              final boolean allowOneElementData)
167        throws NullArgumentException, DimensionMismatchException,
168               ConvergenceException, MaxCountExceededException {
169
170        final AnovaStats a = anovaStats(categoryData, allowOneElementData);
171        // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
172        final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
173        return 1.0 - fdist.cumulativeProbability(a.F);
174
175    }
176
177    /**
178     * This method calls the method that actually does the calculations (except
179     * P-value).
180     *
181     * @param categoryData
182     *            <code>Collection</code> of <code>double[]</code> arrays each
183     *            containing data for one category
184     * @return computed AnovaStats
185     * @throws NullArgumentException
186     *             if <code>categoryData</code> is <code>null</code>
187     * @throws DimensionMismatchException
188     *             if the length of the <code>categoryData</code> array is less
189     *             than 2 or a contained <code>double[]</code> array does not
190     *             contain at least two values
191     */
192    private AnovaStats anovaStats(final Collection<double[]> categoryData)
193        throws NullArgumentException, DimensionMismatchException {
194
195        MathUtils.checkNotNull(categoryData);
196
197        final Collection<SummaryStatistics> categoryDataSummaryStatistics =
198                new ArrayList<>(categoryData.size());
199
200        // convert arrays to SummaryStatistics
201        for (final double[] data : categoryData) {
202            final SummaryStatistics dataSummaryStatistics = new SummaryStatistics();
203            categoryDataSummaryStatistics.add(dataSummaryStatistics);
204            for (final double val : data) {
205                dataSummaryStatistics.addValue(val);
206            }
207        }
208
209        return anovaStats(categoryDataSummaryStatistics, false);
210
211    }
212
213    /**
214     * Performs an ANOVA test, evaluating the null hypothesis that there
215     * is no difference among the means of the data categories.
216     *
217     * <p><strong>Preconditions</strong>: <ul>
218     * <li>The categoryData <code>Collection</code> must contain
219     * <code>double[]</code> arrays.</li>
220     * <li> There must be at least two <code>double[]</code> arrays in the
221     * <code>categoryData</code> collection and each of these arrays must
222     * contain at least two values.</li>
223     * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
224     * </li></ul><p>
225     * This implementation uses the
226     * {@link org.apache.commons.statistics.distribution.FDistribution
227     * commons-math F Distribution implementation} to estimate the exact
228     * p-value, using the formula<pre>
229     *   p = 1 - cumulativeProbability(F)</pre>
230     * where <code>F</code> is the F value and <code>cumulativeProbability</code>
231     * is the commons-math implementation of the F distribution.
232     * <p>True is returned iff the estimated p-value is less than alpha.</p>
233     *
234     * @param categoryData <code>Collection</code> of <code>double[]</code>
235     * arrays each containing data for one category
236     * @param alpha significance level of the test
237     * @return true if the null hypothesis can be rejected with
238     * confidence 1 - alpha
239     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
240     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
241     * array is less than 2 or a contained <code>double[]</code> array does not have
242     * at least two values
243     * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
244     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
245     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
246     */
247    public boolean anovaTest(final Collection<double[]> categoryData,
248                             final double alpha)
249        throws NullArgumentException, DimensionMismatchException,
250        OutOfRangeException, ConvergenceException, MaxCountExceededException {
251
252        if ((alpha <= 0) || (alpha > 0.5)) {
253            throw new OutOfRangeException(
254                    LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
255                    alpha, 0, 0.5);
256        }
257        return anovaPValue(categoryData) < alpha;
258
259    }
260
261    /**
262     * This method actually does the calculations (except P-value).
263     *
264     * @param categoryData <code>Collection</code> of <code>double[]</code>
265     * arrays each containing data for one category
266     * @param allowOneElementData if true, allow computation for one catagory
267     * only or for one data element per category
268     * @return computed AnovaStats
269     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
270     * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
271     * categories is less than 2 or a contained SummaryStatistics does not contain
272     * at least two values
273     */
274    private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
275                                  final boolean allowOneElementData)
276        throws NullArgumentException, DimensionMismatchException {
277
278        MathUtils.checkNotNull(categoryData);
279
280        if (!allowOneElementData) {
281            // check if we have enough categories
282            if (categoryData.size() < 2) {
283                throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
284                                                     categoryData.size(), 2);
285            }
286
287            // check if each category has enough data
288            for (final SummaryStatistics array : categoryData) {
289                if (array.getN() <= 1) {
290                    throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
291                                                         (int) array.getN(), 2);
292                }
293            }
294        }
295
296        int dfwg = 0;
297        double sswg = 0;
298        double totsum = 0;
299        double totsumsq = 0;
300        int totnum = 0;
301
302        for (final SummaryStatistics data : categoryData) {
303
304            final double sum = data.getSum();
305            final double sumsq = data.getSumsq();
306            final int num = (int) data.getN();
307            totnum += num;
308            totsum += sum;
309            totsumsq += sumsq;
310
311            dfwg += num - 1;
312            final double ss = sumsq - ((sum * sum) / num);
313            sswg += ss;
314        }
315
316        final double sst = totsumsq - ((totsum * totsum) / totnum);
317        final double ssbg = sst - sswg;
318        final int dfbg = categoryData.size() - 1;
319        final double msbg = ssbg / dfbg;
320        final double mswg = sswg / dfwg;
321        final double F = msbg / mswg;
322
323        return new AnovaStats(dfbg, dfwg, F);
324
325    }
326
327    /**
328        Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
329        No get/set methods provided.
330    */
331    private static class AnovaStats {
332
333        /** Degrees of freedom in numerator (between groups). */
334        private final int dfbg;
335
336        /** Degrees of freedom in denominator (within groups). */
337        private final int dfwg;
338
339        /** Statistic. */
340        private final double F;
341
342        /**
343         * Constructor
344         * @param dfbg degrees of freedom in numerator (between groups)
345         * @param dfwg degrees of freedom in denominator (within groups)
346         * @param F statistic
347         */
348        private AnovaStats(int dfbg, int dfwg, double F) {
349            this.dfbg = dfbg;
350            this.dfwg = dfwg;
351            this.F = F;
352        }
353    }
354
355}