001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math4.legacy.stat.inference;
018
019import java.util.ArrayList;
020import java.util.Collection;
021
022import org.apache.commons.statistics.distribution.FDistribution;
023import org.apache.commons.math4.legacy.exception.ConvergenceException;
024import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
025import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
026import org.apache.commons.math4.legacy.exception.NullArgumentException;
027import org.apache.commons.math4.legacy.exception.OutOfRangeException;
028import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
029import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
030
031/**
032 * Implements one-way ANOVA (analysis of variance) statistics.
033 *
034 * <p> Tests for differences between two or more categories of univariate data
035 * (for example, the body mass index of accountants, lawyers, doctors and
036 * computer programmers).  When two categories are given, this is equivalent to
037 * the {@link org.apache.commons.math4.legacy.stat.inference.TTest}.
038 * </p><p>
039 * Uses the {@link org.apache.commons.statistics.distribution.FDistribution
040 * commons-math F Distribution implementation} to estimate exact p-values.</p>
041 * <p>This implementation is based on a description at
042 * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
043 * <pre>
044 * Abbreviations: bg = between groups,
045 *                wg = within groups,
046 *                ss = sum squared deviations
047 * </pre>
048 *
049 * @since 1.2
050 */
051public class OneWayAnova {
052
053    /**
054     * Default constructor.
055     */
056    public OneWayAnova() {
057    }
058
059    /**
060     * Computes the ANOVA F-value for a collection of <code>double[]</code>
061     * arrays.
062     *
063     * <p><strong>Preconditions</strong>: <ul>
064     * <li>The categoryData <code>Collection</code> must contain
065     * <code>double[]</code> arrays.</li>
066     * <li> There must be at least two <code>double[]</code> arrays in the
067     * <code>categoryData</code> collection and each of these arrays must
068     * contain at least two values.</li></ul><p>
069     * This implementation computes the F statistic using the definitional
070     * formula<pre>
071     *   F = msbg/mswg</pre>
072     * where<pre>
073     *  msbg = between group mean square
074     *  mswg = within group mean square</pre>
075     * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
076     * here</a>
077     *
078     * @param categoryData <code>Collection</code> of <code>double[]</code>
079     * arrays each containing data for one category
080     * @return Fvalue
081     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
082     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
083     * array is less than 2 or a contained <code>double[]</code> array does not have
084     * at least two values
085     */
086    public double anovaFValue(final Collection<double[]> categoryData)
087        throws NullArgumentException, DimensionMismatchException {
088
089        AnovaStats a = anovaStats(categoryData);
090        return a.f;
091    }
092
093    /**
094     * Computes the ANOVA P-value for a collection of <code>double[]</code>
095     * arrays.
096     *
097     * <p><strong>Preconditions</strong>: <ul>
098     * <li>The categoryData <code>Collection</code> must contain
099     * <code>double[]</code> arrays.</li>
100     * <li> There must be at least two <code>double[]</code> arrays in the
101     * <code>categoryData</code> collection and each of these arrays must
102     * contain at least two values.</li></ul><p>
103     * This implementation uses the
104     * {@link org.apache.commons.statistics.distribution.FDistribution
105     * commons-math F Distribution implementation} to estimate the exact
106     * p-value, using the formula<pre>
107     *   p = survivalProbability(F)</pre>
108     * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
109     * is the commons-statistics implementation of the F distribution.
110     *
111     * @param categoryData <code>Collection</code> of <code>double[]</code>
112     * arrays each containing data for one category
113     * @return Pvalue
114     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
115     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
116     * array is less than 2 or a contained <code>double[]</code> array does not have
117     * at least two values
118     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
119     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
120     */
121    public double anovaPValue(final Collection<double[]> categoryData)
122        throws NullArgumentException, DimensionMismatchException,
123        ConvergenceException, MaxCountExceededException {
124
125        final AnovaStats a = anovaStats(categoryData);
126        // No try-catch or advertised exception because args are valid
127        // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
128        final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
129        return fdist.survivalProbability(a.f);
130    }
131
132    /**
133     * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
134     *
135     * <p><strong>Preconditions</strong>: <ul>
136     * <li>The categoryData <code>Collection</code> must contain
137     * {@link SummaryStatistics}.</li>
138     * <li> There must be at least two {@link SummaryStatistics} in the
139     * <code>categoryData</code> collection and each of these statistics must
140     * contain at least two values.</li></ul><p>
141     * This implementation uses the
142     * {@link org.apache.commons.statistics.distribution.FDistribution
143     * commons-math F Distribution implementation} to estimate the exact
144     * p-value, using the formula<pre>
145     *   p = survivalProbability(F)</pre>
146     * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
147     * is the commons-statistics implementation of the F distribution.
148     *
149     * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
150     * each containing data for one category
151     * @param allowOneElementData if true, allow computation for one catagory
152     * only or for one data element per category
153     * @return Pvalue
154     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
155     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
156     * array is less than 2 or a contained {@link SummaryStatistics} does not have
157     * at least two values
158     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
159     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
160     * @since 3.2
161     */
162    public double anovaPValue(final Collection<SummaryStatistics> categoryData,
163                              final boolean allowOneElementData)
164        throws NullArgumentException, DimensionMismatchException,
165               ConvergenceException, MaxCountExceededException {
166
167        final AnovaStats a = anovaStats(categoryData, allowOneElementData);
168        // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
169        final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
170        return fdist.survivalProbability(a.f);
171    }
172
173    /**
174     * This method calls the method that actually does the calculations (except
175     * P-value).
176     *
177     * @param categoryData
178     *            <code>Collection</code> of <code>double[]</code> arrays each
179     *            containing data for one category
180     * @return computed AnovaStats
181     * @throws NullArgumentException
182     *             if <code>categoryData</code> is <code>null</code>
183     * @throws DimensionMismatchException
184     *             if the length of the <code>categoryData</code> array is less
185     *             than 2 or a contained <code>double[]</code> array does not
186     *             contain at least two values
187     */
188    private AnovaStats anovaStats(final Collection<double[]> categoryData)
189        throws NullArgumentException, DimensionMismatchException {
190
191        NullArgumentException.check(categoryData);
192
193        final Collection<SummaryStatistics> categoryDataSummaryStatistics =
194                new ArrayList<>(categoryData.size());
195
196        // convert arrays to SummaryStatistics
197        for (final double[] data : categoryData) {
198            final SummaryStatistics dataSummaryStatistics = new SummaryStatistics();
199            categoryDataSummaryStatistics.add(dataSummaryStatistics);
200            for (final double val : data) {
201                dataSummaryStatistics.addValue(val);
202            }
203        }
204
205        return anovaStats(categoryDataSummaryStatistics, false);
206    }
207
208    /**
209     * Performs an ANOVA test, evaluating the null hypothesis that there
210     * is no difference among the means of the data categories.
211     *
212     * <p><strong>Preconditions</strong>: <ul>
213     * <li>The categoryData <code>Collection</code> must contain
214     * <code>double[]</code> arrays.</li>
215     * <li> There must be at least two <code>double[]</code> arrays in the
216     * <code>categoryData</code> collection and each of these arrays must
217     * contain at least two values.</li>
218     * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
219     * </li></ul><p>
220     * This implementation uses the
221     * {@link org.apache.commons.statistics.distribution.FDistribution
222     * commons-math F Distribution implementation} to estimate the exact
223     * p-value, using the formula<pre>
224     *   p = survivalProbability(F)</pre>
225     * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
226     * is the commons-statistics implementation of the F distribution.
227     * <p>True is returned iff the estimated p-value is less than alpha.</p>
228     *
229     * @param categoryData <code>Collection</code> of <code>double[]</code>
230     * arrays each containing data for one category
231     * @param alpha significance level of the test
232     * @return true if the null hypothesis can be rejected with
233     * confidence 1 - alpha
234     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
235     * @throws DimensionMismatchException if the length of the <code>categoryData</code>
236     * array is less than 2 or a contained <code>double[]</code> array does not have
237     * at least two values
238     * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
239     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
240     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
241     */
242    public boolean anovaTest(final Collection<double[]> categoryData,
243                             final double alpha)
244        throws NullArgumentException, DimensionMismatchException,
245        OutOfRangeException, ConvergenceException, MaxCountExceededException {
246
247        if (alpha <= 0 || alpha > 0.5) {
248            throw new OutOfRangeException(
249                    LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
250                    alpha, 0, 0.5);
251        }
252        return anovaPValue(categoryData) < alpha;
253    }
254
255    /**
256     * This method actually does the calculations (except P-value).
257     *
258     * @param categoryData <code>Collection</code> of <code>double[]</code>
259     * arrays each containing data for one category
260     * @param allowOneElementData if true, allow computation for one catagory
261     * only or for one data element per category
262     * @return computed AnovaStats
263     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
264     * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
265     * categories is less than 2 or a contained SummaryStatistics does not contain
266     * at least two values
267     */
268    private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
269                                  final boolean allowOneElementData)
270        throws NullArgumentException, DimensionMismatchException {
271
272        NullArgumentException.check(categoryData);
273
274        if (!allowOneElementData) {
275            // check if we have enough categories
276            if (categoryData.size() < 2) {
277                throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
278                                                     categoryData.size(), 2);
279            }
280
281            // check if each category has enough data
282            for (final SummaryStatistics array : categoryData) {
283                if (array.getN() <= 1) {
284                    throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
285                                                         (int) array.getN(), 2);
286                }
287            }
288        }
289
290        int dfwg = 0;
291        double sswg = 0;
292        double totsum = 0;
293        double totsumsq = 0;
294        int totnum = 0;
295
296        for (final SummaryStatistics data : categoryData) {
297
298            final double sum = data.getSum();
299            final double sumsq = data.getSumsq();
300            final int num = (int) data.getN();
301            totnum += num;
302            totsum += sum;
303            totsumsq += sumsq;
304
305            dfwg += num - 1;
306            final double ss = sumsq - ((sum * sum) / num);
307            sswg += ss;
308        }
309
310        final double sst = totsumsq - ((totsum * totsum) / totnum);
311        final double ssbg = sst - sswg;
312        final int dfbg = categoryData.size() - 1;
313        final double msbg = ssbg / dfbg;
314        final double mswg = sswg / dfwg;
315        final double f = msbg / mswg;
316
317        return new AnovaStats(dfbg, dfwg, f);
318    }
319
320    /**
321        Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
322        No get/set methods provided.
323    */
324    private static final class AnovaStats {
325
326        /** Degrees of freedom in numerator (between groups). */
327        private final int dfbg;
328
329        /** Degrees of freedom in denominator (within groups). */
330        private final int dfwg;
331
332        /** Statistic. */
333        private final double f;
334
335        /**
336         * Constructor.
337         * @param dfbg degrees of freedom in numerator (between groups)
338         * @param dfwg degrees of freedom in denominator (within groups)
339         * @param f statistic
340         */
341        private AnovaStats(int dfbg, int dfwg, double f) {
342            this.dfbg = dfbg;
343            this.dfwg = dfwg;
344            this.f = f;
345        }
346    }
347}