OneWayAnova.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.math4.legacy.stat.inference;

  18. import java.util.ArrayList;
  19. import java.util.Collection;

  20. import org.apache.commons.statistics.distribution.FDistribution;
  21. import org.apache.commons.math4.legacy.exception.ConvergenceException;
  22. import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
  23. import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
  24. import org.apache.commons.math4.legacy.exception.NullArgumentException;
  25. import org.apache.commons.math4.legacy.exception.OutOfRangeException;
  26. import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
  27. import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;

  28. /**
  29.  * Implements one-way ANOVA (analysis of variance) statistics.
  30.  *
  31.  * <p> Tests for differences between two or more categories of univariate data
  32.  * (for example, the body mass index of accountants, lawyers, doctors and
  33.  * computer programmers).  When two categories are given, this is equivalent to
  34.  * the {@link org.apache.commons.math4.legacy.stat.inference.TTest}.
  35.  * </p><p>
  36.  * Uses the {@link org.apache.commons.statistics.distribution.FDistribution
  37.  * commons-math F Distribution implementation} to estimate exact p-values.</p>
  38.  * <p>This implementation is based on a description at
  39.  * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
  40.  * <pre>
  41.  * Abbreviations: bg = between groups,
  42.  *                wg = within groups,
  43.  *                ss = sum squared deviations
  44.  * </pre>
  45.  *
  46.  * @since 1.2
  47.  */
  48. public class OneWayAnova {

  49.     /**
  50.      * Default constructor.
  51.      */
  52.     public OneWayAnova() {
  53.     }

  54.     /**
  55.      * Computes the ANOVA F-value for a collection of <code>double[]</code>
  56.      * arrays.
  57.      *
  58.      * <p><strong>Preconditions</strong>: <ul>
  59.      * <li>The categoryData <code>Collection</code> must contain
  60.      * <code>double[]</code> arrays.</li>
  61.      * <li> There must be at least two <code>double[]</code> arrays in the
  62.      * <code>categoryData</code> collection and each of these arrays must
  63.      * contain at least two values.</li></ul><p>
  64.      * This implementation computes the F statistic using the definitional
  65.      * formula<pre>
  66.      *   F = msbg/mswg</pre>
  67.      * where<pre>
  68.      *  msbg = between group mean square
  69.      *  mswg = within group mean square</pre>
  70.      * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
  71.      * here</a>
  72.      *
  73.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  74.      * arrays each containing data for one category
  75.      * @return Fvalue
  76.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  77.      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
  78.      * array is less than 2 or a contained <code>double[]</code> array does not have
  79.      * at least two values
  80.      */
  81.     public double anovaFValue(final Collection<double[]> categoryData)
  82.         throws NullArgumentException, DimensionMismatchException {

  83.         AnovaStats a = anovaStats(categoryData);
  84.         return a.f;
  85.     }

  86.     /**
  87.      * Computes the ANOVA P-value for a collection of <code>double[]</code>
  88.      * arrays.
  89.      *
  90.      * <p><strong>Preconditions</strong>: <ul>
  91.      * <li>The categoryData <code>Collection</code> must contain
  92.      * <code>double[]</code> arrays.</li>
  93.      * <li> There must be at least two <code>double[]</code> arrays in the
  94.      * <code>categoryData</code> collection and each of these arrays must
  95.      * contain at least two values.</li></ul><p>
  96.      * This implementation uses the
  97.      * {@link org.apache.commons.statistics.distribution.FDistribution
  98.      * commons-math F Distribution implementation} to estimate the exact
  99.      * p-value, using the formula<pre>
  100.      *   p = survivalProbability(F)</pre>
  101.      * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
  102.      * is the commons-statistics implementation of the F distribution.
  103.      *
  104.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  105.      * arrays each containing data for one category
  106.      * @return Pvalue
  107.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  108.      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
  109.      * array is less than 2 or a contained <code>double[]</code> array does not have
  110.      * at least two values
  111.      * @throws ConvergenceException if the p-value can not be computed due to a convergence error
  112.      * @throws MaxCountExceededException if the maximum number of iterations is exceeded
  113.      */
  114.     public double anovaPValue(final Collection<double[]> categoryData)
  115.         throws NullArgumentException, DimensionMismatchException,
  116.         ConvergenceException, MaxCountExceededException {

  117.         final AnovaStats a = anovaStats(categoryData);
  118.         // No try-catch or advertised exception because args are valid
  119.         // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
  120.         final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
  121.         return fdist.survivalProbability(a.f);
  122.     }

  123.     /**
  124.      * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
  125.      *
  126.      * <p><strong>Preconditions</strong>: <ul>
  127.      * <li>The categoryData <code>Collection</code> must contain
  128.      * {@link SummaryStatistics}.</li>
  129.      * <li> There must be at least two {@link SummaryStatistics} in the
  130.      * <code>categoryData</code> collection and each of these statistics must
  131.      * contain at least two values.</li></ul><p>
  132.      * This implementation uses the
  133.      * {@link org.apache.commons.statistics.distribution.FDistribution
  134.      * commons-math F Distribution implementation} to estimate the exact
  135.      * p-value, using the formula<pre>
  136.      *   p = survivalProbability(F)</pre>
  137.      * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
  138.      * is the commons-statistics implementation of the F distribution.
  139.      *
  140.      * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
  141.      * each containing data for one category
  142.      * @param allowOneElementData if true, allow computation for one catagory
  143.      * only or for one data element per category
  144.      * @return Pvalue
  145.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  146.      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
  147.      * array is less than 2 or a contained {@link SummaryStatistics} does not have
  148.      * at least two values
  149.      * @throws ConvergenceException if the p-value can not be computed due to a convergence error
  150.      * @throws MaxCountExceededException if the maximum number of iterations is exceeded
  151.      * @since 3.2
  152.      */
  153.     public double anovaPValue(final Collection<SummaryStatistics> categoryData,
  154.                               final boolean allowOneElementData)
  155.         throws NullArgumentException, DimensionMismatchException,
  156.                ConvergenceException, MaxCountExceededException {

  157.         final AnovaStats a = anovaStats(categoryData, allowOneElementData);
  158.         // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
  159.         final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg);
  160.         return fdist.survivalProbability(a.f);
  161.     }

  162.     /**
  163.      * This method calls the method that actually does the calculations (except
  164.      * P-value).
  165.      *
  166.      * @param categoryData
  167.      *            <code>Collection</code> of <code>double[]</code> arrays each
  168.      *            containing data for one category
  169.      * @return computed AnovaStats
  170.      * @throws NullArgumentException
  171.      *             if <code>categoryData</code> is <code>null</code>
  172.      * @throws DimensionMismatchException
  173.      *             if the length of the <code>categoryData</code> array is less
  174.      *             than 2 or a contained <code>double[]</code> array does not
  175.      *             contain at least two values
  176.      */
  177.     private AnovaStats anovaStats(final Collection<double[]> categoryData)
  178.         throws NullArgumentException, DimensionMismatchException {

  179.         NullArgumentException.check(categoryData);

  180.         final Collection<SummaryStatistics> categoryDataSummaryStatistics =
  181.                 new ArrayList<>(categoryData.size());

  182.         // convert arrays to SummaryStatistics
  183.         for (final double[] data : categoryData) {
  184.             final SummaryStatistics dataSummaryStatistics = new SummaryStatistics();
  185.             categoryDataSummaryStatistics.add(dataSummaryStatistics);
  186.             for (final double val : data) {
  187.                 dataSummaryStatistics.addValue(val);
  188.             }
  189.         }

  190.         return anovaStats(categoryDataSummaryStatistics, false);
  191.     }

  192.     /**
  193.      * Performs an ANOVA test, evaluating the null hypothesis that there
  194.      * is no difference among the means of the data categories.
  195.      *
  196.      * <p><strong>Preconditions</strong>: <ul>
  197.      * <li>The categoryData <code>Collection</code> must contain
  198.      * <code>double[]</code> arrays.</li>
  199.      * <li> There must be at least two <code>double[]</code> arrays in the
  200.      * <code>categoryData</code> collection and each of these arrays must
  201.      * contain at least two values.</li>
  202.      * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
  203.      * </li></ul><p>
  204.      * This implementation uses the
  205.      * {@link org.apache.commons.statistics.distribution.FDistribution
  206.      * commons-math F Distribution implementation} to estimate the exact
  207.      * p-value, using the formula<pre>
  208.      *   p = survivalProbability(F)</pre>
  209.      * where <code>F</code> is the F value and <code>survivalProbability = 1 - cumulativeProbability</code>
  210.      * is the commons-statistics implementation of the F distribution.
  211.      * <p>True is returned iff the estimated p-value is less than alpha.</p>
  212.      *
  213.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  214.      * arrays each containing data for one category
  215.      * @param alpha significance level of the test
  216.      * @return true if the null hypothesis can be rejected with
  217.      * confidence 1 - alpha
  218.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  219.      * @throws DimensionMismatchException if the length of the <code>categoryData</code>
  220.      * array is less than 2 or a contained <code>double[]</code> array does not have
  221.      * at least two values
  222.      * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
  223.      * @throws ConvergenceException if the p-value can not be computed due to a convergence error
  224.      * @throws MaxCountExceededException if the maximum number of iterations is exceeded
  225.      */
  226.     public boolean anovaTest(final Collection<double[]> categoryData,
  227.                              final double alpha)
  228.         throws NullArgumentException, DimensionMismatchException,
  229.         OutOfRangeException, ConvergenceException, MaxCountExceededException {

  230.         if (alpha <= 0 || alpha > 0.5) {
  231.             throw new OutOfRangeException(
  232.                     LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
  233.                     alpha, 0, 0.5);
  234.         }
  235.         return anovaPValue(categoryData) < alpha;
  236.     }

  237.     /**
  238.      * This method actually does the calculations (except P-value).
  239.      *
  240.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  241.      * arrays each containing data for one category
  242.      * @param allowOneElementData if true, allow computation for one catagory
  243.      * only or for one data element per category
  244.      * @return computed AnovaStats
  245.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  246.      * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
  247.      * categories is less than 2 or a contained SummaryStatistics does not contain
  248.      * at least two values
  249.      */
  250.     private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
  251.                                   final boolean allowOneElementData)
  252.         throws NullArgumentException, DimensionMismatchException {

  253.         NullArgumentException.check(categoryData);

  254.         if (!allowOneElementData) {
  255.             // check if we have enough categories
  256.             if (categoryData.size() < 2) {
  257.                 throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
  258.                                                      categoryData.size(), 2);
  259.             }

  260.             // check if each category has enough data
  261.             for (final SummaryStatistics array : categoryData) {
  262.                 if (array.getN() <= 1) {
  263.                     throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
  264.                                                          (int) array.getN(), 2);
  265.                 }
  266.             }
  267.         }

  268.         int dfwg = 0;
  269.         double sswg = 0;
  270.         double totsum = 0;
  271.         double totsumsq = 0;
  272.         int totnum = 0;

  273.         for (final SummaryStatistics data : categoryData) {

  274.             final double sum = data.getSum();
  275.             final double sumsq = data.getSumsq();
  276.             final int num = (int) data.getN();
  277.             totnum += num;
  278.             totsum += sum;
  279.             totsumsq += sumsq;

  280.             dfwg += num - 1;
  281.             final double ss = sumsq - ((sum * sum) / num);
  282.             sswg += ss;
  283.         }

  284.         final double sst = totsumsq - ((totsum * totsum) / totnum);
  285.         final double ssbg = sst - sswg;
  286.         final int dfbg = categoryData.size() - 1;
  287.         final double msbg = ssbg / dfbg;
  288.         final double mswg = sswg / dfwg;
  289.         final double f = msbg / mswg;

  290.         return new AnovaStats(dfbg, dfwg, f);
  291.     }

  292.     /**
  293.         Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
  294.         No get/set methods provided.
  295.     */
  296.     private static final class AnovaStats {

  297.         /** Degrees of freedom in numerator (between groups). */
  298.         private final int dfbg;

  299.         /** Degrees of freedom in denominator (within groups). */
  300.         private final int dfwg;

  301.         /** Statistic. */
  302.         private final double f;

  303.         /**
  304.          * Constructor.
  305.          * @param dfbg degrees of freedom in numerator (between groups)
  306.          * @param dfwg degrees of freedom in denominator (within groups)
  307.          * @param f statistic
  308.          */
  309.         private AnovaStats(int dfbg, int dfwg, double f) {
  310.             this.dfbg = dfbg;
  311.             this.dfwg = dfwg;
  312.             this.f = f;
  313.         }
  314.     }
  315. }