OneWayAnova.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.statistics.inference;

  18. import java.util.Collection;
  19. import java.util.Iterator;
  20. import org.apache.commons.numbers.core.Sum;
  21. import org.apache.commons.statistics.distribution.FDistribution;

  22. /**
  23.  * Implements one-way ANOVA (analysis of variance) statistics.
  24.  *
  25.  * <p>Tests for differences between two or more categories of univariate data
  26.  * (for example, the body mass index of accountants, lawyers, doctors and
  27.  * computer programmers). When two categories are given, this is equivalent to
  28.  * the {@link TTest}.
  29.  *
  30.  * <p>This implementation computes the F statistic using the definitional formula:
  31.  *
  32.  * <p>\[ F = \frac{\text{between-group variability}}{\text{within-group variability}} \]
  33.  *
  34.  * @see <a href="https://en.wikipedia.org/wiki/Analysis_of_variance">Analysis of variance (Wikipedia)</a>
  35.  * @see <a href="https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems">
  36.  * Multiple-comparison ANOVA problems (Wikipedia)</a>
  37.  * @see <a href="https://www.biostathandbook.com/onewayanova.html">
  38.  * McDonald, J.H. 2014. Handbook of Biological Statistics (3rd ed.). Sparky House Publishing, Baltimore, Maryland.
  39.  * One-way anova. pp 145-156.</a>
  40.  * @since 1.1
  41.  */
  42. public final class OneWayAnova {
  43.     /** Default instance. */
  44.     private static final OneWayAnova DEFAULT = new OneWayAnova();

  45.     /**
  46.      * Result for the one-way ANOVA.
  47.      *
  48.      * <p>This class is immutable.
  49.      *
  50.      * @since 1.1
  51.      */
  52.     public static final class Result extends BaseSignificanceResult {
  53.         /** Degrees of freedom in numerator (between groups). */
  54.         private final int dfbg;
  55.         /** Degrees of freedom in denominator (within groups). */
  56.         private final long dfwg;
  57.         /** Mean square between groups. */
  58.         private final double msbg;
  59.         /** Mean square within groups. */
  60.         private final double mswg;
  61.         /** nO value used to partition the variance. */
  62.         private final double nO;

  63.         /**
  64.          * @param dfbg Degrees of freedom in numerator (between groups).
  65.          * @param dfwg Degrees of freedom in denominator (within groups).
  66.          * @param msbg Mean square between groups.
  67.          * @param mswg Mean square within groups.
  68.          * @param nO Factor for partitioning the variance.
  69.          * @param f F statistic
  70.          * @param p P-value.
  71.          */
  72.         Result(int dfbg, long dfwg, double msbg, double mswg, double nO, double f, double p) {
  73.             super(f, p);
  74.             this.dfbg = dfbg;
  75.             this.dfwg = dfwg;
  76.             this.msbg = msbg;
  77.             this.mswg = mswg;
  78.             this.nO = nO;
  79.         }

  80.         /**
  81.          * Gets the degrees of freedom in the numerator (between groups).
  82.          *
  83.          * @return degrees of freedom between groups
  84.          */
  85.         int getDFBG() {
  86.             return dfbg;
  87.         }

  88.         /**
  89.          * Gets the degrees of freedom in the denominator (within groups).
  90.          *
  91.          * @return degrees of freedom within groups
  92.          */
  93.         long getDFWG() {
  94.             return dfwg;
  95.         }

  96.         /**
  97.          * Gets the mean square between groups.
  98.          *
  99.          * @return mean square between groups
  100.          */
  101.         public double getMSBG() {
  102.             return msbg;
  103.         }

  104.         /**
  105.          * Gets the mean square within groups.
  106.          *
  107.          * @return mean square within groups
  108.          */
  109.         public double getMSWG() {
  110.             return mswg;
  111.         }

  112.         /**
  113.          * Gets the variance component between groups.
  114.          *
  115.          * <p>The value is a partitioning of the variance.
  116.          * It is the complement of {@link #getVCWG()}.
  117.          *
  118.          * <p>Partitioning the variance applies only to a model II
  119.          * (random effects) one-way anova. This applies when the
  120.          * groups are random samples from a larger set of groups;
  121.          * partitioning the variance allows comparison of the
  122.          * variation between groups to the variation within groups.
  123.          *
  124.          * <p>If the {@linkplain #getMSBG() MSBG} is less than the
  125.          * {@linkplain #getMSWG() MSWG} this returns 0. Otherwise this
  126.          * creates an estimate of the added variance component
  127.          * between groups as:
  128.          *
  129.          * <p>\[ \text{between-group variance} = A = (\text{MS}_{\text{bg}} - \text{MS}_{\text{wg}}) / n_o \]
  130.          *
  131.          * <p>where \( n_o \) is a number close to, but usually less than,
  132.          * the arithmetic mean of the sample size \(n_i\) of each
  133.          * of the \( a \) groups:
  134.          *
  135.          * <p>\[ n_o = \frac{1}{a-1} \left( \sum_i{n_i} - \frac{\sum_i{n_i^2}}{\sum_i{n_i}} \right) \]
  136.          *
  137.          * <p>The added variance component among groups \( A \) is expressed
  138.          * as a fraction of the total variance components \( A + B \) where
  139.          * \( B \) is the {@linkplain #getMSWG() MSWG}.
  140.          *
  141.          * @return variance component between groups (in [0, 1]).
  142.          */
  143.         public double getVCBG() {
  144.             if (msbg <= mswg) {
  145.                 return 0;
  146.             }
  147.             // a is an estimate of the between-group variance
  148.             final double a = (msbg - mswg) / nO;
  149.             final double b = mswg;
  150.             return a / (a + b);
  151.         }

  152.         /**
  153.          * Gets the variance component within groups.
  154.          *
  155.          * <p>The value is a partitioning of the variance.
  156.          * It is the complement of {@link #getVCBG()}. See
  157.          * that method for details.
  158.          *
  159.          * @return variance component within groups (in [0, 1]).
  160.          */
  161.         public double getVCWG() {
  162.             if (msbg <= mswg) {
  163.                 return 1;
  164.             }
  165.             final double a = (msbg - mswg) / nO;
  166.             final double b = mswg;
  167.             return b / (a + b);
  168.         }
  169.     }

  170.     /** Private constructor. */
  171.     private OneWayAnova() {
  172.         // Do nothing
  173.     }

  174.     /**
  175.      * Return an instance using the default options.
  176.      *
  177.      * @return default instance
  178.      */
  179.     public static OneWayAnova withDefaults() {
  180.         return DEFAULT;
  181.     }

  182.     /**
  183.      * Computes the F statistic for an ANOVA test for a collection of category data,
  184.      * evaluating the null hypothesis that there is no difference among the means of
  185.      * the data categories.
  186.      *
  187.      * <p>Special cases:
  188.      * <ul>
  189.      * <li>If the value in each category is the same (no variance within groups) but different
  190.      * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity}.
  191.      * <li>If the value in every group is the same (no variance within or between groups),
  192.      * the f-value is {@link Double#NaN NaN}.
  193.      * </ul>
  194.      *
  195.      * @param data Category summary data.
  196.      * @return F statistic
  197.      * @throws IllegalArgumentException if the number of categories is less than
  198.      * two; a contained category does not have at least one value; or all
  199.      * categories have only one value (zero degrees of freedom within groups)
  200.      */
  201.     public double statistic(Collection<double[]> data) {
  202.         final double[] f = new double[1];
  203.         aov(data, f);
  204.         return f[0];
  205.     }

  206.     /**
  207.      * Performs an ANOVA test for a collection of category data,
  208.      * evaluating the null hypothesis that there is no difference among the means of
  209.      * the data categories.
  210.      *
  211.      * <p>Special cases:
  212.      * <ul>
  213.      * <li>If the value in each category is the same (no variance within groups) but different
  214.      * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity} and the p-value is zero.
  215.      * <li>If the value in every group is the same (no variance within or between groups),
  216.      * the f-value and p-value are {@link Double#NaN NaN}.
  217.      * </ul>
  218.      *
  219.      * @param data Category summary data.
  220.      * @return test result
  221.      * @throws IllegalArgumentException if the number of categories is less than
  222.      * two; a contained category does not have at least one value; or all
  223.      * categories have only one value (zero degrees of freedom within groups)
  224.      */
  225.     public Result test(Collection<double[]> data) {
  226.         return aov(data, null);
  227.     }

  228.     /**
  229.      * Performs an ANOVA test for a collection of category data, evaluating the null
  230.      * hypothesis that there is no difference among the means of the data categories.
  231.      *
  232.      * <p>This is a utility method to allow computation of the F statistic without
  233.      * the p-value or partitioning of the variance. If the {@code statistic} is not null
  234.      * the method will record the F statistic in the array and return null.
  235.      *
  236.      * @param data Category summary data.
  237.      * @param statistic Result for the F statistic (or null).
  238.      * @return test result (or null)
  239.      * @throws IllegalArgumentException if the number of categories is less than two; a
  240.      * contained category does not have at least one value; or all categories have only
  241.      * one value (zero degrees of freedom within groups)
  242.      */
  243.     private static Result aov(Collection<double[]> data, double[] statistic) {
  244.         Arguments.checkCategoriesRequiredSize(data.size(), 2);
  245.         long n = 0;
  246.         for (final double[] array : data) {
  247.             n += array.length;
  248.             Arguments.checkValuesRequiredSize(array.length, 1);
  249.         }
  250.         final long dfwg = n - data.size();
  251.         if (dfwg == 0) {
  252.             throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups");
  253.         }

  254.         // wg = within group
  255.         // bg = between group

  256.         // F = Var(bg) / Var(wg)
  257.         // Var = SS / df
  258.         // SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n
  259.         //         = SSwg + SSbg
  260.         // Some cancellation of terms reduces the computation to 3 sums:
  261.         // SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ]
  262.         // SSbg = SStotal - SSwg
  263.         //      = sum_g { sum(x)^2/n) } - sum(x)^2/n
  264.         // SSwg = SStotal - SSbg
  265.         //      = sum(x^2) - sum_g { sum(x)^2/n) }

  266.         // Stabilize the computation by shifting all to a common mean of zero.
  267.         // This minimise the magnitude of x^2 terms.
  268.         // The terms sum(x)^2/n -> 0. Included them to capture the round-off.
  269.         final double m = StatisticUtils.mean(data);
  270.         final Sum sxx = Sum.create();
  271.         final Sum sx = Sum.create();
  272.         final Sum sg = Sum.create();
  273.         // Track if each group had the same value
  274.         boolean eachSame = true;
  275.         for (final double[] array : data) {
  276.             eachSame = eachSame && allMatch(array[0], array);
  277.             final Sum s = Sum.create();
  278.             for (final double v : array) {
  279.                 final double x = v - m;
  280.                 s.add(x);
  281.                 // sum(x)
  282.                 sx.add(x);
  283.                 // sum(x^2)
  284.                 sxx.add(x * x);
  285.             }
  286.             // Create the negative sum so we can subtract it via 'add'
  287.             // -sum_g { sum(x)^2/n) }
  288.             sg.add(-pow2(s.getAsDouble()) / array.length);
  289.         }

  290.         // Note: SS terms should not be negative given:
  291.         // SS = sum((x - u)^2)
  292.         // This can happen due to floating-point error in sum(x^2) - sum(x)^2/n
  293.         final double sswg = Math.max(0, sxx.add(sg).getAsDouble());
  294.         // Flip the sign back
  295.         final double ssbg = Math.max(0, -sg.add(pow2(sx.getAsDouble()) / n).getAsDouble());
  296.         final int dfbg = data.size() - 1;
  297.         // Handle edge-cases:
  298.         // Note: 0 / 0 -> NaN : x / 0 -> inf
  299.         // These are documented results and should output p=NaN or 0.
  300.         // This result will occur naturally.
  301.         // However the SS totals may not be 0.0 so we correct these cases.
  302.         final boolean allSame = eachSame && allMatch(data);
  303.         final double msbg = allSame ? 0 : ssbg / dfbg;
  304.         final double mswg = eachSame ? 0 : sswg / dfwg;
  305.         final double f = msbg / mswg;
  306.         if (statistic != null) {
  307.             statistic[0] = f;
  308.             return null;
  309.         }
  310.         final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f);

  311.         // Support partitioning the variance
  312.         // ni = size of each of the groups
  313.         // nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni))
  314.         final double nO = (n - data.stream()
  315.                 .mapToDouble(x -> pow2(x.length)).sum() / n) / dfbg;

  316.         return new Result(dfbg, dfwg, msbg, mswg, nO, f, p);
  317.     }

  318.     /**
  319.      * Return true if all values in the array match the specified value.
  320.      *
  321.      * @param v Value.
  322.      * @param a Array.
  323.      * @return true if all match
  324.      */
  325.     private static boolean allMatch(double v, double[] a) {
  326.         for (final double w : a) {
  327.             if (v != w) {
  328.                 return false;
  329.             }
  330.         }
  331.         return true;
  332.     }

  333.     /**
  334.      * Return true if all values in the arrays match.
  335.      *
  336.      * <p>Assumes that there are at least two arrays and that each array has the same
  337.      * value throughout. Thus only the first element in each array is checked.
  338.      *
  339.      * @param data Arrays.
  340.      * @return true if all match
  341.      */
  342.     private static boolean allMatch(Collection<double[]> data) {
  343.         final Iterator<double[]> iter = data.iterator();
  344.         final double v = iter.next()[0];
  345.         while (iter.hasNext()) {
  346.             if (iter.next()[0] != v) {
  347.                 return false;
  348.             }
  349.         }
  350.         return true;
  351.     }

  352.     /**
  353.      * Compute {@code x^2}.
  354.      *
  355.      * @param x Value.
  356.      * @return {@code x^2}
  357.      */
  358.     private static double pow2(double x) {
  359.         return x * x;
  360.     }
  361. }