GTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.statistics.inference;

  18. import org.apache.commons.numbers.core.Sum;
  19. import org.apache.commons.statistics.descriptive.LongMean;
  20. import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

  21. /**
  22.  * Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
  23.  *
  24.  * <p>This is known in statistical genetics as the McDonald-Kreitman test.
  25.  * The implementation handles both known and unknown distributions.
  26.  *
  27.  * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
  28.  * but provided by one sample, or when the hypothesis under test is that the two
  29.  * samples come from the same underlying distribution.
  30.  *
  31.  * @see <a href="https://en.wikipedia.org/wiki/G-test">G-test (Wikipedia)</a>
  32.  * @since 1.1
  33.  */
  34. public final class GTest {
  35.     // Note:
  36.     // The g-test statistic is a summation of terms with positive and negative sign
  37.     // and thus the sum may exhibit cancellation. This class uses separate high precision
  38.     // sums of the positive and negative terms which are then combined.
  39.     // Total cancellation for a large number of terms will not impact
  40.     // p-values of interest around critical alpha values as the Chi^2
  41.     // distribution exhibits strong concentration around its mean (degrees of freedom, k).
  42.     // The summation only need maintain enough bits in the final sum to distinguish
  43.     // g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k,
  44.     // with k = number of terms - 1.

  45.     /** Default instance. */
  46.     private static final GTest DEFAULT = new GTest(0);

  47.     /** Degrees of freedom adjustment. */
  48.     private final int degreesOfFreedomAdjustment;

  49.     /**
  50.      * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
  51.      */
  52.     private GTest(int degreesOfFreedomAdjustment) {
  53.         this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
  54.     }

  55.     /**
  56.      * Return an instance using the default options.
  57.      *
  58.      * <ul>
  59.      * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
  60.      * </ul>
  61.      *
  62.      * @return default instance
  63.      */
  64.     public static GTest withDefaults() {
  65.         return DEFAULT;
  66.     }

  67.     /**
  68.      * Return an instance with the configured degrees of freedom adjustment.
  69.      *
  70.      * <p>The default degrees of freedom for a sample of length {@code n} are
  71.      * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
  72.      * more parameters from the data in order to get the numbers for your null
  73.      * hypothesis. For a distribution with {@code p} parameters where up to
  74.      * {@code p} parameters have been estimated from the data the degrees of freedom
  75.      * is in the range {@code [n - 1 - p, n - 1]}.
  76.      *
  77.      * @param v Value.
  78.      * @return an instance
  79.      * @throws IllegalArgumentException if the value is negative
  80.      */
  81.     public GTest withDegreesOfFreedomAdjustment(int v) {
  82.         return new GTest(Arguments.checkNonNegative(v));
  83.     }

  84.     /**
  85.      * Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to
  86.      * a uniform expected value (each category is equally likely).
  87.      *
  88.      * <p>Note: This is a specialized version of a comparison of {@code observed}
  89.      * with an {@code expected} array of uniform values. The result is faster than
  90.      * calling {@link #statistic(double[], long[])} and the statistic is the same,
  91.      * with an allowance for accumulated floating-point error due to the optimized
  92.      * routine.
  93.      *
  94.      * @param observed Observed frequency counts.
  95.      * @return G-test statistic
  96.      * @throws IllegalArgumentException if the sample size is less than 2;
  97.      * {@code observed} has negative entries; or all the observations are zero.
  98.      * @see #test(long[])
  99.      */
  100.     public double statistic(long[] observed) {
  101.         Arguments.checkValuesRequiredSize(observed.length, 2);
  102.         Arguments.checkNonNegative(observed);
  103.         final double e = LongMean.of(observed).getAsDouble();
  104.         if (e == 0) {
  105.             throw new InferenceException(InferenceException.NO_DATA);
  106.         }
  107.         // g = 2 * sum{o * ln(o/e)}
  108.         //   = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ]
  109.         // The second form has more cancellation as the sums are larger.
  110.         // Separate sum for positive and negative terms.
  111.         final Sum sum = Sum.create();
  112.         final Sum sum2 = Sum.create();
  113.         for (final double o : observed) {
  114.             if (o > e) {
  115.                 // Positive term
  116.                 sum.add(o * Math.log(o / e));
  117.             } else if (o > 0) {
  118.                 // Negative term
  119.                 // Process non-zero counts to avoid 0 * -inf = NaN
  120.                 sum2.add(o * Math.log(o / e));
  121.             }
  122.         }
  123.         return sum.add(sum2).getAsDouble() * 2;
  124.     }

  125.     /**
  126.      * Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected}
  127.      * frequency counts.
  128.      *
  129.      * <p><strong>Note:</strong>This implementation rescales the values
  130.      * if necessary to ensure that the sum of the expected and observed counts
  131.      * are equal.
  132.      *
  133.      * @param expected Expected frequency counts.
  134.      * @param observed Observed frequency counts.
  135.      * @return G-test statistic
  136.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  137.      * sizes do not match; {@code expected} has entries that are not strictly
  138.      * positive; {@code observed} has negative entries; or all the observations are zero.
  139.      * @see #test(double[], long[])
  140.      */
  141.     public double statistic(double[] expected, long[] observed) {
  142.         // g = 2 * sum{o * ln(o/e)}
  143.         // The sum of o and e must be the same.
  144.         final double ratio = StatisticUtils.computeRatio(expected, observed);
  145.         // High precision sum to reduce cancellation.
  146.         // Separate sum for positive and negative terms.
  147.         final Sum sum = Sum.create();
  148.         final Sum sum2 = Sum.create();
  149.         for (int i = 0; i < observed.length; i++) {
  150.             final long o = observed[i];
  151.             // Process non-zero counts to avoid 0 * -inf = NaN
  152.             if (o != 0) {
  153.                 final double term = o * Math.log(o / (ratio * expected[i]));
  154.                 if (term < 0) {
  155.                     sum2.add(term);
  156.                 } else {
  157.                     sum.add(term);
  158.                 }
  159.             }
  160.         }
  161.         return sum.add(sum2).getAsDouble() * 2;
  162.     }

  163.     /**
  164.      * Computes a G-test statistic associated with a G-test of
  165.      * independence based on the input {@code counts} array, viewed as a two-way
  166.      * table. The formula used to compute the test statistic is:
  167.      *
  168.      * <p>\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \]
  169.      *
  170.      * <p>and \( H \) is the <a
  171.      * href="https://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
  172.      * Shannon Entropy</a> of the random variable formed by viewing the elements of
  173.      * the argument array as incidence counts:
  174.      *
  175.      * <p>\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \]
  176.      *
  177.      * @param counts 2-way table.
  178.      * @return G-test statistic
  179.      * @throws IllegalArgumentException if the number of rows or columns is less
  180.      * than 2; the array is non-rectangular; the array has negative entries; or the
  181.      * sum of a row or column is zero.
  182.      * @see ChiSquareTest#test(long[][])
  183.      */
  184.     public double statistic(long[][] counts) {
  185.         Arguments.checkCategoriesRequiredSize(counts.length, 2);
  186.         Arguments.checkValuesRequiredSize(counts[0].length, 2);
  187.         Arguments.checkRectangular(counts);
  188.         Arguments.checkNonNegative(counts);

  189.         final int ni = counts.length;
  190.         final int nj = counts[0].length;

  191.         // Compute row, column and total sums
  192.         final double[] sumi = new double[ni];
  193.         final double[] sumj = new double[nj];
  194.         double n = 0;
  195.         // We can sum data on the first pass. See below for computation details.
  196.         final Sum sum = Sum.create();
  197.         for (int i = 0; i < ni; i++) {
  198.             for (int j = 0; j < nj; j++) {
  199.                 final long c = counts[i][j];
  200.                 sumi[i] += c;
  201.                 sumj[j] += c;
  202.                 if (c > 1) {
  203.                     sum.add(c * Math.log(c));
  204.                 }
  205.             }
  206.             checkNonZero(sumi[i], "Row", i);
  207.             n += sumi[i];
  208.         }

  209.         for (int j = 0; j < nj; j++) {
  210.             checkNonZero(sumj[j], "Column", j);
  211.         }

  212.         // This computes a modified form of the Shannon entropy H without requiring
  213.         // normalisation of observations to probabilities and without negation,
  214.         // i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ].

  215.         // H  = -sum (p * log(p))
  216.         // H' = n * sum (p * log(p))
  217.         //    = n * sum (o/n * log(o/n))
  218.         //    = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
  219.         //    = sum(o * log(o)) - n log(n)

  220.         // After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n)
  221.         sum.addProduct(n, Math.log(n));
  222.         // Negative terms
  223.         final Sum sum2 = Sum.create();
  224.         // All these counts are above zero so no check for zeros
  225.         for (final double c : sumi) {
  226.             sum2.add(c * -Math.log(c));
  227.         }
  228.         for (final double c : sumj) {
  229.             sum2.add(c * -Math.log(c));
  230.         }

  231.         return sum.add(sum2).getAsDouble() * 2;
  232.     }

  233.     /**
  234.      * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
  235.      * counts conform to a uniform distribution (each category is equally likely).
  236.      *
  237.      * @param observed Observed frequency counts.
  238.      * @return test result
  239.      * @throws IllegalArgumentException if the sample size is less than 2;
  240.      * {@code observed} has negative entries; or all the observations are zero
  241.      * @see #statistic(long[])
  242.      */
  243.     public SignificanceResult test(long[] observed) {
  244.         final int df = observed.length - 1;
  245.         final double g = statistic(observed);
  246.         final double p = computeP(g, df);
  247.         return new BaseSignificanceResult(g, p);
  248.     }

  249.     /**
  250.      * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
  251.      * counts conform to the {@code expected} counts.
  252.      *
  253.      * <p>The test can be configured to apply an adjustment to the degrees of freedom
  254.      * if the observed data has been used to create the expected counts.
  255.      *
  256.      * @param expected Expected frequency counts.
  257.      * @param observed Observed frequency counts.
  258.      * @return test result
  259.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  260.      * sizes do not match; {@code expected} has entries that are not strictly
  261.      * positive; {@code observed} has negative entries; all the observations are zero; or
  262.      * the adjusted degrees of freedom are not strictly positive
  263.      * @see #withDegreesOfFreedomAdjustment(int)
  264.      * @see #statistic(double[], long[])
  265.      */
  266.     public SignificanceResult test(double[] expected, long[] observed) {
  267.         final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
  268.         final double g = statistic(expected, observed);
  269.         final double p = computeP(g, df);
  270.         return new BaseSignificanceResult(g, p);
  271.     }

  272.     /**
  273.      * Perform a G-test of independence based on the input
  274.      * {@code counts} array, viewed as a two-way table.
  275.      *
  276.      * @param counts 2-way table.
  277.      * @return test result
  278.      * @throws IllegalArgumentException if the number of rows or columns is less
  279.      * than 2; the array is non-rectangular; the array has negative entries; or the
  280.      * sum of a row or column is zero.
  281.      * @see #statistic(long[][])
  282.      */
  283.     public SignificanceResult test(long[][] counts) {
  284.         final double g = statistic(counts);
  285.         final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
  286.         final double p = computeP(g, df);
  287.         return new BaseSignificanceResult(g, p);
  288.     }

  289.     /**
  290.      * Compute the G-test p-value.
  291.      *
  292.      * @param g G-test statistic.
  293.      * @param degreesOfFreedom Degrees of freedom.
  294.      * @return p-value
  295.      */
  296.     private static double computeP(double g, double degreesOfFreedom) {
  297.         return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g);
  298.     }

  299.     /**
  300.      * Check the array value is non-zero.
  301.      *
  302.      * @param value Value
  303.      * @param name Name of the array
  304.      * @param index Index in the array
  305.      * @throws IllegalArgumentException if the value is zero
  306.      */
  307.     private static void checkNonZero(double value, String name, int index) {
  308.         if (value == 0) {
  309.             throw new InferenceException(InferenceException.ZERO_AT, name, index);
  310.         }
  311.     }
  312. }