GTest.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.commons.statistics.inference;
- import org.apache.commons.numbers.core.Sum;
- import org.apache.commons.statistics.descriptive.LongMean;
- import org.apache.commons.statistics.distribution.ChiSquaredDistribution;
- /**
- * Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
- *
- * <p>This is known in statistical genetics as the McDonald-Kreitman test.
- * The implementation handles both known and unknown distributions.
- *
- * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
- * but provided by one sample, or when the hypothesis under test is that the two
- * samples come from the same underlying distribution.
- *
- * @see <a href="https://en.wikipedia.org/wiki/G-test">G-test (Wikipedia)</a>
- * @since 1.1
- */
- public final class GTest {
- // Note:
- // The g-test statistic is a summation of terms with positive and negative sign
- // and thus the sum may exhibit cancellation. This class uses separate high precision
- // sums of the positive and negative terms which are then combined.
- // Total cancellation for a large number of terms will not impact
- // p-values of interest around critical alpha values as the Chi^2
- // distribution exhibits strong concentration around its mean (degrees of freedom, k).
- // The summation only need maintain enough bits in the final sum to distinguish
- // g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k,
- // with k = number of terms - 1.
- /** Default instance. */
- private static final GTest DEFAULT = new GTest(0);
- /** Degrees of freedom adjustment. */
- private final int degreesOfFreedomAdjustment;
- /**
- * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
- */
- private GTest(int degreesOfFreedomAdjustment) {
- this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
- }
- /**
- * Return an instance using the default options.
- *
- * <ul>
- * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
- * </ul>
- *
- * @return default instance
- */
- public static GTest withDefaults() {
- return DEFAULT;
- }
- /**
- * Return an instance with the configured degrees of freedom adjustment.
- *
- * <p>The default degrees of freedom for a sample of length {@code n} are
- * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
- * more parameters from the data in order to get the numbers for your null
- * hypothesis. For a distribution with {@code p} parameters where up to
- * {@code p} parameters have been estimated from the data the degrees of freedom
- * is in the range {@code [n - 1 - p, n - 1]}.
- *
- * @param v Value.
- * @return an instance
- * @throws IllegalArgumentException if the value is negative
- */
- public GTest withDegreesOfFreedomAdjustment(int v) {
- return new GTest(Arguments.checkNonNegative(v));
- }
- /**
- * Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to
- * a uniform expected value (each category is equally likely).
- *
- * <p>Note: This is a specialized version of a comparison of {@code observed}
- * with an {@code expected} array of uniform values. The result is faster than
- * calling {@link #statistic(double[], long[])} and the statistic is the same,
- * with an allowance for accumulated floating-point error due to the optimized
- * routine.
- *
- * @param observed Observed frequency counts.
- * @return G-test statistic
- * @throws IllegalArgumentException if the sample size is less than 2;
- * {@code observed} has negative entries; or all the observations are zero.
- * @see #test(long[])
- */
- public double statistic(long[] observed) {
- Arguments.checkValuesRequiredSize(observed.length, 2);
- Arguments.checkNonNegative(observed);
- final double e = LongMean.of(observed).getAsDouble();
- if (e == 0) {
- throw new InferenceException(InferenceException.NO_DATA);
- }
- // g = 2 * sum{o * ln(o/e)}
- // = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ]
- // The second form has more cancellation as the sums are larger.
- // Separate sum for positive and negative terms.
- final Sum sum = Sum.create();
- final Sum sum2 = Sum.create();
- for (final double o : observed) {
- if (o > e) {
- // Positive term
- sum.add(o * Math.log(o / e));
- } else if (o > 0) {
- // Negative term
- // Process non-zero counts to avoid 0 * -inf = NaN
- sum2.add(o * Math.log(o / e));
- }
- }
- return sum.add(sum2).getAsDouble() * 2;
- }
- /**
- * Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected}
- * frequency counts.
- *
- * <p><strong>Note:</strong>This implementation rescales the values
- * if necessary to ensure that the sum of the expected and observed counts
- * are equal.
- *
- * @param expected Expected frequency counts.
- * @param observed Observed frequency counts.
- * @return G-test statistic
- * @throws IllegalArgumentException if the sample size is less than 2; the array
- * sizes do not match; {@code expected} has entries that are not strictly
- * positive; {@code observed} has negative entries; or all the observations are zero.
- * @see #test(double[], long[])
- */
- public double statistic(double[] expected, long[] observed) {
- // g = 2 * sum{o * ln(o/e)}
- // The sum of o and e must be the same.
- final double ratio = StatisticUtils.computeRatio(expected, observed);
- // High precision sum to reduce cancellation.
- // Separate sum for positive and negative terms.
- final Sum sum = Sum.create();
- final Sum sum2 = Sum.create();
- for (int i = 0; i < observed.length; i++) {
- final long o = observed[i];
- // Process non-zero counts to avoid 0 * -inf = NaN
- if (o != 0) {
- final double term = o * Math.log(o / (ratio * expected[i]));
- if (term < 0) {
- sum2.add(term);
- } else {
- sum.add(term);
- }
- }
- }
- return sum.add(sum2).getAsDouble() * 2;
- }
- /**
- * Computes a G-test statistic associated with a G-test of
- * independence based on the input {@code counts} array, viewed as a two-way
- * table. The formula used to compute the test statistic is:
- *
- * <p>\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \]
- *
- * <p>and \( H \) is the <a
- * href="https://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
- * Shannon Entropy</a> of the random variable formed by viewing the elements of
- * the argument array as incidence counts:
- *
- * <p>\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \]
- *
- * @param counts 2-way table.
- * @return G-test statistic
- * @throws IllegalArgumentException if the number of rows or columns is less
- * than 2; the array is non-rectangular; the array has negative entries; or the
- * sum of a row or column is zero.
- * @see ChiSquareTest#test(long[][])
- */
- public double statistic(long[][] counts) {
- Arguments.checkCategoriesRequiredSize(counts.length, 2);
- Arguments.checkValuesRequiredSize(counts[0].length, 2);
- Arguments.checkRectangular(counts);
- Arguments.checkNonNegative(counts);
- final int ni = counts.length;
- final int nj = counts[0].length;
- // Compute row, column and total sums
- final double[] sumi = new double[ni];
- final double[] sumj = new double[nj];
- double n = 0;
- // We can sum data on the first pass. See below for computation details.
- final Sum sum = Sum.create();
- for (int i = 0; i < ni; i++) {
- for (int j = 0; j < nj; j++) {
- final long c = counts[i][j];
- sumi[i] += c;
- sumj[j] += c;
- if (c > 1) {
- sum.add(c * Math.log(c));
- }
- }
- checkNonZero(sumi[i], "Row", i);
- n += sumi[i];
- }
- for (int j = 0; j < nj; j++) {
- checkNonZero(sumj[j], "Column", j);
- }
- // This computes a modified form of the Shannon entropy H without requiring
- // normalisation of observations to probabilities and without negation,
- // i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ].
- // H = -sum (p * log(p))
- // H' = n * sum (p * log(p))
- // = n * sum (o/n * log(o/n))
- // = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
- // = sum(o * log(o)) - n log(n)
- // After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n)
- sum.addProduct(n, Math.log(n));
- // Negative terms
- final Sum sum2 = Sum.create();
- // All these counts are above zero so no check for zeros
- for (final double c : sumi) {
- sum2.add(c * -Math.log(c));
- }
- for (final double c : sumj) {
- sum2.add(c * -Math.log(c));
- }
- return sum.add(sum2).getAsDouble() * 2;
- }
- /**
- * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
- * counts conform to a uniform distribution (each category is equally likely).
- *
- * @param observed Observed frequency counts.
- * @return test result
- * @throws IllegalArgumentException if the sample size is less than 2;
- * {@code observed} has negative entries; or all the observations are zero
- * @see #statistic(long[])
- */
- public SignificanceResult test(long[] observed) {
- final int df = observed.length - 1;
- final double g = statistic(observed);
- final double p = computeP(g, df);
- return new BaseSignificanceResult(g, p);
- }
- /**
- * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
- * counts conform to the {@code expected} counts.
- *
- * <p>The test can be configured to apply an adjustment to the degrees of freedom
- * if the observed data has been used to create the expected counts.
- *
- * @param expected Expected frequency counts.
- * @param observed Observed frequency counts.
- * @return test result
- * @throws IllegalArgumentException if the sample size is less than 2; the array
- * sizes do not match; {@code expected} has entries that are not strictly
- * positive; {@code observed} has negative entries; all the observations are zero; or
- * the adjusted degrees of freedom are not strictly positive
- * @see #withDegreesOfFreedomAdjustment(int)
- * @see #statistic(double[], long[])
- */
- public SignificanceResult test(double[] expected, long[] observed) {
- final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
- final double g = statistic(expected, observed);
- final double p = computeP(g, df);
- return new BaseSignificanceResult(g, p);
- }
- /**
- * Perform a G-test of independence based on the input
- * {@code counts} array, viewed as a two-way table.
- *
- * @param counts 2-way table.
- * @return test result
- * @throws IllegalArgumentException if the number of rows or columns is less
- * than 2; the array is non-rectangular; the array has negative entries; or the
- * sum of a row or column is zero.
- * @see #statistic(long[][])
- */
- public SignificanceResult test(long[][] counts) {
- final double g = statistic(counts);
- final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
- final double p = computeP(g, df);
- return new BaseSignificanceResult(g, p);
- }
- /**
- * Compute the G-test p-value.
- *
- * @param g G-test statistic.
- * @param degreesOfFreedom Degrees of freedom.
- * @return p-value
- */
- private static double computeP(double g, double degreesOfFreedom) {
- return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g);
- }
- /**
- * Check the array value is non-zero.
- *
- * @param value Value
- * @param name Name of the array
- * @param index Index in the array
- * @throws IllegalArgumentException if the value is zero
- */
- private static void checkNonZero(double value, String name, int index) {
- if (value == 0) {
- throw new InferenceException(InferenceException.ZERO_AT, name, index);
- }
- }
- }