FisherExactTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.statistics.inference;

  18. import java.util.Objects;
  19. import org.apache.commons.statistics.distribution.HypergeometricDistribution;

  20. /**
  21.  * Implements Fisher's exact test.
  22.  *
  23.  * <p>Performs an exact test for the statistical significance of the association (contingency)
  24.  * between two kinds of categorical classification.
  25.  *
  26.  * <p>Fisher's test applies in the case that the row sums and column sums are fixed in advance
  27.  * and not random.
  28.  *
  29.  * @see <a href="https://en.wikipedia.org/wiki/Fisher%27s_exact_test">Fisher&#39;s exact test (Wikipedia)</a>
  30.  * @since 1.1
  31.  */
  32. public final class FisherExactTest {
  33.     /** Default instance. */
  34.     private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED);

  35.     /** Alternative hypothesis. */
  36.     private final AlternativeHypothesis alternative;

  37.     /**
  38.      * @param alternative Alternative hypothesis.
  39.      */
  40.     private FisherExactTest(AlternativeHypothesis alternative) {
  41.         this.alternative = alternative;
  42.     }

  43.     /**
  44.      * Return an instance using the default options.
  45.      *
  46.      * <ul>
  47.      * <li>{@link AlternativeHypothesis#TWO_SIDED}
  48.      * </ul>
  49.      *
  50.      * @return default instance
  51.      */
  52.     public static FisherExactTest withDefaults() {
  53.         return DEFAULT;
  54.     }

  55.     /**
  56.      * Return an instance with the configured alternative hypothesis.
  57.      *
  58.      * @param v Value.
  59.      * @return an instance
  60.      */
  61.     public FisherExactTest with(AlternativeHypothesis v) {
  62.         return new FisherExactTest(Objects.requireNonNull(v));
  63.     }

  64.     /**
  65.      * Compute the prior odds ratio for the 2-by-2 contingency table. This is the
  66.      * "sample" or "unconditional" maximum likelihood estimate. For a table of:
  67.      *
  68.      * <p>\[ \left[ {\begin{array}{cc}
  69.      *         a &amp; b \\
  70.      *         c &amp; d \\
  71.      *       \end{array} } \right] \]
  72.      *
  73.      * <p>this is:
  74.      *
  75.      * <p>\[ r = \frac{a d}{b c} \]
  76.      *
  77.      * <p>Special cases:
  78.      * <ul>
  79.      * <li>If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}.
  80.      * <li>If a row or column sum is zero, the value is {@link Double#NaN NaN}.
  81.      * </ul>
  82.      *
  83.      * <p>Note: This statistic is equal to the statistic computed by the SciPy function
  84.      * {@code scipy.stats.fisher_exact}. It is different to the conditional maximum
  85.      * likelihood estimate computed by R function {@code fisher.test}.
  86.      *
  87.      * @param table 2-by-2 contingency table.
  88.      * @return odds ratio
  89.      * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
  90.      * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
  91.      * @see #test(int[][])
  92.      */
  93.     public double statistic(int[][] table) {
  94.         Arguments.checkTable(table);
  95.         final double a = table[0][0];
  96.         final double b = table[0][1];
  97.         final double c = table[1][0];
  98.         final double d = table[1][1];
  99.         return (a * d) / (b * c);
  100.     }

  101.     /**
  102.      * Performs Fisher's exact test on the 2-by-2 contingency table.
  103.      *
  104.      * <p>The test statistic is equal to the prior odds ratio. This is the
  105.      * "sample" or "unconditional" maximum likelihood estimate.
  106.      *
  107.      * <p>The test is defined by the {@link AlternativeHypothesis}.
  108.      *
  109.      * <p>For a table of [[a, b], [c, d]] the possible values of any table are conditioned
  110.      * with the same marginals (row and column totals). In this case the possible values {@code x}
  111.      * of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}.
  112.      * <ul>
  113.      * <li>'two-sided': the odds ratio of the underlying population is not one; the p-value
  114.      * is the probability that a random table has probability equal to or less than the input table.
  115.      * <li>'greater': the odds ratio of the underlying population is greater than one; the p-value
  116.      * is the probability that a random table has {@code x >= a}.
  117.      * <li>'less': the odds ratio of the underlying population is less than one; the p-value
  118.      * is the probability that a random table has {@code x <= a}.
  119.      * </ul>
  120.      *
  121.      * @param table 2-by-2 contingency table.
  122.      * @return test result
  123.      * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
  124.      * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
  125.      * @see #with(AlternativeHypothesis)
  126.      * @see #statistic(int[][])
  127.      */
  128.     public SignificanceResult test(int[][] table) {
  129.         Arguments.checkTable(table);
  130.         final int a = table[0][0];
  131.         final int b = table[0][1];
  132.         final int c = table[1][0];
  133.         final int d = table[1][1];

  134.         // Odd-ratio.
  135.         final double statistic = ((double) a * d) / ((double) b * c);

  136.         final int nn = a + b + c + d;
  137.         final int k = a + b;
  138.         final int n = a + c;

  139.         // Note: The distribution validates the population size is > 0
  140.         final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n);
  141.         final double p;
  142.         if (alternative == AlternativeHypothesis.GREATER_THAN) {
  143.             p = distribution.survivalProbability(a - 1);
  144.         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
  145.             p = distribution.cumulativeProbability(a);
  146.         } else {
  147.             p = twoSidedTest(a, distribution);
  148.         }
  149.         return new BaseSignificanceResult(statistic, p);
  150.     }

  151.     /**
  152.      * Returns the <i>observed significance level</i>, or p-value, associated with a
  153.      * two-sided test about the observed value.
  154.      *
  155.      * @param k Observed value.
  156.      * @param distribution Hypergeometric distribution.
  157.      * @return p-value
  158.      */
  159.     private static double twoSidedTest(int k, HypergeometricDistribution distribution) {
  160.         // Find all i where Pr(X = i) <= Pr(X = k) and sum them.
  161.         // Exploit the known unimodal distribution to increase the
  162.         // search speed. Note the search depends only on magnitude differences.
  163.         // The current HypergeometricDistribution is faster using log probability
  164.         // as it omits a call to Math.exp.

  165.         // Use the mode as the point of largest probability.
  166.         // The lower or upper mode is important for the search below.
  167.         final int nn = distribution.getPopulationSize();
  168.         final int kk = distribution.getNumberOfSuccesses();
  169.         final int n = distribution.getSampleSize();
  170.         final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0);
  171.         final int m1 = (int) Math.ceil(v) - 1;
  172.         final int m2 = (int) Math.floor(v);
  173.         if (k < m1) {
  174.             final double pk = distribution.logProbability(k);
  175.             // Lower half = cdf(k)
  176.             // Find upper half. As k < lower mode i should never
  177.             // reach the lower mode based on the probability alone.
  178.             // Bracket with the upper mode.
  179.             final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk,
  180.                 distribution::logProbability);
  181.             return distribution.cumulativeProbability(k) +
  182.                    distribution.survivalProbability(i - 1);
  183.         } else if (k > m2) {
  184.             final double pk = distribution.logProbability(k);
  185.             // Upper half = sf(k - 1)
  186.             // Find lower half. As k > upper mode i should never
  187.             // reach the upper mode based on the probability alone.
  188.             // Bracket with the lower mode.
  189.             final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk,
  190.                 distribution::logProbability);
  191.             return distribution.cumulativeProbability(i) +
  192.                    distribution.survivalProbability(k - 1);
  193.         }
  194.         // k == mode
  195.         // Edge case where the sum of probabilities will be either
  196.         // 1 or 1 - Pr(X = mode) where mode != k
  197.         final double pk = distribution.probability(k);
  198.         final double pm = distribution.probability(k == m1 ? m2 : m1);
  199.         return pm > pk ? 1 - pm : 1;
  200.     }
  201. }