Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.statistics.inference;
018
019import java.util.Objects;
020import org.apache.commons.statistics.distribution.HypergeometricDistribution;
021
022/**
023 * Implements Fisher's exact test.
024 *
025 * <p>Performs an exact test for the statistical significance of the association (contingency)
026 * between two kinds of categorical classification.
027 *
028 * <p>Fisher's test applies in the case that the row sums and column sums are fixed in advance
029 * and not random.
030 *
031 * @see <a href="https://en.wikipedia.org/wiki/Fisher%27s_exact_test">Fisher&#39;s exact test (Wikipedia)</a>
032 * @since 1.1
033 */
034public final class FisherExactTest {
035    /** Default instance. */
036    private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED);
037
038    /** Alternative hypothesis. */
039    private final AlternativeHypothesis alternative;
040
041    /**
042     * @param alternative Alternative hypothesis.
043     */
044    private FisherExactTest(AlternativeHypothesis alternative) {
045        this.alternative = alternative;
046    }
047
048    /**
049     * Return an instance using the default options.
050     *
051     * <ul>
052     * <li>{@link AlternativeHypothesis#TWO_SIDED}
053     * </ul>
054     *
055     * @return default instance
056     */
057    public static FisherExactTest withDefaults() {
058        return DEFAULT;
059    }
060
061    /**
062     * Return an instance with the configured alternative hypothesis.
063     *
064     * @param v Value.
065     * @return an instance
066     */
067    public FisherExactTest with(AlternativeHypothesis v) {
068        return new FisherExactTest(Objects.requireNonNull(v));
069    }
070
071    /**
072     * Compute the prior odds ratio for the 2-by-2 contingency table. This is the
073     * "sample" or "unconditional" maximum likelihood estimate. For a table of:
074     *
075     * <p>\[ \left[ {\begin{array}{cc}
076     *         a &amp; b \\
077     *         c &amp; d \\
078     *       \end{array} } \right] \]
079     *
080     * <p>this is:
081     *
082     * <p>\[ r = \frac{a d}{b c} \]
083     *
084     * <p>Special cases:
085     * <ul>
086     * <li>If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}.
087     * <li>If a row or column sum is zero, the value is {@link Double#NaN NaN}.
088     * </ul>
089     *
090     * <p>Note: This statistic is equal to the statistic computed by the SciPy function
091     * {@code scipy.stats.fisher_exact}. It is different to the conditional maximum
092     * likelihood estimate computed by R function {@code fisher.test}.
093     *
094     * @param table 2-by-2 contingency table.
095     * @return odds ratio
096     * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
097     * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
098     * @see #test(int[][])
099     */
100    public double statistic(int[][] table) {
101        Arguments.checkTable(table);
102        final double a = table[0][0];
103        final double b = table[0][1];
104        final double c = table[1][0];
105        final double d = table[1][1];
106        return (a * d) / (b * c);
107    }
108
109    /**
110     * Performs Fisher's exact test on the 2-by-2 contingency table.
111     *
112     * <p>The test statistic is equal to the prior odds ratio. This is the
113     * "sample" or "unconditional" maximum likelihood estimate.
114     *
115     * <p>The test is defined by the {@link AlternativeHypothesis}.
116     *
117     * <p>For a table of [[a, b], [c, d]] the possible values of any table are conditioned
118     * with the same marginals (row and column totals). In this case the possible values {@code x}
119     * of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}.
120     * <ul>
121     * <li>'two-sided': the odds ratio of the underlying population is not one; the p-value
122     * is the probability that a random table has probability equal to or less than the input table.
123     * <li>'greater': the odds ratio of the underlying population is greater than one; the p-value
124     * is the probability that a random table has {@code x >= a}.
125     * <li>'less': the odds ratio of the underlying population is less than one; the p-value
126     * is the probability that a random table has {@code x <= a}.
127     * </ul>
128     *
129     * @param table 2-by-2 contingency table.
130     * @return test result
131     * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
132     * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
133     * @see #with(AlternativeHypothesis)
134     * @see #statistic(int[][])
135     */
136    public SignificanceResult test(int[][] table) {
137        Arguments.checkTable(table);
138        final int a = table[0][0];
139        final int b = table[0][1];
140        final int c = table[1][0];
141        final int d = table[1][1];
142
143        // Odd-ratio.
144        final double statistic = ((double) a * d) / ((double) b * c);
145
146        final int nn = a + b + c + d;
147        final int k = a + b;
148        final int n = a + c;
149
150        // Note: The distribution validates the population size is > 0
151        final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n);
152        final double p;
153        if (alternative == AlternativeHypothesis.GREATER_THAN) {
154            p = distribution.survivalProbability(a - 1);
155        } else if (alternative == AlternativeHypothesis.LESS_THAN) {
156            p = distribution.cumulativeProbability(a);
157        } else {
158            p = twoSidedTest(a, distribution);
159        }
160        return new BaseSignificanceResult(statistic, p);
161    }
162
163    /**
164     * Returns the <i>observed significance level</i>, or p-value, associated with a
165     * two-sided test about the observed value.
166     *
167     * @param k Observed value.
168     * @param distribution Hypergeometric distribution.
169     * @return p-value
170     */
171    private static double twoSidedTest(int k, HypergeometricDistribution distribution) {
172        // Find all i where Pr(X = i) <= Pr(X = k) and sum them.
173        // Exploit the known unimodal distribution to increase the
174        // search speed. Note the search depends only on magnitude differences.
175        // The current HypergeometricDistribution is faster using log probability
176        // as it omits a call to Math.exp.
177
178        // Use the mode as the point of largest probability.
179        // The lower or upper mode is important for the search below.
180        final int nn = distribution.getPopulationSize();
181        final int kk = distribution.getNumberOfSuccesses();
182        final int n = distribution.getSampleSize();
183        final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0);
184        final int m1 = (int) Math.ceil(v) - 1;
185        final int m2 = (int) Math.floor(v);
186        if (k < m1) {
187            final double pk = distribution.logProbability(k);
188            // Lower half = cdf(k)
189            // Find upper half. As k < lower mode i should never
190            // reach the lower mode based on the probability alone.
191            // Bracket with the upper mode.
192            final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk,
193                distribution::logProbability);
194            return distribution.cumulativeProbability(k) +
195                   distribution.survivalProbability(i - 1);
196        } else if (k > m2) {
197            final double pk = distribution.logProbability(k);
198            // Upper half = sf(k - 1)
199            // Find lower half. As k > upper mode i should never
200            // reach the upper mode based on the probability alone.
201            // Bracket with the lower mode.
202            final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk,
203                distribution::logProbability);
204            return distribution.cumulativeProbability(i) +
205                   distribution.survivalProbability(k - 1);
206        }
207        // k == mode
208        // Edge case where the sum of probabilities will be either
209        // 1 or 1 - Pr(X = mode) where mode != k
210        final double pk = distribution.probability(k);
211        final double pm = distribution.probability(k == m1 ? m2 : m1);
212        return pm > pk ? 1 - pm : 1;
213    }
214}