001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.statistics.distribution;
019
020/**
021 * Implementation of the hypergeometric distribution.
022 *
023 * <p>The probability mass function of \( X \) is:
024 *
025 * <p>\[ f(k; N, K, n) = \frac{\binom{K}{k} \binom{N - K}{n-k}}{\binom{N}{n}} \]
026 *
027 * <p>for \( N \in \{0, 1, 2, \dots\} \) the population size,
028 * \( K \in \{0, 1, \dots, N\} \) the number of success states,
029 * \( n \in \{0, 1, \dots, N\} \) the number of samples,
030 * \( k \in \{\max(0, n+K-N), \dots, \min(n, K)\} \) the number of successes, and
031 *
032 * <p>\[ \binom{a}{b} = \frac{a!}{b! \, (a-b)!} \]
033 *
034 * <p>is the binomial coefficient.
035 *
036 * @see <a href="https://en.wikipedia.org/wiki/Hypergeometric_distribution">Hypergeometric distribution (Wikipedia)</a>
037 * @see <a href="https://mathworld.wolfram.com/HypergeometricDistribution.html">Hypergeometric distribution (MathWorld)</a>
038 */
039public final class HypergeometricDistribution extends AbstractDiscreteDistribution {
040    /** The number of successes in the population. */
041    private final int numberOfSuccesses;
042    /** The population size. */
043    private final int populationSize;
044    /** The sample size. */
045    private final int sampleSize;
046    /** The lower bound of the support (inclusive). */
047    private final int lowerBound;
048    /** The upper bound of the support (inclusive). */
049    private final int upperBound;
050    /** Binomial probability of success (sampleSize / populationSize). */
051    private final double p;
052    /** Binomial probability of failure ((populationSize - sampleSize) / populationSize). */
053    private final double q;
054
055    /**
056     * @param populationSize Population size.
057     * @param numberOfSuccesses Number of successes in the population.
058     * @param sampleSize Sample size.
059     */
060    private HypergeometricDistribution(int populationSize,
061                                       int numberOfSuccesses,
062                                       int sampleSize) {
063        this.numberOfSuccesses = numberOfSuccesses;
064        this.populationSize = populationSize;
065        this.sampleSize = sampleSize;
066        lowerBound = getLowerDomain(populationSize, numberOfSuccesses, sampleSize);
067        upperBound = getUpperDomain(numberOfSuccesses, sampleSize);
068        p = (double) sampleSize / (double) populationSize;
069        q = (double) (populationSize - sampleSize) / (double) populationSize;
070    }
071
072    /**
073     * Creates a hypergeometric distribution.
074     *
075     * @param populationSize Population size.
076     * @param numberOfSuccesses Number of successes in the population.
077     * @param sampleSize Sample size.
078     * @return the distribution
079     * @throws IllegalArgumentException if {@code numberOfSuccesses < 0}, or
080     * {@code populationSize <= 0} or {@code numberOfSuccesses > populationSize}, or
081     * {@code sampleSize > populationSize}.
082     */
083    public static HypergeometricDistribution of(int populationSize,
084                                                int numberOfSuccesses,
085                                                int sampleSize) {
086        if (populationSize <= 0) {
087            throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE,
088                                            populationSize);
089        }
090        if (numberOfSuccesses < 0) {
091            throw new DistributionException(DistributionException.NEGATIVE,
092                                            numberOfSuccesses);
093        }
094        if (sampleSize < 0) {
095            throw new DistributionException(DistributionException.NEGATIVE,
096                                            sampleSize);
097        }
098
099        if (numberOfSuccesses > populationSize) {
100            throw new DistributionException(DistributionException.TOO_LARGE,
101                                            numberOfSuccesses, populationSize);
102        }
103        if (sampleSize > populationSize) {
104            throw new DistributionException(DistributionException.TOO_LARGE,
105                                            sampleSize, populationSize);
106        }
107        return new HypergeometricDistribution(populationSize, numberOfSuccesses, sampleSize);
108    }
109
110    /**
111     * Return the lowest domain value for the given hypergeometric distribution
112     * parameters.
113     *
114     * @param nn Population size.
115     * @param k Number of successes in the population.
116     * @param n Sample size.
117     * @return the lowest domain value of the hypergeometric distribution.
118     */
119    private static int getLowerDomain(int nn, int k, int n) {
120        // Avoid overflow given N > n:
121        // n + K - N == K - (N - n)
122        return Math.max(0, k - (nn - n));
123    }
124
125    /**
126     * Return the highest domain value for the given hypergeometric distribution
127     * parameters.
128     *
129     * @param k Number of successes in the population.
130     * @param n Sample size.
131     * @return the highest domain value of the hypergeometric distribution.
132     */
133    private static int getUpperDomain(int k, int n) {
134        return Math.min(n, k);
135    }
136
137    /**
138     * Gets the population size parameter of this distribution.
139     *
140     * @return the population size.
141     */
142    public int getPopulationSize() {
143        return populationSize;
144    }
145
146    /**
147     * Gets the number of successes parameter of this distribution.
148     *
149     * @return the number of successes.
150     */
151    public int getNumberOfSuccesses() {
152        return numberOfSuccesses;
153    }
154
155    /**
156     * Gets the sample size parameter of this distribution.
157     *
158     * @return the sample size.
159     */
160    public int getSampleSize() {
161        return sampleSize;
162    }
163
164    /** {@inheritDoc} */
165    @Override
166    public double probability(int x) {
167        return Math.exp(logProbability(x));
168    }
169
170    /** {@inheritDoc} */
171    @Override
172    public double logProbability(int x) {
173        if (x < lowerBound || x > upperBound) {
174            return Double.NEGATIVE_INFINITY;
175        }
176        return computeLogProbability(x);
177    }
178
179    /**
180     * Compute the log probability.
181     *
182     * @param x Value.
183     * @return log(P(X = x))
184     */
185    private double computeLogProbability(int x) {
186        final double p1 =
187                SaddlePointExpansionUtils.logBinomialProbability(x, numberOfSuccesses, p, q);
188        final double p2 =
189                SaddlePointExpansionUtils.logBinomialProbability(sampleSize - x,
190                        populationSize - numberOfSuccesses, p, q);
191        final double p3 =
192                SaddlePointExpansionUtils.logBinomialProbability(sampleSize, populationSize, p, q);
193        return p1 + p2 - p3;
194    }
195
196    /** {@inheritDoc} */
197    @Override
198    public double cumulativeProbability(int x) {
199        if (x < lowerBound) {
200            return 0.0;
201        } else if (x >= upperBound) {
202            return 1.0;
203        }
204        return innerCumulativeProbability(lowerBound, x);
205    }
206
207    /** {@inheritDoc} */
208    @Override
209    public double survivalProbability(int x) {
210        if (x < lowerBound) {
211            return 1.0;
212        } else if (x >= upperBound) {
213            return 0.0;
214        }
215        return innerCumulativeProbability(upperBound, x + 1);
216    }
217
218    /**
219     * For this distribution, {@code X}, this method returns
220     * {@code P(x0 <= X <= x1)}.
221     * This probability is computed by summing the point probabilities for the
222     * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined
223     * using a comparison of the input bounds.
224     * This should be called by using {@code x0} as the domain limit and {@code x1}
225     * as the internal value. This will result in an initial sum of increasing larger magnitudes.
226     *
227     * @param x0 Inclusive domain bound.
228     * @param x1 Inclusive internal bound.
229     * @return {@code P(x0 <= X <= x1)}.
230     */
231    private double innerCumulativeProbability(int x0, int x1) {
232        // Assume the range is within the domain.
233        // Reuse the computation for probability(x) but avoid checking the domain for each call.
234        int x = x0;
235        double ret = Math.exp(computeLogProbability(x));
236        if (x0 < x1) {
237            while (x != x1) {
238                x++;
239                ret += Math.exp(computeLogProbability(x));
240            }
241        } else {
242            while (x != x1) {
243                x--;
244                ret += Math.exp(computeLogProbability(x));
245            }
246        }
247        return ret;
248    }
249
250    /**
251     * {@inheritDoc}
252     *
253     * <p>For population size \( N \), number of successes \( K \), and sample
254     * size \( n \), the mean is:
255     *
256     * <p>\[ n \frac{K}{N} \]
257     */
258    @Override
259    public double getMean() {
260        return getSampleSize() * (getNumberOfSuccesses() / (double) getPopulationSize());
261    }
262
263    /**
264     * {@inheritDoc}
265     *
266     * <p>For population size \( N \), number of successes \( K \), and sample
267     * size \( n \), the variance is:
268     *
269     * <p>\[ n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1} \]
270     */
271    @Override
272    public double getVariance() {
273        final double N = getPopulationSize();
274        final double K = getNumberOfSuccesses();
275        final double n = getSampleSize();
276        return (n * K * (N - K) * (N - n)) / (N * N * (N - 1));
277    }
278
279    /**
280     * {@inheritDoc}
281     *
282     * <p>For population size \( N \), number of successes \( K \), and sample
283     * size \( n \), the lower bound of the support is \( \max \{ 0, n + K - N \} \).
284     *
285     * @return lower bound of the support
286     */
287    @Override
288    public int getSupportLowerBound() {
289        return lowerBound;
290    }
291
292    /**
293     * {@inheritDoc}
294     *
295     * <p>For number of successes \( K \), and sample
296     * size \( n \), the upper bound of the support is \( \min \{ n, K \} \).
297     *
298     * @return upper bound of the support
299     */
300    @Override
301    public int getSupportUpperBound() {
302        return upperBound;
303    }
304}