001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.math3.distribution;
019
020import org.apache.commons.math3.exception.NotPositiveException;
021import org.apache.commons.math3.exception.NotStrictlyPositiveException;
022import org.apache.commons.math3.exception.NumberIsTooLargeException;
023import org.apache.commons.math3.exception.util.LocalizedFormats;
024import org.apache.commons.math3.util.FastMath;
025import org.apache.commons.math3.random.RandomGenerator;
026import org.apache.commons.math3.random.Well19937c;
027
028/**
029 * Implementation of the hypergeometric distribution.
030 *
031 * @see <a href="http://en.wikipedia.org/wiki/Hypergeometric_distribution">Hypergeometric distribution (Wikipedia)</a>
032 * @see <a href="http://mathworld.wolfram.com/HypergeometricDistribution.html">Hypergeometric distribution (MathWorld)</a>
033 */
034public class HypergeometricDistribution extends AbstractIntegerDistribution {
035    /** Serializable version identifier. */
036    private static final long serialVersionUID = -436928820673516179L;
037    /** The number of successes in the population. */
038    private final int numberOfSuccesses;
039    /** The population size. */
040    private final int populationSize;
041    /** The sample size. */
042    private final int sampleSize;
043    /** Cached numerical variance */
044    private double numericalVariance = Double.NaN;
045    /** Whether or not the numerical variance has been calculated */
046    private boolean numericalVarianceIsCalculated = false;
047
048    /**
049     * Construct a new hypergeometric distribution with the specified population
050     * size, number of successes in the population, and sample size.
051     *
052     * @param populationSize Population size.
053     * @param numberOfSuccesses Number of successes in the population.
054     * @param sampleSize Sample size.
055     * @throws NotPositiveException if {@code numberOfSuccesses < 0}.
056     * @throws NotStrictlyPositiveException if {@code populationSize <= 0}.
057     * @throws NumberIsTooLargeException if {@code numberOfSuccesses > populationSize},
058     * or {@code sampleSize > populationSize}.
059     */
060    public HypergeometricDistribution(int populationSize, int numberOfSuccesses, int sampleSize)
061    throws NotPositiveException, NotStrictlyPositiveException, NumberIsTooLargeException {
062        this(new Well19937c(), populationSize, numberOfSuccesses, sampleSize);
063    }
064
065    /**
066     * Creates a new hypergeometric distribution.
067     *
068     * @param rng Random number generator.
069     * @param populationSize Population size.
070     * @param numberOfSuccesses Number of successes in the population.
071     * @param sampleSize Sample size.
072     * @throws NotPositiveException if {@code numberOfSuccesses < 0}.
073     * @throws NotStrictlyPositiveException if {@code populationSize <= 0}.
074     * @throws NumberIsTooLargeException if {@code numberOfSuccesses > populationSize},
075     * or {@code sampleSize > populationSize}.
076     * @since 3.1
077     */
078    public HypergeometricDistribution(RandomGenerator rng,
079                                      int populationSize,
080                                      int numberOfSuccesses,
081                                      int sampleSize)
082    throws NotPositiveException, NotStrictlyPositiveException, NumberIsTooLargeException {
083        super(rng);
084
085        if (populationSize <= 0) {
086            throw new NotStrictlyPositiveException(LocalizedFormats.POPULATION_SIZE,
087                                                   populationSize);
088        }
089        if (numberOfSuccesses < 0) {
090            throw new NotPositiveException(LocalizedFormats.NUMBER_OF_SUCCESSES,
091                                           numberOfSuccesses);
092        }
093        if (sampleSize < 0) {
094            throw new NotPositiveException(LocalizedFormats.NUMBER_OF_SAMPLES,
095                                           sampleSize);
096        }
097
098        if (numberOfSuccesses > populationSize) {
099            throw new NumberIsTooLargeException(LocalizedFormats.NUMBER_OF_SUCCESS_LARGER_THAN_POPULATION_SIZE,
100                                                numberOfSuccesses, populationSize, true);
101        }
102        if (sampleSize > populationSize) {
103            throw new NumberIsTooLargeException(LocalizedFormats.SAMPLE_SIZE_LARGER_THAN_POPULATION_SIZE,
104                                                sampleSize, populationSize, true);
105        }
106
107        this.numberOfSuccesses = numberOfSuccesses;
108        this.populationSize = populationSize;
109        this.sampleSize = sampleSize;
110    }
111
112    /** {@inheritDoc} */
113    public double cumulativeProbability(int x) {
114        double ret;
115
116        int[] domain = getDomain(populationSize, numberOfSuccesses, sampleSize);
117        if (x < domain[0]) {
118            ret = 0.0;
119        } else if (x >= domain[1]) {
120            ret = 1.0;
121        } else {
122            ret = innerCumulativeProbability(domain[0], x, 1);
123        }
124
125        return ret;
126    }
127
128    /**
129     * Return the domain for the given hypergeometric distribution parameters.
130     *
131     * @param n Population size.
132     * @param m Number of successes in the population.
133     * @param k Sample size.
134     * @return a two element array containing the lower and upper bounds of the
135     * hypergeometric distribution.
136     */
137    private int[] getDomain(int n, int m, int k) {
138        return new int[] { getLowerDomain(n, m, k), getUpperDomain(m, k) };
139    }
140
141    /**
142     * Return the lowest domain value for the given hypergeometric distribution
143     * parameters.
144     *
145     * @param n Population size.
146     * @param m Number of successes in the population.
147     * @param k Sample size.
148     * @return the lowest domain value of the hypergeometric distribution.
149     */
150    private int getLowerDomain(int n, int m, int k) {
151        return FastMath.max(0, m - (n - k));
152    }
153
154    /**
155     * Access the number of successes.
156     *
157     * @return the number of successes.
158     */
159    public int getNumberOfSuccesses() {
160        return numberOfSuccesses;
161    }
162
163    /**
164     * Access the population size.
165     *
166     * @return the population size.
167     */
168    public int getPopulationSize() {
169        return populationSize;
170    }
171
172    /**
173     * Access the sample size.
174     *
175     * @return the sample size.
176     */
177    public int getSampleSize() {
178        return sampleSize;
179    }
180
181    /**
182     * Return the highest domain value for the given hypergeometric distribution
183     * parameters.
184     *
185     * @param m Number of successes in the population.
186     * @param k Sample size.
187     * @return the highest domain value of the hypergeometric distribution.
188     */
189    private int getUpperDomain(int m, int k) {
190        return FastMath.min(k, m);
191    }
192
193    /** {@inheritDoc} */
194    public double probability(int x) {
195        final double logProbability = logProbability(x);
196        return logProbability == Double.NEGATIVE_INFINITY ? 0 : FastMath.exp(logProbability);
197    }
198
199    /** {@inheritDoc} */
200    @Override
201    public double logProbability(int x) {
202        double ret;
203
204        int[] domain = getDomain(populationSize, numberOfSuccesses, sampleSize);
205        if (x < domain[0] || x > domain[1]) {
206            ret = Double.NEGATIVE_INFINITY;
207        } else {
208            double p = (double) sampleSize / (double) populationSize;
209            double q = (double) (populationSize - sampleSize) / (double) populationSize;
210            double p1 = SaddlePointExpansion.logBinomialProbability(x,
211                    numberOfSuccesses, p, q);
212            double p2 =
213                    SaddlePointExpansion.logBinomialProbability(sampleSize - x,
214                            populationSize - numberOfSuccesses, p, q);
215            double p3 =
216                    SaddlePointExpansion.logBinomialProbability(sampleSize, populationSize, p, q);
217            ret = p1 + p2 - p3;
218        }
219
220        return ret;
221    }
222
223    /**
224     * For this distribution, {@code X}, this method returns {@code P(X >= x)}.
225     *
226     * @param x Value at which the CDF is evaluated.
227     * @return the upper tail CDF for this distribution.
228     * @since 1.1
229     */
230    public double upperCumulativeProbability(int x) {
231        double ret;
232
233        final int[] domain = getDomain(populationSize, numberOfSuccesses, sampleSize);
234        if (x <= domain[0]) {
235            ret = 1.0;
236        } else if (x > domain[1]) {
237            ret = 0.0;
238        } else {
239            ret = innerCumulativeProbability(domain[1], x, -1);
240        }
241
242        return ret;
243    }
244
245    /**
246     * For this distribution, {@code X}, this method returns
247     * {@code P(x0 <= X <= x1)}.
248     * This probability is computed by summing the point probabilities for the
249     * values {@code x0, x0 + 1, x0 + 2, ..., x1}, in the order directed by
250     * {@code dx}.
251     *
252     * @param x0 Inclusive lower bound.
253     * @param x1 Inclusive upper bound.
254     * @param dx Direction of summation (1 indicates summing from x0 to x1, and
255     * 0 indicates summing from x1 to x0).
256     * @return {@code P(x0 <= X <= x1)}.
257     */
258    private double innerCumulativeProbability(int x0, int x1, int dx) {
259        double ret = probability(x0);
260        while (x0 != x1) {
261            x0 += dx;
262            ret += probability(x0);
263        }
264        return ret;
265    }
266
267    /**
268     * {@inheritDoc}
269     *
270     * For population size {@code N}, number of successes {@code m}, and sample
271     * size {@code n}, the mean is {@code n * m / N}.
272     */
273    public double getNumericalMean() {
274        return getSampleSize() * (getNumberOfSuccesses() / (double) getPopulationSize());
275    }
276
277    /**
278     * {@inheritDoc}
279     *
280     * For population size {@code N}, number of successes {@code m}, and sample
281     * size {@code n}, the variance is
282     * {@code [n * m * (N - n) * (N - m)] / [N^2 * (N - 1)]}.
283     */
284    public double getNumericalVariance() {
285        if (!numericalVarianceIsCalculated) {
286            numericalVariance = calculateNumericalVariance();
287            numericalVarianceIsCalculated = true;
288        }
289        return numericalVariance;
290    }
291
292    /**
293     * Used by {@link #getNumericalVariance()}.
294     *
295     * @return the variance of this distribution
296     */
297    protected double calculateNumericalVariance() {
298        final double N = getPopulationSize();
299        final double m = getNumberOfSuccesses();
300        final double n = getSampleSize();
301        return (n * m * (N - n) * (N - m)) / (N * N * (N - 1));
302    }
303
304    /**
305     * {@inheritDoc}
306     *
307     * For population size {@code N}, number of successes {@code m}, and sample
308     * size {@code n}, the lower bound of the support is
309     * {@code max(0, n + m - N)}.
310     *
311     * @return lower bound of the support
312     */
313    public int getSupportLowerBound() {
314        return FastMath.max(0,
315                            getSampleSize() + getNumberOfSuccesses() - getPopulationSize());
316    }
317
318    /**
319     * {@inheritDoc}
320     *
321     * For number of successes {@code m} and sample size {@code n}, the upper
322     * bound of the support is {@code min(m, n)}.
323     *
324     * @return upper bound of the support
325     */
326    public int getSupportUpperBound() {
327        return FastMath.min(getNumberOfSuccesses(), getSampleSize());
328    }
329
330    /**
331     * {@inheritDoc}
332     *
333     * The support of this distribution is connected.
334     *
335     * @return {@code true}
336     */
337    public boolean isSupportConnected() {
338        return true;
339    }
340}