001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.statistics.distribution; 019 020/** 021 * Implementation of the hypergeometric distribution. 022 * 023 * <p>The probability mass function of \( X \) is: 024 * 025 * <p>\[ f(k; N, K, n) = \frac{\binom{K}{k} \binom{N - K}{n-k}}{\binom{N}{n}} \] 026 * 027 * <p>for \( N \in \{0, 1, 2, \dots\} \) the population size, 028 * \( K \in \{0, 1, \dots, N\} \) the number of success states, 029 * \( n \in \{0, 1, \dots, N\} \) the number of samples, 030 * \( k \in \{\max(0, n+K-N), \dots, \min(n, K)\} \) the number of successes, and 031 * 032 * <p>\[ \binom{a}{b} = \frac{a!}{b! \, (a-b)!} \] 033 * 034 * <p>is the binomial coefficient. 035 * 036 * @see <a href="https://en.wikipedia.org/wiki/Hypergeometric_distribution">Hypergeometric distribution (Wikipedia)</a> 037 * @see <a href="https://mathworld.wolfram.com/HypergeometricDistribution.html">Hypergeometric distribution (MathWorld)</a> 038 */ 039public final class HypergeometricDistribution extends AbstractDiscreteDistribution { 040 /** The number of successes in the population. */ 041 private final int numberOfSuccesses; 042 /** The population size. */ 043 private final int populationSize; 044 /** The sample size. */ 045 private final int sampleSize; 046 /** The lower bound of the support (inclusive). */ 047 private final int lowerBound; 048 /** The upper bound of the support (inclusive). */ 049 private final int upperBound; 050 /** Binomial probability of success (sampleSize / populationSize). */ 051 private final double p; 052 /** Binomial probability of failure ((populationSize - sampleSize) / populationSize). */ 053 private final double q; 054 055 /** 056 * @param populationSize Population size. 057 * @param numberOfSuccesses Number of successes in the population. 058 * @param sampleSize Sample size. 059 */ 060 private HypergeometricDistribution(int populationSize, 061 int numberOfSuccesses, 062 int sampleSize) { 063 this.numberOfSuccesses = numberOfSuccesses; 064 this.populationSize = populationSize; 065 this.sampleSize = sampleSize; 066 lowerBound = getLowerDomain(populationSize, numberOfSuccesses, sampleSize); 067 upperBound = getUpperDomain(numberOfSuccesses, sampleSize); 068 p = (double) sampleSize / (double) populationSize; 069 q = (double) (populationSize - sampleSize) / (double) populationSize; 070 } 071 072 /** 073 * Creates a hypergeometric distribution. 074 * 075 * @param populationSize Population size. 076 * @param numberOfSuccesses Number of successes in the population. 077 * @param sampleSize Sample size. 078 * @return the distribution 079 * @throws IllegalArgumentException if {@code numberOfSuccesses < 0}, or 080 * {@code populationSize <= 0} or {@code numberOfSuccesses > populationSize}, or 081 * {@code sampleSize > populationSize}. 082 */ 083 public static HypergeometricDistribution of(int populationSize, 084 int numberOfSuccesses, 085 int sampleSize) { 086 if (populationSize <= 0) { 087 throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE, 088 populationSize); 089 } 090 if (numberOfSuccesses < 0) { 091 throw new DistributionException(DistributionException.NEGATIVE, 092 numberOfSuccesses); 093 } 094 if (sampleSize < 0) { 095 throw new DistributionException(DistributionException.NEGATIVE, 096 sampleSize); 097 } 098 099 if (numberOfSuccesses > populationSize) { 100 throw new DistributionException(DistributionException.TOO_LARGE, 101 numberOfSuccesses, populationSize); 102 } 103 if (sampleSize > populationSize) { 104 throw new DistributionException(DistributionException.TOO_LARGE, 105 sampleSize, populationSize); 106 } 107 return new HypergeometricDistribution(populationSize, numberOfSuccesses, sampleSize); 108 } 109 110 /** 111 * Return the lowest domain value for the given hypergeometric distribution 112 * parameters. 113 * 114 * @param nn Population size. 115 * @param k Number of successes in the population. 116 * @param n Sample size. 117 * @return the lowest domain value of the hypergeometric distribution. 118 */ 119 private static int getLowerDomain(int nn, int k, int n) { 120 // Avoid overflow given N > n: 121 // n + K - N == K - (N - n) 122 return Math.max(0, k - (nn - n)); 123 } 124 125 /** 126 * Return the highest domain value for the given hypergeometric distribution 127 * parameters. 128 * 129 * @param k Number of successes in the population. 130 * @param n Sample size. 131 * @return the highest domain value of the hypergeometric distribution. 132 */ 133 private static int getUpperDomain(int k, int n) { 134 return Math.min(n, k); 135 } 136 137 /** 138 * Gets the population size parameter of this distribution. 139 * 140 * @return the population size. 141 */ 142 public int getPopulationSize() { 143 return populationSize; 144 } 145 146 /** 147 * Gets the number of successes parameter of this distribution. 148 * 149 * @return the number of successes. 150 */ 151 public int getNumberOfSuccesses() { 152 return numberOfSuccesses; 153 } 154 155 /** 156 * Gets the sample size parameter of this distribution. 157 * 158 * @return the sample size. 159 */ 160 public int getSampleSize() { 161 return sampleSize; 162 } 163 164 /** {@inheritDoc} */ 165 @Override 166 public double probability(int x) { 167 return Math.exp(logProbability(x)); 168 } 169 170 /** {@inheritDoc} */ 171 @Override 172 public double logProbability(int x) { 173 if (x < lowerBound || x > upperBound) { 174 return Double.NEGATIVE_INFINITY; 175 } 176 return computeLogProbability(x); 177 } 178 179 /** 180 * Compute the log probability. 181 * 182 * @param x Value. 183 * @return log(P(X = x)) 184 */ 185 private double computeLogProbability(int x) { 186 final double p1 = 187 SaddlePointExpansionUtils.logBinomialProbability(x, numberOfSuccesses, p, q); 188 final double p2 = 189 SaddlePointExpansionUtils.logBinomialProbability(sampleSize - x, 190 populationSize - numberOfSuccesses, p, q); 191 final double p3 = 192 SaddlePointExpansionUtils.logBinomialProbability(sampleSize, populationSize, p, q); 193 return p1 + p2 - p3; 194 } 195 196 /** {@inheritDoc} */ 197 @Override 198 public double cumulativeProbability(int x) { 199 if (x < lowerBound) { 200 return 0.0; 201 } else if (x >= upperBound) { 202 return 1.0; 203 } 204 return innerCumulativeProbability(lowerBound, x); 205 } 206 207 /** {@inheritDoc} */ 208 @Override 209 public double survivalProbability(int x) { 210 if (x < lowerBound) { 211 return 1.0; 212 } else if (x >= upperBound) { 213 return 0.0; 214 } 215 return innerCumulativeProbability(upperBound, x + 1); 216 } 217 218 /** 219 * For this distribution, {@code X}, this method returns 220 * {@code P(x0 <= X <= x1)}. 221 * This probability is computed by summing the point probabilities for the 222 * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined 223 * using a comparison of the input bounds. 224 * This should be called by using {@code x0} as the domain limit and {@code x1} 225 * as the internal value. This will result in an initial sum of increasing larger magnitudes. 226 * 227 * @param x0 Inclusive domain bound. 228 * @param x1 Inclusive internal bound. 229 * @return {@code P(x0 <= X <= x1)}. 230 */ 231 private double innerCumulativeProbability(int x0, int x1) { 232 // Assume the range is within the domain. 233 // Reuse the computation for probability(x) but avoid checking the domain for each call. 234 int x = x0; 235 double ret = Math.exp(computeLogProbability(x)); 236 if (x0 < x1) { 237 while (x != x1) { 238 x++; 239 ret += Math.exp(computeLogProbability(x)); 240 } 241 } else { 242 while (x != x1) { 243 x--; 244 ret += Math.exp(computeLogProbability(x)); 245 } 246 } 247 return ret; 248 } 249 250 /** 251 * {@inheritDoc} 252 * 253 * <p>For population size \( N \), number of successes \( K \), and sample 254 * size \( n \), the mean is: 255 * 256 * <p>\[ n \frac{K}{N} \] 257 */ 258 @Override 259 public double getMean() { 260 return getSampleSize() * (getNumberOfSuccesses() / (double) getPopulationSize()); 261 } 262 263 /** 264 * {@inheritDoc} 265 * 266 * <p>For population size \( N \), number of successes \( K \), and sample 267 * size \( n \), the variance is: 268 * 269 * <p>\[ n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1} \] 270 */ 271 @Override 272 public double getVariance() { 273 final double N = getPopulationSize(); 274 final double K = getNumberOfSuccesses(); 275 final double n = getSampleSize(); 276 return (n * K * (N - K) * (N - n)) / (N * N * (N - 1)); 277 } 278 279 /** 280 * {@inheritDoc} 281 * 282 * <p>For population size \( N \), number of successes \( K \), and sample 283 * size \( n \), the lower bound of the support is \( \max \{ 0, n + K - N \} \). 284 * 285 * @return lower bound of the support 286 */ 287 @Override 288 public int getSupportLowerBound() { 289 return lowerBound; 290 } 291 292 /** 293 * {@inheritDoc} 294 * 295 * <p>For number of successes \( K \), and sample 296 * size \( n \), the upper bound of the support is \( \min \{ n, K \} \). 297 * 298 * @return upper bound of the support 299 */ 300 @Override 301 public int getSupportUpperBound() { 302 return upperBound; 303 } 304}