1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.statistics.inference; 18 19 import org.apache.commons.statistics.distribution.HypergeometricDistribution; 20 21 /** 22 * Provide a wrapper around the {@link HypergeometricDistribution} that caches 23 * all probability mass values. 24 * 25 * <p>This class extracts the logic from the HypergeometricDistribution implementation 26 * used for the cumulative probability functions. It allows fast computation of 27 * the CDF and SF for the entire supported domain. 28 * 29 * @since 1.1 30 */ 31 class Hypergeom { 32 /** 1/2. */ 33 private static final double HALF = 0.5; 34 /** The lower bound of the support (inclusive). */ 35 private final int lowerBound; 36 /** The upper bound of the support (inclusive). */ 37 private final int upperBound; 38 /** Cached probability values. This holds values from x=0 even though the supported 39 * lower bound may be above x=0. This allows x to be used as an index without offsetting 40 * using the lower bound. */ 41 private final double[] prob; 42 /** Cached midpoint, m, of the CDF/SF. This is not the true median. It is the value where 43 * the CDF is closest to 0.5; as such the CDF(m) may be below 0.5 if the next value 44 * CDF(m+1) is further from 0.5. Used for the cumulative probability functions. */ 45 private final int m; 46 /** Cached CDF of the midpoint. 47 * Used for the cumulative probability functions. */ 48 private final double midCDF; 49 /** Lower mode. */ 50 private final int m1; 51 /** Upper mode. */ 52 private final int m2; 53 54 /** 55 * @param populationSize Population size. 56 * @param numberOfSuccesses Number of successes in the population. 57 * @param sampleSize Sample size. 58 */ 59 Hypergeom(int populationSize, 60 int numberOfSuccesses, 61 int sampleSize) { 62 final HypergeometricDistribution dist = 63 HypergeometricDistribution.of(populationSize, numberOfSuccesses, sampleSize); 64 65 // Cache all values required to compute the cumulative probability functions 66 67 // Bounds 68 lowerBound = dist.getSupportLowerBound(); 69 upperBound = dist.getSupportUpperBound(); 70 71 // PMF values 72 prob = new double[upperBound + 1]; 73 for (int x = lowerBound; x <= upperBound; x++) { 74 prob[x] = dist.probability(x); 75 } 76 77 // Compute mid-point for CDF/SF computation 78 // Find the closest sum(PDF) to 0.5. 79 int x = lowerBound; 80 double p0 = 0; 81 double p1 = prob[x]; 82 // No check of the upper bound required here as the CDF should sum to 1 and 0.5 83 // is exceeded before a bounds error. 84 while (p1 < HALF) { 85 x++; 86 p0 = p1; 87 p1 += prob[x]; 88 } 89 // p1 >= 0.5 > p0 90 // Pick closet 91 if (p1 - HALF >= HALF - p0) { 92 x--; 93 p1 = p0; 94 } 95 m = x; 96 midCDF = p1; 97 98 // Compute the mode (lower != upper in the case where v is integer). 99 // This value is used by the UnconditionedExactTest and is cached here for convenience. 100 final double v = ((double) numberOfSuccesses + 1) * ((double) sampleSize + 1) / (populationSize + 2.0); 101 m1 = (int) Math.ceil(v) - 1; 102 m2 = (int) Math.floor(v); 103 } 104 105 /** 106 * Get the lower bound of the support. 107 * 108 * @return lower bound 109 */ 110 int getSupportLowerBound() { 111 return lowerBound; 112 } 113 114 /** 115 * Get the upper bound of the support. 116 * 117 * @return upper bound 118 */ 119 int getSupportUpperBound() { 120 return upperBound; 121 } 122 123 /** 124 * Get the lower mode of the distribution. 125 * 126 * @return lower mode 127 */ 128 int getLowerMode() { 129 return m1; 130 } 131 132 /** 133 * Get the upper mode of the distribution. 134 * 135 * @return upper mode 136 */ 137 int getUpperMode() { 138 return m2; 139 } 140 141 /** 142 * Compute the probability mass function (PMF) at the specified value. 143 * 144 * @param x Value. 145 * @return P(X = x) 146 * @throws IndexOutOfBoundsException if the value {@code x} is not in the supported domain. 147 */ 148 double pmf(int x) { 149 return prob[x]; 150 } 151 152 /** 153 * Compute the cumulative distribution function (CDF) at the specified value. 154 * 155 * @param x Value. 156 * @return P(X <= x) 157 */ 158 double cdf(int x) { 159 if (x < lowerBound) { 160 return 0.0; 161 } else if (x >= upperBound) { 162 return 1.0; 163 } 164 if (x < m) { 165 return innerCumulativeProbability(lowerBound, x); 166 } else if (x > m) { 167 return 1 - innerCumulativeProbability(upperBound, x + 1); 168 } 169 // cdf(x) 170 return midCDF; 171 } 172 173 /** 174 * Compute the survival function (SF) at the specified value. This is the complementary 175 * cumulative distribution function. 176 * 177 * @param x Value. 178 * @return P(X > x) 179 */ 180 double sf(int x) { 181 if (x < lowerBound) { 182 return 1.0; 183 } else if (x >= upperBound) { 184 return 0.0; 185 } 186 if (x < m) { 187 return 1 - innerCumulativeProbability(lowerBound, x); 188 } else if (x > m) { 189 return innerCumulativeProbability(upperBound, x + 1); 190 } 191 // 1 - cdf(x) 192 return 1 - midCDF; 193 } 194 195 /** 196 * For this distribution, {@code X}, this method returns 197 * {@code P(x0 <= X <= x1)}. 198 * This probability is computed by summing the point probabilities for the 199 * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined 200 * using a comparison of the input bounds. 201 * This should be called by using {@code x0} as the domain limit and {@code x1} 202 * as the internal value. This will result in a sum of increasingly larger magnitudes. 203 * 204 * @param x0 Inclusive domain bound. 205 * @param x1 Inclusive internal bound. 206 * @return {@code P(x0 <= X <= x1)}. 207 */ 208 private double innerCumulativeProbability(int x0, int x1) { 209 // Assume the range is within the domain. 210 int x = x0; 211 double ret = prob[x]; 212 if (x0 < x1) { 213 while (x != x1) { 214 x++; 215 ret += prob[x]; 216 } 217 } else { 218 while (x != x1) { 219 x--; 220 ret += prob[x]; 221 } 222 } 223 return ret; 224 } 225 }