1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.statistics.inference;
18
19 import org.apache.commons.statistics.distribution.HypergeometricDistribution;
20
21 /**
22 * Provide a wrapper around the {@link HypergeometricDistribution} that caches
23 * all probability mass values.
24 *
25 * <p>This class extracts the logic from the HypergeometricDistribution implementation
26 * used for the cumulative probability functions. It allows fast computation of
27 * the CDF and SF for the entire supported domain.
28 *
29 * @since 1.1
30 */
31 class Hypergeom {
32 /** 1/2. */
33 private static final double HALF = 0.5;
34 /** The lower bound of the support (inclusive). */
35 private final int lowerBound;
36 /** The upper bound of the support (inclusive). */
37 private final int upperBound;
38 /** Cached probability values. This holds values from x=0 even though the supported
39 * lower bound may be above x=0. This allows x to be used as an index without offsetting
40 * using the lower bound. */
41 private final double[] prob;
42 /** Cached midpoint, m, of the CDF/SF. This is not the true median. It is the value where
43 * the CDF is closest to 0.5; as such the CDF(m) may be below 0.5 if the next value
44 * CDF(m+1) is further from 0.5. Used for the cumulative probability functions. */
45 private final int m;
46 /** Cached CDF of the midpoint.
47 * Used for the cumulative probability functions. */
48 private final double midCDF;
49 /** Lower mode. */
50 private final int m1;
51 /** Upper mode. */
52 private final int m2;
53
54 /**
55 * @param populationSize Population size.
56 * @param numberOfSuccesses Number of successes in the population.
57 * @param sampleSize Sample size.
58 */
59 Hypergeom(int populationSize,
60 int numberOfSuccesses,
61 int sampleSize) {
62 final HypergeometricDistribution dist =
63 HypergeometricDistribution.of(populationSize, numberOfSuccesses, sampleSize);
64
65 // Cache all values required to compute the cumulative probability functions
66
67 // Bounds
68 lowerBound = dist.getSupportLowerBound();
69 upperBound = dist.getSupportUpperBound();
70
71 // PMF values
72 prob = new double[upperBound + 1];
73 for (int x = lowerBound; x <= upperBound; x++) {
74 prob[x] = dist.probability(x);
75 }
76
77 // Compute mid-point for CDF/SF computation
78 // Find the closest sum(PDF) to 0.5.
79 int x = lowerBound;
80 double p0 = 0;
81 double p1 = prob[x];
82 // No check of the upper bound required here as the CDF should sum to 1 and 0.5
83 // is exceeded before a bounds error.
84 while (p1 < HALF) {
85 x++;
86 p0 = p1;
87 p1 += prob[x];
88 }
89 // p1 >= 0.5 > p0
90 // Pick closet
91 if (p1 - HALF >= HALF - p0) {
92 x--;
93 p1 = p0;
94 }
95 m = x;
96 midCDF = p1;
97
98 // Compute the mode (lower != upper in the case where v is integer).
99 // This value is used by the UnconditionedExactTest and is cached here for convenience.
100 final double v = ((double) numberOfSuccesses + 1) * ((double) sampleSize + 1) / (populationSize + 2.0);
101 m1 = (int) Math.ceil(v) - 1;
102 m2 = (int) Math.floor(v);
103 }
104
105 /**
106 * Get the lower bound of the support.
107 *
108 * @return lower bound
109 */
110 int getSupportLowerBound() {
111 return lowerBound;
112 }
113
114 /**
115 * Get the upper bound of the support.
116 *
117 * @return upper bound
118 */
119 int getSupportUpperBound() {
120 return upperBound;
121 }
122
123 /**
124 * Get the lower mode of the distribution.
125 *
126 * @return lower mode
127 */
128 int getLowerMode() {
129 return m1;
130 }
131
132 /**
133 * Get the upper mode of the distribution.
134 *
135 * @return upper mode
136 */
137 int getUpperMode() {
138 return m2;
139 }
140
141 /**
142 * Compute the probability mass function (PMF) at the specified value.
143 *
144 * @param x Value.
145 * @return P(X = x)
146 * @throws IndexOutOfBoundsException if the value {@code x} is not in the supported domain.
147 */
148 double pmf(int x) {
149 return prob[x];
150 }
151
152 /**
153 * Compute the cumulative distribution function (CDF) at the specified value.
154 *
155 * @param x Value.
156 * @return P(X <= x)
157 */
158 double cdf(int x) {
159 if (x < lowerBound) {
160 return 0.0;
161 } else if (x >= upperBound) {
162 return 1.0;
163 }
164 if (x < m) {
165 return innerCumulativeProbability(lowerBound, x);
166 } else if (x > m) {
167 return 1 - innerCumulativeProbability(upperBound, x + 1);
168 }
169 // cdf(x)
170 return midCDF;
171 }
172
173 /**
174 * Compute the survival function (SF) at the specified value. This is the complementary
175 * cumulative distribution function.
176 *
177 * @param x Value.
178 * @return P(X > x)
179 */
180 double sf(int x) {
181 if (x < lowerBound) {
182 return 1.0;
183 } else if (x >= upperBound) {
184 return 0.0;
185 }
186 if (x < m) {
187 return 1 - innerCumulativeProbability(lowerBound, x);
188 } else if (x > m) {
189 return innerCumulativeProbability(upperBound, x + 1);
190 }
191 // 1 - cdf(x)
192 return 1 - midCDF;
193 }
194
195 /**
196 * For this distribution, {@code X}, this method returns
197 * {@code P(x0 <= X <= x1)}.
198 * This probability is computed by summing the point probabilities for the
199 * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined
200 * using a comparison of the input bounds.
201 * This should be called by using {@code x0} as the domain limit and {@code x1}
202 * as the internal value. This will result in a sum of increasingly larger magnitudes.
203 *
204 * @param x0 Inclusive domain bound.
205 * @param x1 Inclusive internal bound.
206 * @return {@code P(x0 <= X <= x1)}.
207 */
208 private double innerCumulativeProbability(int x0, int x1) {
209 // Assume the range is within the domain.
210 int x = x0;
211 double ret = prob[x];
212 if (x0 < x1) {
213 while (x != x1) {
214 x++;
215 ret += prob[x];
216 }
217 } else {
218 while (x != x1) {
219 x--;
220 ret += prob[x];
221 }
222 }
223 return ret;
224 }
225 }