View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.statistics.inference;
18  
19  import org.apache.commons.statistics.distribution.HypergeometricDistribution;
20  
21  /**
22   * Provide a wrapper around the {@link HypergeometricDistribution} that caches
23   * all probability mass values.
24   *
25   * <p>This class extracts the logic from the HypergeometricDistribution implementation
26   * used for the cumulative probability functions. It allows fast computation of
27   * the CDF and SF for the entire supported domain.
28   *
29   * @since 1.1
30   */
31  class Hypergeom {
32      /** 1/2. */
33      private static final double HALF = 0.5;
34      /** The lower bound of the support (inclusive). */
35      private final int lowerBound;
36      /** The upper bound of the support (inclusive). */
37      private final int upperBound;
38      /** Cached probability values. This holds values from x=0 even though the supported
39       * lower bound may be above x=0. This allows x to be used as an index without offsetting
40       * using the lower bound. */
41      private final double[] prob;
42      /** Cached midpoint, m, of the CDF/SF. This is not the true median. It is the value where
43       * the CDF is closest to 0.5; as such the CDF(m) may be below 0.5 if the next value
44       * CDF(m+1) is further from 0.5. Used for the cumulative probability functions. */
45      private final int m;
46      /** Cached CDF of the midpoint.
47       * Used for the cumulative probability functions. */
48      private final double midCDF;
49      /** Lower mode. */
50      private final int m1;
51      /** Upper mode. */
52      private final int m2;
53  
54      /**
55       * @param populationSize Population size.
56       * @param numberOfSuccesses Number of successes in the population.
57       * @param sampleSize Sample size.
58       */
59      Hypergeom(int populationSize,
60                int numberOfSuccesses,
61                int sampleSize) {
62          final HypergeometricDistribution dist =
63              HypergeometricDistribution.of(populationSize, numberOfSuccesses, sampleSize);
64  
65          // Cache all values required to compute the cumulative probability functions
66  
67          // Bounds
68          lowerBound = dist.getSupportLowerBound();
69          upperBound = dist.getSupportUpperBound();
70  
71          // PMF values
72          prob = new double[upperBound + 1];
73          for (int x = lowerBound; x <= upperBound; x++) {
74              prob[x] = dist.probability(x);
75          }
76  
77          // Compute mid-point for CDF/SF computation
78          // Find the closest sum(PDF) to 0.5.
79          int x = lowerBound;
80          double p0 = 0;
81          double p1 = prob[x];
82          // No check of the upper bound required here as the CDF should sum to 1 and 0.5
83          // is exceeded before a bounds error.
84          while (p1 < HALF) {
85              x++;
86              p0 = p1;
87              p1 += prob[x];
88          }
89          // p1 >= 0.5 > p0
90          // Pick closet
91          if (p1 - HALF >= HALF - p0) {
92              x--;
93              p1 = p0;
94          }
95          m = x;
96          midCDF = p1;
97  
98          // Compute the mode (lower != upper in the case where v is integer).
99          // This value is used by the UnconditionedExactTest and is cached here for convenience.
100         final double v = ((double) numberOfSuccesses + 1) * ((double) sampleSize + 1) / (populationSize + 2.0);
101         m1 = (int) Math.ceil(v) - 1;
102         m2 = (int) Math.floor(v);
103     }
104 
105     /**
106      * Get the lower bound of the support.
107      *
108      * @return lower bound
109      */
110     int getSupportLowerBound() {
111         return lowerBound;
112     }
113 
114     /**
115      * Get the upper bound of the support.
116      *
117      * @return upper bound
118      */
119     int getSupportUpperBound() {
120         return upperBound;
121     }
122 
123     /**
124      * Get the lower mode of the distribution.
125      *
126      * @return lower mode
127      */
128     int getLowerMode() {
129         return m1;
130     }
131 
132     /**
133      * Get the upper mode of the distribution.
134      *
135      * @return upper mode
136      */
137     int getUpperMode() {
138         return m2;
139     }
140 
141     /**
142      * Compute the probability mass function (PMF) at the specified value.
143      *
144      * @param x Value.
145      * @return P(X = x)
146      * @throws IndexOutOfBoundsException if the value {@code x} is not in the supported domain.
147      */
148     double pmf(int x) {
149         return prob[x];
150     }
151 
152     /**
153      * Compute the cumulative distribution function (CDF) at the specified value.
154      *
155      * @param x Value.
156      * @return P(X <= x)
157      */
158     double cdf(int x) {
159         if (x < lowerBound) {
160             return 0.0;
161         } else if (x >= upperBound) {
162             return 1.0;
163         }
164         if (x < m) {
165             return innerCumulativeProbability(lowerBound, x);
166         } else if (x > m) {
167             return 1 - innerCumulativeProbability(upperBound, x + 1);
168         }
169         // cdf(x)
170         return midCDF;
171     }
172 
173     /**
174      * Compute the survival function (SF) at the specified value. This is the complementary
175      * cumulative distribution function.
176      *
177      * @param x Value.
178      * @return P(X > x)
179      */
180     double sf(int x) {
181         if (x < lowerBound) {
182             return 1.0;
183         } else if (x >= upperBound) {
184             return 0.0;
185         }
186         if (x < m) {
187             return 1 - innerCumulativeProbability(lowerBound, x);
188         } else if (x > m) {
189             return innerCumulativeProbability(upperBound, x + 1);
190         }
191         // 1 - cdf(x)
192         return 1 - midCDF;
193     }
194 
195     /**
196      * For this distribution, {@code X}, this method returns
197      * {@code P(x0 <= X <= x1)}.
198      * This probability is computed by summing the point probabilities for the
199      * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined
200      * using a comparison of the input bounds.
201      * This should be called by using {@code x0} as the domain limit and {@code x1}
202      * as the internal value. This will result in a sum of increasingly larger magnitudes.
203      *
204      * @param x0 Inclusive domain bound.
205      * @param x1 Inclusive internal bound.
206      * @return {@code P(x0 <= X <= x1)}.
207      */
208     private double innerCumulativeProbability(int x0, int x1) {
209         // Assume the range is within the domain.
210         int x = x0;
211         double ret = prob[x];
212         if (x0 < x1) {
213             while (x != x1) {
214                 x++;
215                 ret += prob[x];
216             }
217         } else {
218             while (x != x1) {
219                 x--;
220                 ret += prob[x];
221             }
222         }
223         return ret;
224     }
225 }