View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.statistics.inference;
18  
19  import org.apache.commons.statistics.descriptive.LongMean;
20  import org.apache.commons.statistics.distribution.ChiSquaredDistribution;
21  
22  /**
23   * Implements chi-square test statistics.
24   *
25   * <p>This implementation handles both known and unknown distributions.
26   *
27   * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
28   * but provided by one sample, or when the hypothesis under test is that the two
29   * samples come from the same underlying distribution.
30   *
31   * @see <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-square test (Wikipedia)</a>
32   * @since 1.1
33   */
34  public final class ChiSquareTest {
35      /** Name for the row. */
36      private static final String ROW = "row";
37      /** Name for the column. */
38      private static final String COLUMN = "column";
39      /** Default instance. */
40      private static final ChiSquareTest DEFAULT = new ChiSquareTest(0);
41  
42      /** Degrees of freedom adjustment. */
43      private final int degreesOfFreedomAdjustment;
44  
45      /**
46       * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
47       */
48      private ChiSquareTest(int degreesOfFreedomAdjustment) {
49          this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
50      }
51  
52      /**
53       * Return an instance using the default options.
54       *
55       * <ul>
56       * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
57       * </ul>
58       *
59       * @return default instance
60       */
61      public static ChiSquareTest withDefaults() {
62          return DEFAULT;
63      }
64  
65      /**
66       * Return an instance with the configured degrees of freedom adjustment.
67       *
68       * <p>The default degrees of freedom for a sample of length {@code n} are
69       * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
70       * more parameters from the data in order to get the numbers for your null
71       * hypothesis. For a distribution with {@code p} parameters where up to
72       * {@code p} parameters have been estimated from the data the degrees of freedom
73       * is in the range {@code [n - 1 - p, n - 1]}.
74       *
75       * @param v Value.
76       * @return an instance
77       * @throws IllegalArgumentException if the value is negative
78       */
79      public ChiSquareTest withDegreesOfFreedomAdjustment(int v) {
80          return new ChiSquareTest(Arguments.checkNonNegative(v));
81      }
82  
83      /**
84       * Computes the chi-square goodness-of-fit statistic comparing the {@code observed} counts to a
85       * uniform expected value (each category is equally likely).
86       *
87       * <p>Note: This is a specialized version of a comparison of {@code observed}
88       * with an {@code expected} array of uniform values. The result is faster than
89       * calling {@link #statistic(double[], long[])} and the statistic is the same,
90       * with an allowance for accumulated floating-point error due to the optimized
91       * routine.
92       *
93       * @param observed Observed frequency counts.
94       * @return Chi-square statistic
95       * @throws IllegalArgumentException if the sample size is less than 2;
96       * {@code observed} has negative entries; or all the observations are zero.
97       * @see #test(long[])
98       */
99      public double statistic(long[] observed) {
100         Arguments.checkValuesRequiredSize(observed.length, 2);
101         Arguments.checkNonNegative(observed);
102         final double e = LongMean.of(observed).getAsDouble();
103         if (e == 0) {
104             throw new InferenceException(InferenceException.NO_DATA);
105         }
106         // chi2 = sum{ (o-e)^2 / e }. Use a single division at the end.
107         double chi2 = 0;
108         for (final long o : observed) {
109             final double d = o - e;
110             chi2 += d * d;
111         }
112         return chi2 / e;
113     }
114 
115     /**
116      * Computes the chi-square goodness-of-fit statistic comparing {@code observed} and
117      * {@code expected} frequency counts.
118      *
119      * <p><strong>Note:</strong>This implementation rescales the {@code expected}
120      * array if necessary to ensure that the sum of the expected and observed counts
121      * are equal.
122      *
123      * @param expected Expected frequency counts.
124      * @param observed Observed frequency counts.
125      * @return Chi-square statistic
126      * @throws IllegalArgumentException if the sample size is less than 2; the array
127      * sizes do not match; {@code expected} has entries that are not strictly
128      * positive; {@code observed} has negative entries; or all the observations are zero.
129      * @see #test(double[], long[])
130      */
131     public double statistic(double[] expected, long[] observed) {
132         final double ratio = StatisticUtils.computeRatio(expected, observed);
133         // chi2 = sum{ (o-e)^2 / e }
134         double chi2 = 0;
135         for (int i = 0; i < observed.length; i++) {
136             final double e = ratio * expected[i];
137             final double d = observed[i] - e;
138             chi2 += d * d / e;
139         }
140         return chi2;
141     }
142 
143     /**
144      * Computes the chi-square statistic associated with a chi-square test of
145      * independence based on the input {@code counts} array, viewed as a two-way
146      * table in row-major format.
147      *
148      * @param counts 2-way table.
149      * @return Chi-square statistic
150      * @throws IllegalArgumentException if the number of rows or columns is less
151      * than 2; the array is non-rectangular; the array has negative entries; or the
152      * sum of a row or column is zero.
153      * @see #test(long[][])
154      */
155     public double statistic(long[][] counts) {
156         Arguments.checkCategoriesRequiredSize(counts.length, 2);
157         Arguments.checkValuesRequiredSize(counts[0].length, 2);
158         Arguments.checkRectangular(counts);
159         Arguments.checkNonNegative(counts);
160 
161         final int nRows = counts.length;
162         final int nCols = counts[0].length;
163 
164         // compute row, column and total sums
165         final double[] rowSum = new double[nRows];
166         final double[] colSum = new double[nCols];
167         double sum = 0;
168         for (int row = 0; row < nRows; row++) {
169             for (int col = 0; col < nCols; col++) {
170                 rowSum[row] += counts[row][col];
171                 colSum[col] += counts[row][col];
172             }
173             checkNonZero(rowSum[row], ROW, row);
174             sum += rowSum[row];
175         }
176 
177         for (int col = 0; col < nCols; col++) {
178             checkNonZero(colSum[col], COLUMN, col);
179         }
180 
181         // Compute expected counts and chi-square
182         double chi2 = 0;
183         for (int row = 0; row < nRows; row++) {
184             for (int col = 0; col < nCols; col++) {
185                 final double e = (rowSum[row] * colSum[col]) / sum;
186                 final double d = counts[row][col] - e;
187                 chi2 += d * d / e;
188             }
189         }
190         return chi2;
191     }
192 
193     /**
194      * Computes a chi-square statistic associated with a chi-square test of
195      * independence of frequency counts in {@code observed1} and {@code observed2}.
196      * The sums of frequency counts in the two samples are not required to be the
197      * same. The formula used to compute the test statistic is:
198      *
199      * <p>\[ \sum_i{ \frac{(K * a_i - b_i / K)^2}{a_i + b_i} } \]
200      *
201      * <p>where
202      *
203      * <p>\[ K = \sqrt{ \sum_i{a_i} / \sum_i{b_i} } \]
204      *
205      * <p>Note: This is a specialized version of a 2-by-n contingency table. The
206      * result is faster than calling {@link #statistic(long[][])} with the table
207      * composed as {@code new long[][]{observed1, observed2}}. The statistic is the
208      * same, with an allowance for accumulated floating-point error due to the
209      * optimized routine.
210      *
211      * @param observed1 Observed frequency counts of the first data set.
212      * @param observed2 Observed frequency counts of the second data set.
213      * @return Chi-square statistic
214      * @throws IllegalArgumentException if the sample size is less than 2; the array
215      * sizes do not match; either array has entries that are negative; either all
216      * counts of {@code observed1} or {@code observed2} are zero; or if the count at
217      * some index is zero for both arrays.
218      * @see ChiSquareTest#test(long[], long[])
219      */
220     public double statistic(long[] observed1, long[] observed2) {
221         Arguments.checkValuesRequiredSize(observed1.length, 2);
222         Arguments.checkValuesSizeMatch(observed1.length, observed2.length);
223         Arguments.checkNonNegative(observed1);
224         Arguments.checkNonNegative(observed2);
225 
226         // Compute and compare count sums
227         long colSum1 = 0;
228         long colSum2 = 0;
229         for (int i = 0; i < observed1.length; i++) {
230             final long obs1 = observed1[i];
231             final long obs2 = observed2[i];
232             checkNonZero(obs1 | obs2, ROW, i);
233             colSum1 += obs1;
234             colSum2 += obs2;
235         }
236         // Create the same exception message as chiSquare(long[][])
237         checkNonZero(colSum1, COLUMN, 0);
238         checkNonZero(colSum2, COLUMN, 1);
239 
240         // Compare and compute weight only if different
241         final boolean unequalCounts = colSum1 != colSum2;
242         final double weight = unequalCounts ?
243             Math.sqrt((double) colSum1 / colSum2) : 1;
244         // Compute chi-square
245         // This exploits an algebraic rearrangement of the generic n*m contingency table case
246         // for a single sum squared addition per row.
247         double chi2 = 0;
248         for (int i = 0; i < observed1.length; i++) {
249             final double obs1 = observed1[i];
250             final double obs2 = observed2[i];
251             // apply weights
252             final double d = unequalCounts ?
253                     obs1 / weight - obs2 * weight :
254                     obs1 - obs2;
255             chi2 += (d * d) / (obs1 + obs2);
256         }
257         return chi2;
258     }
259 
260     /**
261      * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that
262      * the {@code observed} counts conform to a uniform distribution (each category
263      * is equally likely).
264      *
265      * @param observed Observed frequency counts.
266      * @return test result
267      * @throws IllegalArgumentException if the sample size is less than 2;
268      * {@code observed} has negative entries; or all the observations are zero
269      * @see #statistic(long[])
270      */
271     public SignificanceResult test(long[] observed) {
272         final int df = observed.length - 1;
273         final double chi2 = statistic(observed);
274         final double p = computeP(chi2, df);
275         return new BaseSignificanceResult(chi2, p);
276     }
277 
278     /**
279      * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that the
280      * {@code observed} counts conform to the {@code expected} counts.
281      *
282      * <p>The test can be configured to apply an adjustment to the degrees of freedom
283      * if the observed data has been used to create the expected counts.
284      *
285      * @param expected Expected frequency counts.
286      * @param observed Observed frequency counts.
287      * @return test result
288      * @throws IllegalArgumentException if the sample size is less than 2; the array
289      * sizes do not match; {@code expected} has entries that are not strictly
290      * positive; {@code observed} has negative entries; all the observations are zero; or
291      * the adjusted degrees of freedom are not strictly positive
292      * @see #withDegreesOfFreedomAdjustment(int)
293      * @see #statistic(double[], long[])
294      */
295     public SignificanceResult test(double[] expected, long[] observed) {
296         final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
297         final double chi2 = statistic(expected, observed);
298         final double p = computeP(chi2, df);
299         return new BaseSignificanceResult(chi2, p);
300     }
301 
302     /**
303      * Perform a chi-square test of independence based on the input {@code counts} array,
304      * viewed as a two-way table.
305      *
306      * @param counts 2-way table.
307      * @return test result
308      * @throws IllegalArgumentException if the number of rows or columns is less
309      * than 2; the array is non-rectangular; the array has negative entries; or the
310      * sum of a row or column is zero.
311      * @see #statistic(long[][])
312      */
313     public SignificanceResult test(long[][] counts) {
314         final double chi2 = statistic(counts);
315         final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
316         final double p = computeP(chi2, df);
317         return new BaseSignificanceResult(chi2, p);
318     }
319 
320     /**
321      * Perform a chi-square test of independence of frequency counts in
322      * {@code observed1} and {@code observed2}.
323      *
324      * <p>Note: This is a specialized version of a 2-by-n contingency table.
325      *
326      * @param observed1 Observed frequency counts of the first data set.
327      * @param observed2 Observed frequency counts of the second data set.
328      * @return test result
329      * @throws IllegalArgumentException if the sample size is less than 2; the array
330      * sizes do not match; either array has entries that are negative; either all
331      * counts of {@code observed1} or {@code observed2} are zero; or if the count at
332      * some index is zero for both arrays.
333      * @see #statistic(long[], long[])
334      */
335     public SignificanceResult test(long[] observed1, long[] observed2) {
336         final double chi2 = statistic(observed1, observed2);
337         final double p = computeP(chi2, observed1.length - 1.0);
338         return new BaseSignificanceResult(chi2, p);
339     }
340 
341     /**
342      * Compute the chi-square test p-value.
343      *
344      * @param chi2 Chi-square statistic.
345      * @param degreesOfFreedom Degrees of freedom.
346      * @return p-value
347      */
348     private static double computeP(double chi2, double degreesOfFreedom) {
349         return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(chi2);
350     }
351 
352     /**
353      * Check the array value is non-zero.
354      *
355      * @param value Value
356      * @param name Name of the array
357      * @param index Index in the array
358      * @throws IllegalArgumentException if the value is zero
359      */
360     private static void checkNonZero(double value, String name, int index) {
361         if (value == 0) {
362             throw new InferenceException(InferenceException.ZERO_AT, name, index);
363         }
364     }
365 }