ChiSquareTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.statistics.inference;

  18. import org.apache.commons.statistics.descriptive.LongMean;
  19. import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

  20. /**
  21.  * Implements chi-square test statistics.
  22.  *
  23.  * <p>This implementation handles both known and unknown distributions.
  24.  *
  25.  * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
  26.  * but provided by one sample, or when the hypothesis under test is that the two
  27.  * samples come from the same underlying distribution.
  28.  *
  29.  * @see <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-square test (Wikipedia)</a>
  30.  * @since 1.1
  31.  */
  32. public final class ChiSquareTest {
  33.     /** Name for the row. */
  34.     private static final String ROW = "row";
  35.     /** Name for the column. */
  36.     private static final String COLUMN = "column";
  37.     /** Default instance. */
  38.     private static final ChiSquareTest DEFAULT = new ChiSquareTest(0);

  39.     /** Degrees of freedom adjustment. */
  40.     private final int degreesOfFreedomAdjustment;

  41.     /**
  42.      * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
  43.      */
  44.     private ChiSquareTest(int degreesOfFreedomAdjustment) {
  45.         this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
  46.     }

  47.     /**
  48.      * Return an instance using the default options.
  49.      *
  50.      * <ul>
  51.      * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
  52.      * </ul>
  53.      *
  54.      * @return default instance
  55.      */
  56.     public static ChiSquareTest withDefaults() {
  57.         return DEFAULT;
  58.     }

  59.     /**
  60.      * Return an instance with the configured degrees of freedom adjustment.
  61.      *
  62.      * <p>The default degrees of freedom for a sample of length {@code n} are
  63.      * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
  64.      * more parameters from the data in order to get the numbers for your null
  65.      * hypothesis. For a distribution with {@code p} parameters where up to
  66.      * {@code p} parameters have been estimated from the data the degrees of freedom
  67.      * is in the range {@code [n - 1 - p, n - 1]}.
  68.      *
  69.      * @param v Value.
  70.      * @return an instance
  71.      * @throws IllegalArgumentException if the value is negative
  72.      */
  73.     public ChiSquareTest withDegreesOfFreedomAdjustment(int v) {
  74.         return new ChiSquareTest(Arguments.checkNonNegative(v));
  75.     }

  76.     /**
  77.      * Computes the chi-square goodness-of-fit statistic comparing the {@code observed} counts to a
  78.      * uniform expected value (each category is equally likely).
  79.      *
  80.      * <p>Note: This is a specialized version of a comparison of {@code observed}
  81.      * with an {@code expected} array of uniform values. The result is faster than
  82.      * calling {@link #statistic(double[], long[])} and the statistic is the same,
  83.      * with an allowance for accumulated floating-point error due to the optimized
  84.      * routine.
  85.      *
  86.      * @param observed Observed frequency counts.
  87.      * @return Chi-square statistic
  88.      * @throws IllegalArgumentException if the sample size is less than 2;
  89.      * {@code observed} has negative entries; or all the observations are zero.
  90.      * @see #test(long[])
  91.      */
  92.     public double statistic(long[] observed) {
  93.         Arguments.checkValuesRequiredSize(observed.length, 2);
  94.         Arguments.checkNonNegative(observed);
  95.         final double e = LongMean.of(observed).getAsDouble();
  96.         if (e == 0) {
  97.             throw new InferenceException(InferenceException.NO_DATA);
  98.         }
  99.         // chi2 = sum{ (o-e)^2 / e }. Use a single division at the end.
  100.         double chi2 = 0;
  101.         for (final long o : observed) {
  102.             final double d = o - e;
  103.             chi2 += d * d;
  104.         }
  105.         return chi2 / e;
  106.     }

  107.     /**
  108.      * Computes the chi-square goodness-of-fit statistic comparing {@code observed} and
  109.      * {@code expected} frequency counts.
  110.      *
  111.      * <p><strong>Note:</strong>This implementation rescales the {@code expected}
  112.      * array if necessary to ensure that the sum of the expected and observed counts
  113.      * are equal.
  114.      *
  115.      * @param expected Expected frequency counts.
  116.      * @param observed Observed frequency counts.
  117.      * @return Chi-square statistic
  118.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  119.      * sizes do not match; {@code expected} has entries that are not strictly
  120.      * positive; {@code observed} has negative entries; or all the observations are zero.
  121.      * @see #test(double[], long[])
  122.      */
  123.     public double statistic(double[] expected, long[] observed) {
  124.         final double ratio = StatisticUtils.computeRatio(expected, observed);
  125.         // chi2 = sum{ (o-e)^2 / e }
  126.         double chi2 = 0;
  127.         for (int i = 0; i < observed.length; i++) {
  128.             final double e = ratio * expected[i];
  129.             final double d = observed[i] - e;
  130.             chi2 += d * d / e;
  131.         }
  132.         return chi2;
  133.     }

  134.     /**
  135.      * Computes the chi-square statistic associated with a chi-square test of
  136.      * independence based on the input {@code counts} array, viewed as a two-way
  137.      * table in row-major format.
  138.      *
  139.      * @param counts 2-way table.
  140.      * @return Chi-square statistic
  141.      * @throws IllegalArgumentException if the number of rows or columns is less
  142.      * than 2; the array is non-rectangular; the array has negative entries; or the
  143.      * sum of a row or column is zero.
  144.      * @see #test(long[][])
  145.      */
  146.     public double statistic(long[][] counts) {
  147.         Arguments.checkCategoriesRequiredSize(counts.length, 2);
  148.         Arguments.checkValuesRequiredSize(counts[0].length, 2);
  149.         Arguments.checkRectangular(counts);
  150.         Arguments.checkNonNegative(counts);

  151.         final int nRows = counts.length;
  152.         final int nCols = counts[0].length;

  153.         // compute row, column and total sums
  154.         final double[] rowSum = new double[nRows];
  155.         final double[] colSum = new double[nCols];
  156.         double sum = 0;
  157.         for (int row = 0; row < nRows; row++) {
  158.             for (int col = 0; col < nCols; col++) {
  159.                 rowSum[row] += counts[row][col];
  160.                 colSum[col] += counts[row][col];
  161.             }
  162.             checkNonZero(rowSum[row], ROW, row);
  163.             sum += rowSum[row];
  164.         }

  165.         for (int col = 0; col < nCols; col++) {
  166.             checkNonZero(colSum[col], COLUMN, col);
  167.         }

  168.         // Compute expected counts and chi-square
  169.         double chi2 = 0;
  170.         for (int row = 0; row < nRows; row++) {
  171.             for (int col = 0; col < nCols; col++) {
  172.                 final double e = (rowSum[row] * colSum[col]) / sum;
  173.                 final double d = counts[row][col] - e;
  174.                 chi2 += d * d / e;
  175.             }
  176.         }
  177.         return chi2;
  178.     }

  179.     /**
  180.      * Computes a chi-square statistic associated with a chi-square test of
  181.      * independence of frequency counts in {@code observed1} and {@code observed2}.
  182.      * The sums of frequency counts in the two samples are not required to be the
  183.      * same. The formula used to compute the test statistic is:
  184.      *
  185.      * <p>\[ \sum_i{ \frac{(K * a_i - b_i / K)^2}{a_i + b_i} } \]
  186.      *
  187.      * <p>where
  188.      *
  189.      * <p>\[ K = \sqrt{ \sum_i{a_i} / \sum_i{b_i} } \]
  190.      *
  191.      * <p>Note: This is a specialized version of a 2-by-n contingency table. The
  192.      * result is faster than calling {@link #statistic(long[][])} with the table
  193.      * composed as {@code new long[][]{observed1, observed2}}. The statistic is the
  194.      * same, with an allowance for accumulated floating-point error due to the
  195.      * optimized routine.
  196.      *
  197.      * @param observed1 Observed frequency counts of the first data set.
  198.      * @param observed2 Observed frequency counts of the second data set.
  199.      * @return Chi-square statistic
  200.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  201.      * sizes do not match; either array has entries that are negative; either all
  202.      * counts of {@code observed1} or {@code observed2} are zero; or if the count at
  203.      * some index is zero for both arrays.
  204.      * @see ChiSquareTest#test(long[], long[])
  205.      */
  206.     public double statistic(long[] observed1, long[] observed2) {
  207.         Arguments.checkValuesRequiredSize(observed1.length, 2);
  208.         Arguments.checkValuesSizeMatch(observed1.length, observed2.length);
  209.         Arguments.checkNonNegative(observed1);
  210.         Arguments.checkNonNegative(observed2);

  211.         // Compute and compare count sums
  212.         long colSum1 = 0;
  213.         long colSum2 = 0;
  214.         for (int i = 0; i < observed1.length; i++) {
  215.             final long obs1 = observed1[i];
  216.             final long obs2 = observed2[i];
  217.             checkNonZero(obs1 | obs2, ROW, i);
  218.             colSum1 += obs1;
  219.             colSum2 += obs2;
  220.         }
  221.         // Create the same exception message as chiSquare(long[][])
  222.         checkNonZero(colSum1, COLUMN, 0);
  223.         checkNonZero(colSum2, COLUMN, 1);

  224.         // Compare and compute weight only if different
  225.         final boolean unequalCounts = colSum1 != colSum2;
  226.         final double weight = unequalCounts ?
  227.             Math.sqrt((double) colSum1 / colSum2) : 1;
  228.         // Compute chi-square
  229.         // This exploits an algebraic rearrangement of the generic n*m contingency table case
  230.         // for a single sum squared addition per row.
  231.         double chi2 = 0;
  232.         for (int i = 0; i < observed1.length; i++) {
  233.             final double obs1 = observed1[i];
  234.             final double obs2 = observed2[i];
  235.             // apply weights
  236.             final double d = unequalCounts ?
  237.                     obs1 / weight - obs2 * weight :
  238.                     obs1 - obs2;
  239.             chi2 += (d * d) / (obs1 + obs2);
  240.         }
  241.         return chi2;
  242.     }

  243.     /**
  244.      * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that
  245.      * the {@code observed} counts conform to a uniform distribution (each category
  246.      * is equally likely).
  247.      *
  248.      * @param observed Observed frequency counts.
  249.      * @return test result
  250.      * @throws IllegalArgumentException if the sample size is less than 2;
  251.      * {@code observed} has negative entries; or all the observations are zero
  252.      * @see #statistic(long[])
  253.      */
  254.     public SignificanceResult test(long[] observed) {
  255.         final int df = observed.length - 1;
  256.         final double chi2 = statistic(observed);
  257.         final double p = computeP(chi2, df);
  258.         return new BaseSignificanceResult(chi2, p);
  259.     }

  260.     /**
  261.      * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that the
  262.      * {@code observed} counts conform to the {@code expected} counts.
  263.      *
  264.      * <p>The test can be configured to apply an adjustment to the degrees of freedom
  265.      * if the observed data has been used to create the expected counts.
  266.      *
  267.      * @param expected Expected frequency counts.
  268.      * @param observed Observed frequency counts.
  269.      * @return test result
  270.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  271.      * sizes do not match; {@code expected} has entries that are not strictly
  272.      * positive; {@code observed} has negative entries; all the observations are zero; or
  273.      * the adjusted degrees of freedom are not strictly positive
  274.      * @see #withDegreesOfFreedomAdjustment(int)
  275.      * @see #statistic(double[], long[])
  276.      */
  277.     public SignificanceResult test(double[] expected, long[] observed) {
  278.         final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
  279.         final double chi2 = statistic(expected, observed);
  280.         final double p = computeP(chi2, df);
  281.         return new BaseSignificanceResult(chi2, p);
  282.     }

  283.     /**
  284.      * Perform a chi-square test of independence based on the input {@code counts} array,
  285.      * viewed as a two-way table.
  286.      *
  287.      * @param counts 2-way table.
  288.      * @return test result
  289.      * @throws IllegalArgumentException if the number of rows or columns is less
  290.      * than 2; the array is non-rectangular; the array has negative entries; or the
  291.      * sum of a row or column is zero.
  292.      * @see #statistic(long[][])
  293.      */
  294.     public SignificanceResult test(long[][] counts) {
  295.         final double chi2 = statistic(counts);
  296.         final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
  297.         final double p = computeP(chi2, df);
  298.         return new BaseSignificanceResult(chi2, p);
  299.     }

  300.     /**
  301.      * Perform a chi-square test of independence of frequency counts in
  302.      * {@code observed1} and {@code observed2}.
  303.      *
  304.      * <p>Note: This is a specialized version of a 2-by-n contingency table.
  305.      *
  306.      * @param observed1 Observed frequency counts of the first data set.
  307.      * @param observed2 Observed frequency counts of the second data set.
  308.      * @return test result
  309.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  310.      * sizes do not match; either array has entries that are negative; either all
  311.      * counts of {@code observed1} or {@code observed2} are zero; or if the count at
  312.      * some index is zero for both arrays.
  313.      * @see #statistic(long[], long[])
  314.      */
  315.     public SignificanceResult test(long[] observed1, long[] observed2) {
  316.         final double chi2 = statistic(observed1, observed2);
  317.         final double p = computeP(chi2, observed1.length - 1.0);
  318.         return new BaseSignificanceResult(chi2, p);
  319.     }

  320.     /**
  321.      * Compute the chi-square test p-value.
  322.      *
  323.      * @param chi2 Chi-square statistic.
  324.      * @param degreesOfFreedom Degrees of freedom.
  325.      * @return p-value
  326.      */
  327.     private static double computeP(double chi2, double degreesOfFreedom) {
  328.         return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(chi2);
  329.     }

  330.     /**
  331.      * Check the array value is non-zero.
  332.      *
  333.      * @param value Value
  334.      * @param name Name of the array
  335.      * @param index Index in the array
  336.      * @throws IllegalArgumentException if the value is zero
  337.      */
  338.     private static void checkNonZero(double value, String name, int index) {
  339.         if (value == 0) {
  340.             throw new InferenceException(InferenceException.ZERO_AT, name, index);
  341.         }
  342.     }
  343. }