StatisticUtils.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.statistics.inference;

  18. import java.util.Arrays;
  19. import java.util.Collection;
  20. import org.apache.commons.numbers.core.DD;
  21. import org.apache.commons.numbers.core.Precision;
  22. import org.apache.commons.numbers.core.Sum;
  23. import org.apache.commons.statistics.descriptive.Mean;

  24. /**
  25.  * Utility computation methods.
  26.  *
  27.  * @since 1.1
  28.  */
  29. final class StatisticUtils {
  30.     /** No instances. */
  31.     private StatisticUtils() {}

  32.     /**
  33.      * Compute {@code x - y}.
  34.      *
  35.      * <p>If {@code y} is zero the original array is returned, else a new array is created
  36.      * with the difference.
  37.      *
  38.      * @param x Array.
  39.      * @param y Value.
  40.      * @return x - y
  41.      * @throws NullPointerException if {@code x} is null and {@code y} is non-zero
  42.      */
  43.     static double[] subtract(double[] x, double y) {
  44.         return y == 0 ? x : Arrays.stream(x).map(v -> v - y).toArray();
  45.     }

  46.     /**
  47.      * Compute the degrees of freedom as {@code n - 1 - m}.
  48.      *
  49.      * <p>This method is common functionality shared between the Chi-square test and
  50.      * G-test. The pre-conditions for those tests are performed by this method.
  51.      *
  52.      * @param n Number of observations.
  53.      * @param m Adjustment (assumed to be positive).
  54.      * @return the degrees of freedom
  55.      * @throws IllegalArgumentException if the degrees of freedom is not strictly positive
  56.      */
  57.     static int computeDegreesOfFreedom(int n, int m) {
  58.         final int df = n - 1 - m;
  59.         if (df <= 0) {
  60.             throw new InferenceException("Invalid degrees of freedom: " + df);
  61.         }
  62.         return df;
  63.     }

  64.     /**
  65.      * Gets the ratio between the sum of the observed and expected values.
  66.      * The ratio can be used to scale the expected values to have the same sum
  67.      * as the observed values:
  68.      *
  69.      * <pre>
  70.      * sum(o) = sum(e * ratio)
  71.      * </pre>
  72.      *
  73.      * <p>This method is common functionality shared between the Chi-square test and
  74.      * G-test. The pre-conditions for those tests are performed by this method.
  75.      *
  76.      * @param expected Expected values.
  77.      * @param observed Observed values.
  78.      * @return the ratio
  79.      * @throws IllegalArgumentException if the sample size is less than 2; the array
  80.      * sizes do not match; {@code expected} has entries that are not strictly
  81.      * positive; {@code observed} has negative entries; or all the observations are zero.
  82.      */
  83.     static double computeRatio(double[] expected, long[] observed) {
  84.         Arguments.checkValuesRequiredSize(expected.length, 2);
  85.         Arguments.checkValuesSizeMatch(expected.length, observed.length);
  86.         Arguments.checkStrictlyPositive(expected);
  87.         Arguments.checkNonNegative(observed);
  88.         DD e = DD.ZERO;
  89.         DD o = DD.ZERO;
  90.         for (int i = 0; i < observed.length; i++) {
  91.             e = e.add(expected[i]);
  92.             o = add(o, observed[i]);
  93.         }
  94.         if (o.doubleValue() == 0) {
  95.             throw new InferenceException(InferenceException.NO_DATA);
  96.         }
  97.         // sum(o) / sum(e)
  98.         final double ratio = o.divide(e).doubleValue();
  99.         // Allow a sum within 1 ulp of 1.0
  100.         return Precision.equals(ratio, 1.0, 0) ? 1.0 : ratio;
  101.     }

  102.     /**
  103.      * Adds the value to the sum.
  104.      *
  105.      * @param sum Sum.
  106.      * @param v Value.
  107.      * @return the new sum
  108.      */
  109.     private static DD add(DD sum, long v) {
  110.         // The condition here is a high probability branch if the sample is
  111.         // frequency counts which are typically in the 32-bit integer range,
  112.         // i.e. all the upper bits are zero.
  113.         return (v >>> Integer.SIZE) == 0 ?
  114.             sum.add(v) :
  115.             sum.add(DD.of(v));
  116.     }

  117.     // Specialised statistic methods not directly supported by o.a.c.statistics.descriptive

  118.     /**
  119.      * Returns the arithmetic mean of the entries in the input arrays,
  120.      * or {@code NaN} if the combined length of the arrays is zero.
  121.      *
  122.      * <p>Supports a combined length above the maximum array size.
  123.      *
  124.      * <p>A two-pass, corrected algorithm is used, starting with the definitional formula
  125.      * computed using the array of stored values and then correcting this by adding the
  126.      * mean deviation of the data values from the arithmetic mean. See, e.g. "Comparison
  127.      * of Several Algorithms for Computing Sample Means and Variances," Robert F. Ling,
  128.      * Journal of the American Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp.
  129.      * 859-866.
  130.      *
  131.      * @param samples Values.
  132.      * @return the mean of the values or NaN if length = 0
  133.      */
  134.     static double mean(Collection<double[]> samples) {
  135.         final double mean = samples.stream()
  136.             .map(Mean::of)
  137.             .reduce(Mean::combine)
  138.             .orElseGet(Mean::create)
  139.             .getAsDouble();
  140.         // Second-pass correction.
  141.         // Note: The correction may not be finite in the event of extreme values.
  142.         // In this case the calling method computation will fail when the mean
  143.         // is used and we do not check for overflow here.
  144.         final long n = samples.stream().mapToInt(x -> x.length).sum();
  145.         return mean + samples.stream()
  146.             .flatMapToDouble(Arrays::stream).map(v -> v - mean).sum() / n;
  147.     }

  148.     /**
  149.      * Returns the mean of the (signed) differences between corresponding elements of the
  150.      * input arrays.
  151.      *
  152.      * <pre>
  153.      * sum(x[i] - y[i]) / x.length
  154.      * </pre>
  155.      *
  156.      * <p>This method avoids intermediate array allocation.
  157.      *
  158.      * @param x First array.
  159.      * @param y Second array.
  160.      * @return mean of paired differences
  161.      * @throws IllegalArgumentException if the arrays do not have the same length.
  162.      */
  163.     static double meanDifference(double[] x, double[] y) {
  164.         final int n = x.length;
  165.         if (n != y.length) {
  166.             throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
  167.         }
  168.         // STATISTICS-84: Use a single-pass extended precision sum.
  169.         final Sum sum = Sum.create();
  170.         for (int i = 0; i < n; i++) {
  171.             sum.add(x[i] - y[i]);
  172.         }
  173.         return sum.getAsDouble() / n;
  174.     }

  175.     /**
  176.      * Returns the variance of the (signed) differences between corresponding elements of
  177.      * the input arrays, or {@code NaN} if the arrays are empty.
  178.      *
  179.      * <pre>
  180.      * var(x[i] - y[i])
  181.      * </pre>
  182.      *
  183.      * <p>Returns the bias-corrected sample variance (using {@code n - 1} in the denominator).
  184.      * Returns 0 for a single-value (i.e. length = 1) sample.
  185.      *
  186.      * <p>This method avoids intermediate array allocation.
  187.      *
  188.      * <p>Uses a two-pass algorithm. Specifically, these methods use the "corrected
  189.      * two-pass algorithm" from Chan, Golub, Levesque, <i>Algorithms for Computing the
  190.      * Sample Variance</i>, American Statistician, vol. 37, no. 3 (1983) pp.
  191.      * 242-247.
  192.      *
  193.      * @param x First array.
  194.      * @param y Second array.
  195.      * @param mean the mean difference between corresponding entries
  196.      * @return variance of paired differences
  197.      * @throws IllegalArgumentException if the arrays do not have the same length.
  198.      * @see #meanDifference(double[], double[])
  199.      */
  200.     static double varianceDifference(double[] x, double[] y, double mean) {
  201.         final int n = x.length;
  202.         if (n != y.length) {
  203.             throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
  204.         }
  205.         if (n == 1) {
  206.             return 0;
  207.         }
  208.         // As per o.a.c.statistics.descriptive.Variance
  209.         double s = 0;
  210.         double ss = 0;
  211.         for (int i = 0; i < n; i++) {
  212.             final double dx = (x[i] - y[i]) - mean;
  213.             s += dx;
  214.             ss += dx * dx;
  215.         }
  216.         // sum-of-squared deviations = sum(x^2) - sum(x)^2 / n
  217.         // Divide by (n-1) for sample variance
  218.         return (ss - (s * s / n)) / (n - 1);
  219.     }
  220. }