TTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import java.util.EnumSet;
import java.util.Objects;
import org.apache.commons.statistics.descriptive.DoubleStatistics;
import org.apache.commons.statistics.descriptive.Statistic;
import org.apache.commons.statistics.distribution.TDistribution;

/**
 * Implements Student's t-test statistics.
 *
 * <p>Tests can be:
 * <ul>
 * <li>One-sample or two-sample
 * <li>One-sided or two-sided
 * <li>Paired or unpaired (for two-sample tests)
 * <li>Homoscedastic (equal variance assumption) or heteroscedastic (for two sample tests)
 * </ul>
 *
 * <p>Input to tests can be either {@code double[]} arrays or the mean, variance, and size
 * of the sample.
 *
 * @see <a href="https://en.wikipedia.org/wiki/Student%27s_t-test">Student&#39;s t-test (Wikipedia)</a>
 * @since 1.1
 */
public final class TTest {
    /** Default instance. */
    private static final TTest DEFAULT = new TTest(AlternativeHypothesis.TWO_SIDED, false, 0);

    /** Alternative hypothesis. */
    private final AlternativeHypothesis alternative;
    /** Assume the two samples have the same population variance. */
    private final boolean equalVariances;
    /** The true value of the mean (or difference in means for a two sample test). */
    private final double mu;

    /**
     * Result for the t-test.
     *
     * <p>This class is immutable.
     */
    public static final class Result extends BaseSignificanceResult {
        /** Degrees of freedom. */
        private final double degreesOfFreedom;

        /**
         * Create an instance.
         *
         * @param statistic Test statistic.
         * @param degreesOfFreedom Degrees of freedom.
         * @param p Result p-value.
         */
        Result(double statistic, double degreesOfFreedom, double p) {
            super(statistic, p);
            this.degreesOfFreedom = degreesOfFreedom;
        }

        /**
         * Gets the degrees of freedom.
         *
         * @return the degrees of freedom
         */
        public double getDegreesOfFreedom() {
            return degreesOfFreedom;
        }
    }

    /**
     * @param alternative Alternative hypothesis.
     * @param equalVariances Assume the two samples have the same population variance.
     * @param mu true value of the mean (or difference in means for a two sample test).
     */
    private TTest(AlternativeHypothesis alternative, boolean equalVariances, double mu) {
        this.alternative = alternative;
        this.equalVariances = equalVariances;
        this.mu = mu;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@link AlternativeHypothesis#TWO_SIDED}
     * <li>{@link DataDispersion#HETEROSCEDASTIC}
     * <li>{@linkplain #withMu(double) mu = 0}
     * </ul>
     *
     * @return default instance
     */
    public static TTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured alternative hypothesis.
     *
     * @param v Value.
     * @return an instance
     */
    public TTest with(AlternativeHypothesis v) {
        return new TTest(Objects.requireNonNull(v), equalVariances, mu);
    }

    /**
     * Return an instance with the configured assumption on the data dispersion.
     *
     * <p>Applies to the two-sample independent t-test.
     * The statistic can compare the means without the assumption of equal
     * sub-population variances (heteroscedastic); otherwise the means are compared
     * under the assumption of equal sub-population variances (homoscedastic).
     *
     * @param v Value.
     * @return an instance
     * @see #test(double[], double[])
     * @see #test(double, double, long, double, double, long)
     */
    public TTest with(DataDispersion v) {
        return new TTest(alternative, Objects.requireNonNull(v) == DataDispersion.HOMOSCEDASTIC, mu);
    }

    /**
     * Return an instance with the configured {@code mu}.
     *
     * <p>For the one-sample test this is the expected mean.
     *
     * <p>For the two-sample test this is the expected difference between the means.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is not finite
     */
    public TTest withMu(double v) {
        return new TTest(alternative, equalVariances, Arguments.checkFinite(v));
    }

    /**
     * Computes a one-sample t statistic comparing the mean of the dataset to {@code mu}.
     *
     * <p>The returned t-statistic is:
     *
     * <p>\[ t = \frac{m - \mu}{ \sqrt{ \frac{v}{n} } } \]
     *
     * @param m Sample mean.
     * @param v Sample variance.
     * @param n Sample size.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * variance is negative
     * @see #withMu(double)
     */
    public double statistic(double m, double v, long n) {
        Arguments.checkNonNegative(v);
        checkSampleSize(n);
        return computeT(m - mu, v, n);
    }

    /**
     * Computes a one-sample t statistic comparing the mean of the sample to {@code mu}.
     *
     * @param x Sample values.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples is {@code < 2}
     * @see #statistic(double, double, long)
     * @see #withMu(double)
     */
    public double statistic(double[] x) {
        final long n = checkSampleSize(x.length);
        final DoubleStatistics s = DoubleStatistics.of(
            EnumSet.of(Statistic.MEAN, Statistic.VARIANCE), x);
        final double m = s.getAsDouble(Statistic.MEAN);
        final double v = s.getAsDouble(Statistic.VARIANCE);
        return computeT(m - mu, v, n);
    }

    /**
     * Computes a paired two-sample t-statistic on related samples comparing the mean difference
     * between the samples to {@code mu}.
     *
     * <p>The t-statistic returned is functionally equivalent to what would be returned by computing
     * the one-sample t-statistic {@link #statistic(double[])}, with
     * the sample array consisting of the (signed) differences between corresponding
     * entries in {@code x} and {@code y}.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * the size of the samples is not equal
     * @see #withMu(double)
     */
    public double pairedStatistic(double[] x, double[] y) {
        final long n = checkSampleSize(x.length);
        final double m = StatisticUtils.meanDifference(x, y);
        final double v = StatisticUtils.varianceDifference(x, y, m);
        return computeT(m - mu, v, n);
    }

    /**
     * Computes a two-sample t statistic on independent samples comparing the difference in means
     * of the datasets to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * <p>The heteroscedastic t-statistic is:
     *
     * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ \frac{v_1}{n_1} + \frac{v_2}{n_2} } } \]
     *
     * <p>The homoscedastic t-statistic is:
     *
     * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ v (\frac{1}{n_1} + \frac{1}{n_2}) } } \]
     *
     * <p>where \( v \) is the pooled variance estimate:
     *
     * <p>\[ v = \frac{(n_1-1)v_1 + (n_2-1)v_2}{n_1 + n_2 - 2} \]
     *
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples in either dataset is
     * {@code < 2}; or the variances are negative.
     * @see #withMu(double)
     * @see #with(DataDispersion)
     */
    public double statistic(double m1, double v1, long n1,
                            double m2, double v2, long n2) {
        Arguments.checkNonNegative(v1);
        Arguments.checkNonNegative(v2);
        checkSampleSize(n1);
        checkSampleSize(n2);
        return equalVariances ?
            computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
            computeT(mu, m1, v1, n1, m2, v2, n2);
    }

    /**
     * Computes a two-sample t statistic on independent samples comparing the difference
     * in means of the samples to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples in either dataset is {@code < 2}
     * @see #withMu(double)
     * @see #with(DataDispersion)
     */
    public double statistic(double[] x, double[] y) {
        final long n1 = checkSampleSize(x.length);
        final long n2 = checkSampleSize(y.length);
        final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
        final DoubleStatistics s1 = b.build(x);
        final double m1 = s1.getAsDouble(Statistic.MEAN);
        final double v1 = s1.getAsDouble(Statistic.VARIANCE);
        final DoubleStatistics s2 = b.build(y);
        final double m2 = s2.getAsDouble(Statistic.MEAN);
        final double v2 = s2.getAsDouble(Statistic.VARIANCE);
        return equalVariances ?
            computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
            computeT(mu, m1, v1, n1, m2, v2, n2);
    }

    /**
     * Perform a one-sample t-test comparing the mean of the dataset to {@code mu}.
     *
     * <p>Degrees of freedom are \( v = n - 1 \).
     *
     * @param m Sample mean.
     * @param v Sample variance.
     * @param n Sample size.
     * @return test result
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * variance is negative
     * @see #statistic(double, double, long)
     */
    public Result test(double m, double v, long n) {
        final double t = statistic(m, v, n);
        final double df = n - 1.0;
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a one-sample t-test comparing the mean of the sample to {@code mu}.
     *
     * <p>Degrees of freedom are \( v = n - 1 \).
     *
     * @param sample Sample values.
     * @return the test result
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * the size of the samples is not equal
     * @see #statistic(double[])
     */
    public Result test(double[] sample) {
        final double t = statistic(sample);
        final double df = sample.length - 1.0;
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a paired two-sample t-test on related samples comparing the mean difference between
     * the samples to {@code mu}.
     *
     * <p>The test is functionally equivalent to what would be returned by computing
     * the one-sample t-test {@link #test(double[])}, with
     * the sample array consisting of the (signed) differences between corresponding
     * entries in {@code x} and {@code y}.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return the test result
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * the size of the samples is not equal
     * @see #pairedStatistic(double[], double[])
     */
    public Result pairedTest(double[] x, double[] y) {
        final double t = pairedStatistic(x, y);
        final double df = x.length - 1.0;
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a two-sample t-test on independent samples comparing the difference in means of the
     * datasets to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * <p>The heteroscedastic degrees of freedom are estimated using the
     * Welch-Satterthwaite approximation:
     *
     * <p>\[ v = \frac{ (\frac{v_1}{n_1} + \frac{v_2}{n_2})^2 }
     *                { \frac{(v_1/n_1)^2}{n_1-1} + \frac{(v_2/n_2)^2}{n_2-1} } \]
     *
     * <p>The homoscedastic degrees of freedom are \( v = n_1 + n_2 - 2 \).
     *
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return test result
     * @throws IllegalArgumentException if the number of samples in either dataset is
     * {@code < 2}; or the variances are negative.
     * @see #statistic(double, double, long, double, double, long)
     */
    public Result test(double m1, double v1, long n1,
                       double m2, double v2, long n2) {
        final double t = statistic(m1, v1, n1, m2, v2, n2);
        final double df = equalVariances ?
                -2.0 + n1 + n2 :
                computeDf(v1, n1, v2, n2);
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a two-sample t-test on independent samples comparing the difference in means of
     * the samples to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return the test result
     * @throws IllegalArgumentException if the number of samples in either dataset
     * is {@code < 2}
     * @see #statistic(double[], double[])
     * @see #test(double, double, long, double, double, long)
     */
    public Result test(double[] x, double[] y) {
        // Here we do not call statistic(double[], double[]) because the degreesOfFreedom
        // requires the variance. So repeat the computation and compute p.
        final long n1 = checkSampleSize(x.length);
        final long n2 = checkSampleSize(y.length);
        final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
        final DoubleStatistics s1 = b.build(x);
        final double m1 = s1.getAsDouble(Statistic.MEAN);
        final double v1 = s1.getAsDouble(Statistic.VARIANCE);
        final DoubleStatistics s2 = b.build(y);
        final double m2 = s2.getAsDouble(Statistic.MEAN);
        final double v2 = s2.getAsDouble(Statistic.VARIANCE);
        final double t;
        final double df;
        if (equalVariances) {
            t = computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2);
            df = -2.0 + n1 + n2;
        } else {
            t = computeT(mu, m1, v1, n1, m2, v2, n2);
            df = computeDf(v1, n1, v2, n2);
        }
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Computes t statistic for one-sample t-test.
     *
     * @param m Sample mean.
     * @param v Sample variance.
     * @param n Sample size.
     * @return t test statistic
     */
    private static double computeT(double m, double v, long n) {
        return m / Math.sqrt(v / n);
    }

    /**
     * Computes t statistic for two-sample t-test without the assumption of equal
     * samples sizes or sub-population variances.
     *
     * @param mu Expected difference between means.
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return t test statistic
     */
    private static double computeT(double mu,
                                   double m1, double v1, long n1,
                                   double m2, double v2, long n2)  {
        return (m1 - m2 - mu) / Math.sqrt((v1 / n1) + (v2 / n2));
    }

    /**
     * Computes approximate degrees of freedom for two-sample t-test without the
     * assumption of equal samples sizes or sub-population variances.
     *
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return approximate degrees of freedom
     */
    private static double computeDf(double v1, long n1,
                                    double v2, long n2) {
        // Sample sizes are specified as a double to avoid integer overflow
        final double d1 = n1;
        final double d2 = n2;
        return (((v1 / d1) + (v2 / d2)) * ((v1 / d1) + (v2 / d2))) /
            ((v1 * v1) / (d1 * d1 * (n1 - 1)) + (v2 * v2) / (d2 * d2 * (n2 - 1)));
    }

    /**
     * Computes t statistic for two-sample t-test under the hypothesis of equal
     * sub-population variances.
     *
     * @param mu Expected difference between means.
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return t test statistic
     */
    private static double computeHomoscedasticT(double mu,
                                                double m1, double v1, long n1,
                                                double m2, double v2, long n2)  {
        final double pooledVariance = ((n1 - 1) * v1 + (n2 - 1) * v2) / (-2.0 + n1 + n2);
        return (m1 - m2 - mu) / Math.sqrt(pooledVariance * (1.0 / n1 + 1.0 / n2));
    }

    /**
     * Computes p-value for the specified t statistic.
     *
     * @param t T statistic.
     * @param degreesOfFreedom Degrees of freedom.
     * @return p-value for t-test
     */
    private double computeP(double t, double degreesOfFreedom) {
        if (alternative == AlternativeHypothesis.LESS_THAN) {
            return TDistribution.of(degreesOfFreedom).cumulativeProbability(t);
        }
        if (alternative == AlternativeHypothesis.GREATER_THAN) {
            return TDistribution.of(degreesOfFreedom).survivalProbability(t);
        }
        // two-sided
        return 2.0 * TDistribution.of(degreesOfFreedom).survivalProbability(Math.abs(t));
    }

    /**
     * Check sample data size.
     *
     * @param n Data size.
     * @return the sample size
     * @throws IllegalArgumentException if the number of samples {@code < 2}
     */
    private static long checkSampleSize(long n) {
        if (n <= 1) {
            throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
        }
        return n;
    }
}