1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.statistics.inference;
18
19 import java.util.EnumSet;
20 import java.util.Objects;
21 import org.apache.commons.statistics.descriptive.DoubleStatistics;
22 import org.apache.commons.statistics.descriptive.Statistic;
23 import org.apache.commons.statistics.distribution.TDistribution;
24
25 /**
26 * Implements Student's t-test statistics.
27 *
28 * <p>Tests can be:
29 * <ul>
30 * <li>One-sample or two-sample
31 * <li>One-sided or two-sided
32 * <li>Paired or unpaired (for two-sample tests)
33 * <li>Homoscedastic (equal variance assumption) or heteroscedastic (for two sample tests)
34 * </ul>
35 *
36 * <p>Input to tests can be either {@code double[]} arrays or the mean, variance, and size
37 * of the sample.
38 *
39 * @see <a href="https://en.wikipedia.org/wiki/Student%27s_t-test">Student's t-test (Wikipedia)</a>
40 * @since 1.1
41 */
42 public final class TTest {
43 /** Default instance. */
44 private static final TTest DEFAULT = new TTest(AlternativeHypothesis.TWO_SIDED, false, 0);
45
46 /** Alternative hypothesis. */
47 private final AlternativeHypothesis alternative;
48 /** Assume the two samples have the same population variance. */
49 private final boolean equalVariances;
50 /** The true value of the mean (or difference in means for a two sample test). */
51 private final double mu;
52
53 /**
54 * Result for the t-test.
55 *
56 * <p>This class is immutable.
57 */
58 public static final class Result extends BaseSignificanceResult {
59 /** Degrees of freedom. */
60 private final double degreesOfFreedom;
61
62 /**
63 * Create an instance.
64 *
65 * @param statistic Test statistic.
66 * @param degreesOfFreedom Degrees of freedom.
67 * @param p Result p-value.
68 */
69 Result(double statistic, double degreesOfFreedom, double p) {
70 super(statistic, p);
71 this.degreesOfFreedom = degreesOfFreedom;
72 }
73
74 /**
75 * Gets the degrees of freedom.
76 *
77 * @return the degrees of freedom
78 */
79 public double getDegreesOfFreedom() {
80 return degreesOfFreedom;
81 }
82 }
83
84 /**
85 * @param alternative Alternative hypothesis.
86 * @param equalVariances Assume the two samples have the same population variance.
87 * @param mu true value of the mean (or difference in means for a two sample test).
88 */
89 private TTest(AlternativeHypothesis alternative, boolean equalVariances, double mu) {
90 this.alternative = alternative;
91 this.equalVariances = equalVariances;
92 this.mu = mu;
93 }
94
95 /**
96 * Return an instance using the default options.
97 *
98 * <ul>
99 * <li>{@link AlternativeHypothesis#TWO_SIDED}
100 * <li>{@link DataDispersion#HETEROSCEDASTIC}
101 * <li>{@linkplain #withMu(double) mu = 0}
102 * </ul>
103 *
104 * @return default instance
105 */
106 public static TTest withDefaults() {
107 return DEFAULT;
108 }
109
110 /**
111 * Return an instance with the configured alternative hypothesis.
112 *
113 * @param v Value.
114 * @return an instance
115 */
116 public TTest with(AlternativeHypothesis v) {
117 return new TTest(Objects.requireNonNull(v), equalVariances, mu);
118 }
119
120 /**
121 * Return an instance with the configured assumption on the data dispersion.
122 *
123 * <p>Applies to the two-sample independent t-test.
124 * The statistic can compare the means without the assumption of equal
125 * sub-population variances (heteroscedastic); otherwise the means are compared
126 * under the assumption of equal sub-population variances (homoscedastic).
127 *
128 * @param v Value.
129 * @return an instance
130 * @see #test(double[], double[])
131 * @see #test(double, double, long, double, double, long)
132 */
133 public TTest with(DataDispersion v) {
134 return new TTest(alternative, Objects.requireNonNull(v) == DataDispersion.HOMOSCEDASTIC, mu);
135 }
136
137 /**
138 * Return an instance with the configured {@code mu}.
139 *
140 * <p>For the one-sample test this is the expected mean.
141 *
142 * <p>For the two-sample test this is the expected difference between the means.
143 *
144 * @param v Value.
145 * @return an instance
146 * @throws IllegalArgumentException if the value is not finite
147 */
148 public TTest withMu(double v) {
149 return new TTest(alternative, equalVariances, Arguments.checkFinite(v));
150 }
151
152 /**
153 * Computes a one-sample t statistic comparing the mean of the dataset to {@code mu}.
154 *
155 * <p>The returned t-statistic is:
156 *
157 * <p>\[ t = \frac{m - \mu}{ \sqrt{ \frac{v}{n} } } \]
158 *
159 * @param m Sample mean.
160 * @param v Sample variance.
161 * @param n Sample size.
162 * @return t statistic
163 * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
164 * variance is negative
165 * @see #withMu(double)
166 */
167 public double statistic(double m, double v, long n) {
168 Arguments.checkNonNegative(v);
169 checkSampleSize(n);
170 return computeT(m - mu, v, n);
171 }
172
173 /**
174 * Computes a one-sample t statistic comparing the mean of the sample to {@code mu}.
175 *
176 * @param x Sample values.
177 * @return t statistic
178 * @throws IllegalArgumentException if the number of samples is {@code < 2}
179 * @see #statistic(double, double, long)
180 * @see #withMu(double)
181 */
182 public double statistic(double[] x) {
183 final long n = checkSampleSize(x.length);
184 final DoubleStatistics s = DoubleStatistics.of(
185 EnumSet.of(Statistic.MEAN, Statistic.VARIANCE), x);
186 final double m = s.getAsDouble(Statistic.MEAN);
187 final double v = s.getAsDouble(Statistic.VARIANCE);
188 return computeT(m - mu, v, n);
189 }
190
191 /**
192 * Computes a paired two-sample t-statistic on related samples comparing the mean difference
193 * between the samples to {@code mu}.
194 *
195 * <p>The t-statistic returned is functionally equivalent to what would be returned by computing
196 * the one-sample t-statistic {@link #statistic(double[])}, with
197 * the sample array consisting of the (signed) differences between corresponding
198 * entries in {@code x} and {@code y}.
199 *
200 * @param x First sample values.
201 * @param y Second sample values.
202 * @return t statistic
203 * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
204 * the size of the samples is not equal
205 * @see #withMu(double)
206 */
207 public double pairedStatistic(double[] x, double[] y) {
208 final long n = checkSampleSize(x.length);
209 final double m = StatisticUtils.meanDifference(x, y);
210 final double v = StatisticUtils.varianceDifference(x, y, m);
211 return computeT(m - mu, v, n);
212 }
213
214 /**
215 * Computes a two-sample t statistic on independent samples comparing the difference in means
216 * of the datasets to {@code mu}.
217 *
218 * <p>Use the {@link DataDispersion} to control the computation of the variance.
219 *
220 * <p>The heteroscedastic t-statistic is:
221 *
222 * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ \frac{v_1}{n_1} + \frac{v_2}{n_2} } } \]
223 *
224 * <p>The homoscedastic t-statistic is:
225 *
226 * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ v (\frac{1}{n_1} + \frac{1}{n_2}) } } \]
227 *
228 * <p>where \( v \) is the pooled variance estimate:
229 *
230 * <p>\[ v = \frac{(n_1-1)v_1 + (n_2-1)v_2}{n_1 + n_2 - 2} \]
231 *
232 * @param m1 First sample mean.
233 * @param v1 First sample variance.
234 * @param n1 First sample size.
235 * @param m2 Second sample mean.
236 * @param v2 Second sample variance.
237 * @param n2 Second sample size.
238 * @return t statistic
239 * @throws IllegalArgumentException if the number of samples in either dataset is
240 * {@code < 2}; or the variances are negative.
241 * @see #withMu(double)
242 * @see #with(DataDispersion)
243 */
244 public double statistic(double m1, double v1, long n1,
245 double m2, double v2, long n2) {
246 Arguments.checkNonNegative(v1);
247 Arguments.checkNonNegative(v2);
248 checkSampleSize(n1);
249 checkSampleSize(n2);
250 return equalVariances ?
251 computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
252 computeT(mu, m1, v1, n1, m2, v2, n2);
253 }
254
255 /**
256 * Computes a two-sample t statistic on independent samples comparing the difference
257 * in means of the samples to {@code mu}.
258 *
259 * <p>Use the {@link DataDispersion} to control the computation of the variance.
260 *
261 * @param x First sample values.
262 * @param y Second sample values.
263 * @return t statistic
264 * @throws IllegalArgumentException if the number of samples in either dataset is {@code < 2}
265 * @see #withMu(double)
266 * @see #with(DataDispersion)
267 */
268 public double statistic(double[] x, double[] y) {
269 final long n1 = checkSampleSize(x.length);
270 final long n2 = checkSampleSize(y.length);
271 final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
272 final DoubleStatistics s1 = b.build(x);
273 final double m1 = s1.getAsDouble(Statistic.MEAN);
274 final double v1 = s1.getAsDouble(Statistic.VARIANCE);
275 final DoubleStatistics s2 = b.build(y);
276 final double m2 = s2.getAsDouble(Statistic.MEAN);
277 final double v2 = s2.getAsDouble(Statistic.VARIANCE);
278 return equalVariances ?
279 computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
280 computeT(mu, m1, v1, n1, m2, v2, n2);
281 }
282
283 /**
284 * Perform a one-sample t-test comparing the mean of the dataset to {@code mu}.
285 *
286 * <p>Degrees of freedom are \( v = n - 1 \).
287 *
288 * @param m Sample mean.
289 * @param v Sample variance.
290 * @param n Sample size.
291 * @return test result
292 * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
293 * variance is negative
294 * @see #statistic(double, double, long)
295 */
296 public Result test(double m, double v, long n) {
297 final double t = statistic(m, v, n);
298 final double df = n - 1.0;
299 final double p = computeP(t, df);
300 return new Result(t, df, p);
301 }
302
303 /**
304 * Performs a one-sample t-test comparing the mean of the sample to {@code mu}.
305 *
306 * <p>Degrees of freedom are \( v = n - 1 \).
307 *
308 * @param sample Sample values.
309 * @return the test result
310 * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
311 * the size of the samples is not equal
312 * @see #statistic(double[])
313 */
314 public Result test(double[] sample) {
315 final double t = statistic(sample);
316 final double df = sample.length - 1.0;
317 final double p = computeP(t, df);
318 return new Result(t, df, p);
319 }
320
321 /**
322 * Performs a paired two-sample t-test on related samples comparing the mean difference between
323 * the samples to {@code mu}.
324 *
325 * <p>The test is functionally equivalent to what would be returned by computing
326 * the one-sample t-test {@link #test(double[])}, with
327 * the sample array consisting of the (signed) differences between corresponding
328 * entries in {@code x} and {@code y}.
329 *
330 * @param x First sample values.
331 * @param y Second sample values.
332 * @return the test result
333 * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
334 * the size of the samples is not equal
335 * @see #pairedStatistic(double[], double[])
336 */
337 public Result pairedTest(double[] x, double[] y) {
338 final double t = pairedStatistic(x, y);
339 final double df = x.length - 1.0;
340 final double p = computeP(t, df);
341 return new Result(t, df, p);
342 }
343
344 /**
345 * Performs a two-sample t-test on independent samples comparing the difference in means of the
346 * datasets to {@code mu}.
347 *
348 * <p>Use the {@link DataDispersion} to control the computation of the variance.
349 *
350 * <p>The heteroscedastic degrees of freedom are estimated using the
351 * Welch-Satterthwaite approximation:
352 *
353 * <p>\[ v = \frac{ (\frac{v_1}{n_1} + \frac{v_2}{n_2})^2 }
354 * { \frac{(v_1/n_1)^2}{n_1-1} + \frac{(v_2/n_2)^2}{n_2-1} } \]
355 *
356 * <p>The homoscedastic degrees of freedom are \( v = n_1 + n_2 - 2 \).
357 *
358 * @param m1 First sample mean.
359 * @param v1 First sample variance.
360 * @param n1 First sample size.
361 * @param m2 Second sample mean.
362 * @param v2 Second sample variance.
363 * @param n2 Second sample size.
364 * @return test result
365 * @throws IllegalArgumentException if the number of samples in either dataset is
366 * {@code < 2}; or the variances are negative.
367 * @see #statistic(double, double, long, double, double, long)
368 */
369 public Result test(double m1, double v1, long n1,
370 double m2, double v2, long n2) {
371 final double t = statistic(m1, v1, n1, m2, v2, n2);
372 final double df = equalVariances ?
373 -2.0 + n1 + n2 :
374 computeDf(v1, n1, v2, n2);
375 final double p = computeP(t, df);
376 return new Result(t, df, p);
377 }
378
379 /**
380 * Performs a two-sample t-test on independent samples comparing the difference in means of
381 * the samples to {@code mu}.
382 *
383 * <p>Use the {@link DataDispersion} to control the computation of the variance.
384 *
385 * @param x First sample values.
386 * @param y Second sample values.
387 * @return the test result
388 * @throws IllegalArgumentException if the number of samples in either dataset
389 * is {@code < 2}
390 * @see #statistic(double[], double[])
391 * @see #test(double, double, long, double, double, long)
392 */
393 public Result test(double[] x, double[] y) {
394 // Here we do not call statistic(double[], double[]) because the degreesOfFreedom
395 // requires the variance. So repeat the computation and compute p.
396 final long n1 = checkSampleSize(x.length);
397 final long n2 = checkSampleSize(y.length);
398 final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
399 final DoubleStatistics s1 = b.build(x);
400 final double m1 = s1.getAsDouble(Statistic.MEAN);
401 final double v1 = s1.getAsDouble(Statistic.VARIANCE);
402 final DoubleStatistics s2 = b.build(y);
403 final double m2 = s2.getAsDouble(Statistic.MEAN);
404 final double v2 = s2.getAsDouble(Statistic.VARIANCE);
405 final double t;
406 final double df;
407 if (equalVariances) {
408 t = computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2);
409 df = -2.0 + n1 + n2;
410 } else {
411 t = computeT(mu, m1, v1, n1, m2, v2, n2);
412 df = computeDf(v1, n1, v2, n2);
413 }
414 final double p = computeP(t, df);
415 return new Result(t, df, p);
416 }
417
418 /**
419 * Computes t statistic for one-sample t-test.
420 *
421 * @param m Sample mean.
422 * @param v Sample variance.
423 * @param n Sample size.
424 * @return t test statistic
425 */
426 private static double computeT(double m, double v, long n) {
427 return m / Math.sqrt(v / n);
428 }
429
430 /**
431 * Computes t statistic for two-sample t-test without the assumption of equal
432 * samples sizes or sub-population variances.
433 *
434 * @param mu Expected difference between means.
435 * @param m1 First sample mean.
436 * @param v1 First sample variance.
437 * @param n1 First sample size.
438 * @param m2 Second sample mean.
439 * @param v2 Second sample variance.
440 * @param n2 Second sample size.
441 * @return t test statistic
442 */
443 private static double computeT(double mu,
444 double m1, double v1, long n1,
445 double m2, double v2, long n2) {
446 return (m1 - m2 - mu) / Math.sqrt((v1 / n1) + (v2 / n2));
447 }
448
449 /**
450 * Computes approximate degrees of freedom for two-sample t-test without the
451 * assumption of equal samples sizes or sub-population variances.
452 *
453 * @param v1 First sample variance.
454 * @param n1 First sample size.
455 * @param v2 Second sample variance.
456 * @param n2 Second sample size.
457 * @return approximate degrees of freedom
458 */
459 private static double computeDf(double v1, long n1,
460 double v2, long n2) {
461 // Sample sizes are specified as a double to avoid integer overflow
462 final double d1 = n1;
463 final double d2 = n2;
464 return (((v1 / d1) + (v2 / d2)) * ((v1 / d1) + (v2 / d2))) /
465 ((v1 * v1) / (d1 * d1 * (n1 - 1)) + (v2 * v2) / (d2 * d2 * (n2 - 1)));
466 }
467
468 /**
469 * Computes t statistic for two-sample t-test under the hypothesis of equal
470 * sub-population variances.
471 *
472 * @param mu Expected difference between means.
473 * @param m1 First sample mean.
474 * @param v1 First sample variance.
475 * @param n1 First sample size.
476 * @param m2 Second sample mean.
477 * @param v2 Second sample variance.
478 * @param n2 Second sample size.
479 * @return t test statistic
480 */
481 private static double computeHomoscedasticT(double mu,
482 double m1, double v1, long n1,
483 double m2, double v2, long n2) {
484 final double pooledVariance = ((n1 - 1) * v1 + (n2 - 1) * v2) / (-2.0 + n1 + n2);
485 return (m1 - m2 - mu) / Math.sqrt(pooledVariance * (1.0 / n1 + 1.0 / n2));
486 }
487
488 /**
489 * Computes p-value for the specified t statistic.
490 *
491 * @param t T statistic.
492 * @param degreesOfFreedom Degrees of freedom.
493 * @return p-value for t-test
494 */
495 private double computeP(double t, double degreesOfFreedom) {
496 if (alternative == AlternativeHypothesis.LESS_THAN) {
497 return TDistribution.of(degreesOfFreedom).cumulativeProbability(t);
498 }
499 if (alternative == AlternativeHypothesis.GREATER_THAN) {
500 return TDistribution.of(degreesOfFreedom).survivalProbability(t);
501 }
502 // two-sided
503 return 2.0 * TDistribution.of(degreesOfFreedom).survivalProbability(Math.abs(t));
504 }
505
506 /**
507 * Check sample data size.
508 *
509 * @param n Data size.
510 * @return the sample size
511 * @throws IllegalArgumentException if the number of samples {@code < 2}
512 */
513 private static long checkSampleSize(long n) {
514 if (n <= 1) {
515 throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
516 }
517 return n;
518 }
519 }