Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.statistics.inference;
018
019import java.lang.ref.SoftReference;
020import java.util.Arrays;
021import java.util.EnumSet;
022import java.util.Objects;
023import java.util.concurrent.locks.ReentrantLock;
024import java.util.stream.IntStream;
025import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
026import org.apache.commons.statistics.distribution.NormalDistribution;
027import org.apache.commons.statistics.ranking.NaNStrategy;
028import org.apache.commons.statistics.ranking.NaturalRanking;
029import org.apache.commons.statistics.ranking.RankingAlgorithm;
030import org.apache.commons.statistics.ranking.TiesStrategy;
031
032/**
033 * Implements the Mann-Whitney U test (also called Wilcoxon rank-sum test).
034 *
035 * @see <a href="https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test">
036 * Mann-Whitney U test (Wikipedia)</a>
037 * @since 1.1
038 */
039public final class MannWhitneyUTest {
040    /** Limit on sample size for the exact p-value computation for the auto mode. */
041    private static final int AUTO_LIMIT = 50;
042    /** Ranking instance. */
043    private static final RankingAlgorithm RANKING = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE);
044    /** Value for an unset f computation. */
045    private static final double UNSET = -1;
046    /** An object to use for synchonization when accessing the cache of F. */
047    private static final ReentrantLock LOCK = new ReentrantLock();
048    /** A reference to a previously computed storage for f.
049     * Use of a SoftReference ensures this is garbage collected before an OutOfMemoryError.
050     * The value should only be accessed, checked for size and optionally
051     * modified when holding the lock. When the storage is determined to be the correct
052     * size it can be returned for read/write to the array when not holding the lock. */
053    private static SoftReference<double[][][]> cacheF = new SoftReference<>(null);
054    /** Default instance. */
055    private static final MannWhitneyUTest DEFAULT = new MannWhitneyUTest(
056        AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, true, 0);
057
058    /** Alternative hypothesis. */
059    private final AlternativeHypothesis alternative;
060    /** Method to compute the p-value. */
061    private final PValueMethod pValueMethod;
062    /** Perform continuity correction. */
063    private final boolean continuityCorrection;
064    /** Expected location shift. */
065    private final double mu;
066
067    /**
068     * Result for the Mann-Whitney U test.
069     *
070     * <p>This class is immutable.
071     *
072     * @since 1.1
073     */
074    public static final class Result extends BaseSignificanceResult {
075        /** Flag indicating the data has tied values. */
076        private final boolean tiedValues;
077
078        /**
079         * Create an instance.
080         *
081         * @param statistic Test statistic.
082         * @param tiedValues Flag indicating the data has tied values.
083         * @param p Result p-value.
084         */
085        Result(double statistic, boolean tiedValues, double p) {
086            super(statistic, p);
087            this.tiedValues = tiedValues;
088        }
089
090        /**
091         * {@inheritDoc}
092         *
093         * <p>This is the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic using
094         * the original sample lengths {@code n} and {@code m} using:
095         * <pre>
096         * u2 = (long) n * m - u1;
097         * </pre>
098         */
099        @Override
100        public double getStatistic() {
101            // Note: This method is here for documentation
102            return super.getStatistic();
103        }
104
105        /**
106         * Return {@code true} if the data had tied values.
107         *
108         * <p>Note: The exact computation cannot be used when there are tied values.
109         *
110         * @return {@code true} if there were tied values
111         */
112        public boolean hasTiedValues() {
113            return tiedValues;
114        }
115    }
116
117    /**
118     * @param alternative Alternative hypothesis.
119     * @param method P-value method.
120     * @param continuityCorrection true to perform continuity correction.
121     * @param mu Expected location shift.
122     */
123    private MannWhitneyUTest(AlternativeHypothesis alternative, PValueMethod method,
124        boolean continuityCorrection, double mu) {
125        this.alternative = alternative;
126        this.pValueMethod = method;
127        this.continuityCorrection = continuityCorrection;
128        this.mu = mu;
129    }
130
131    /**
132     * Return an instance using the default options.
133     *
134     * <ul>
135     * <li>{@link AlternativeHypothesis#TWO_SIDED}
136     * <li>{@link PValueMethod#AUTO}
137     * <li>{@link ContinuityCorrection#ENABLED}
138     * <li>{@linkplain #withMu(double) mu = 0}
139     * </ul>
140     *
141     * @return default instance
142     */
143    public static MannWhitneyUTest withDefaults() {
144        return DEFAULT;
145    }
146
147    /**
148     * Return an instance with the configured alternative hypothesis.
149     *
150     * @param v Value.
151     * @return an instance
152     */
153    public MannWhitneyUTest with(AlternativeHypothesis v) {
154        return new MannWhitneyUTest(Objects.requireNonNull(v), pValueMethod, continuityCorrection, mu);
155    }
156
157    /**
158     * Return an instance with the configured p-value method.
159     *
160     * @param v Value.
161     * @return an instance
162     * @throws IllegalArgumentException if the value is not in the allowed options or is null
163     */
164    public MannWhitneyUTest with(PValueMethod v) {
165        return new MannWhitneyUTest(alternative,
166            Arguments.checkOption(v, EnumSet.of(PValueMethod.AUTO, PValueMethod.EXACT, PValueMethod.ASYMPTOTIC)),
167            continuityCorrection, mu);
168    }
169
170    /**
171     * Return an instance with the configured continuity correction.
172     *
173     * <p>If {@link ContinuityCorrection#ENABLED ENABLED}, adjust the U rank statistic by
174     * 0.5 towards the mean value when computing the z-statistic if a normal approximation is used
175     * to compute the p-value.
176     *
177     * @param v Value.
178     * @return an instance
179     */
180    public MannWhitneyUTest with(ContinuityCorrection v) {
181        return new MannWhitneyUTest(alternative, pValueMethod,
182            Objects.requireNonNull(v) == ContinuityCorrection.ENABLED, mu);
183    }
184
185    /**
186     * Return an instance with the configured location shift {@code mu}.
187     *
188     * @param v Value.
189     * @return an instance
190     * @throws IllegalArgumentException if the value is not finite
191     */
192    public MannWhitneyUTest withMu(double v) {
193        return new MannWhitneyUTest(alternative, pValueMethod, continuityCorrection, Arguments.checkFinite(v));
194    }
195
196    /**
197     * Computes the Mann-Whitney U statistic comparing two independent
198     * samples possibly of different length.
199     *
200     * <p>This statistic can be used to perform a Mann-Whitney U test evaluating the
201     * null hypothesis that the two independent samples differ by a location shift of {@code mu}.
202     *
203     * <p>This returns the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic using:
204     * <pre>
205     * u2 = (long) x.length * y.length - u1;
206     * </pre>
207     *
208     * @param x First sample values.
209     * @param y Second sample values.
210     * @return Mann-Whitney U<sub>1</sub> statistic
211     * @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
212     * NaN values.
213     * @see #withMu(double)
214     */
215    public double statistic(double[] x, double[] y) {
216        checkSamples(x, y);
217
218        final double[] z = concatenateSamples(mu, x, y);
219        final double[] ranks = RANKING.apply(z);
220
221        // The ranks for x is in the first x.length entries in ranks because x
222        // is in the first x.length entries in z
223        final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
224
225        // U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
226        // e.g. x, n1 is the number of observations in sample 1.
227        return sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
228    }
229
230    /**
231     * Performs a Mann-Whitney U test comparing the location for two independent
232     * samples. The location is specified using {@link #withMu(double) mu}.
233     *
234     * <p>The test is defined by the {@link AlternativeHypothesis}.
235     * <ul>
236     * <li>'two-sided': the distribution underlying {@code (x - mu)} is not equal to the
237     * distribution underlying {@code y}.
238     * <li>'greater': the distribution underlying {@code (x - mu)} is stochastically greater than
239     * the distribution underlying {@code y}.
240     * <li>'less': the distribution underlying {@code (x - mu)} is stochastically less than
241     * the distribution underlying {@code y}.
242     * </ul>
243     *
244     * <p>If the p-value method is {@linkplain PValueMethod#AUTO auto} an exact p-value is
245     * computed if the samples contain less than 50 values; otherwise a normal
246     * approximation is used.
247     *
248     * <p>Computation of the exact p-value is only valid if there are no tied
249     * ranks in the data; otherwise the p-value resorts to the asymptotic
250     * approximation using a tie correction and an optional continuity correction.
251     *
252     * <p><strong>Note: </strong>
253     * Exact computation requires tabulation of values not exceeding size
254     * {@code (n+1)*(m+1)*(u+1)} where {@code u} is the minimum of the U<sub>1</sub> and
255     * U<sub>2</sub> statistics and {@code n} and {@code m} are the sample sizes.
256     * This may use a very large amount of memory and result in an {@link OutOfMemoryError}.
257     * Exact computation requires a finite binomial coefficient {@code binom(n+m, m)}
258     * which is limited to {@code n+m <= 1029} for any {@code n} and {@code m},
259     * or {@code min(n, m) <= 37} for any {@code max(n, m)}.
260     * An {@link OutOfMemoryError} is not expected using the
261     * limits configured for the {@linkplain PValueMethod#AUTO auto} p-value computation
262     * as the maximum required memory is approximately 23 MiB.
263     *
264     * @param x First sample values.
265     * @param y Second sample values.
266     * @return test result
267     * @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
268     * NaN values.
269     * @throws OutOfMemoryError if the exact computation is <em>user-requested</em> for
270     * large samples and there is not enough memory.
271     * @see #statistic(double[], double[])
272     * @see #withMu(double)
273     * @see #with(AlternativeHypothesis)
274     * @see #with(ContinuityCorrection)
275     */
276    public Result test(double[] x, double[] y) {
277        // Computation as above. The ranks are required for tie correction.
278        checkSamples(x, y);
279        final double[] z = concatenateSamples(mu, x, y);
280        final double[] ranks = RANKING.apply(z);
281        final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
282        final double u1 = sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
283
284        final double c = WilcoxonSignedRankTest.calculateTieCorrection(ranks);
285        final boolean tiedValues = c != 0;
286
287        PValueMethod method = pValueMethod;
288        final int n = x.length;
289        final int m = y.length;
290        if (method == PValueMethod.AUTO && Math.max(n, m) < AUTO_LIMIT) {
291            method = PValueMethod.EXACT;
292        }
293        // Exact p requires no ties.
294        // The method will fail-fast if the computation is not possible due
295        // to the size of the data.
296        double p = method == PValueMethod.EXACT && !tiedValues ?
297            calculateExactPValue(u1, n, m, alternative) : -1;
298        if (p < 0) {
299            p = calculateAsymptoticPValue(u1, n, m, c);
300        }
301        return new Result(u1, tiedValues, p);
302    }
303
304    /**
305     * Ensures that the provided arrays fulfil the assumptions.
306     *
307     * @param x First sample values.
308     * @param y Second sample values.
309     * @throws IllegalArgumentException if {@code x} or {@code y} are zero-length.
310     */
311    private static void checkSamples(double[] x, double[] y) {
312        Arguments.checkValuesRequiredSize(x.length, 1);
313        Arguments.checkValuesRequiredSize(y.length, 1);
314    }
315
316    /**
317     * Concatenate the samples into one array. Subtract {@code mu} from the first sample.
318     *
319     * @param mu Expected difference between means.
320     * @param x First sample values.
321     * @param y Second sample values.
322     * @return concatenated array
323     */
324    private static double[] concatenateSamples(double mu, double[] x, double[] y) {
325        final double[] z = new double[x.length + y.length];
326        System.arraycopy(x, 0, z, 0, x.length);
327        System.arraycopy(y, 0, z, x.length, y.length);
328        if (mu != 0) {
329            for (int i = 0; i < x.length; i++) {
330                z[i] -= mu;
331            }
332        }
333        return z;
334    }
335
336    /**
337     * Calculate the asymptotic p-value using a Normal approximation.
338     *
339     * @param u Mann-Whitney U value.
340     * @param n1 Number of subjects in first sample.
341     * @param n2 Number of subjects in second sample.
342     * @param c Tie-correction
343     * @return two-sided asymptotic p-value
344     */
345    private double calculateAsymptoticPValue(double u, int n1, int n2, double c) {
346        // Use long to avoid overflow
347        final long n1n2 = (long) n1 * n2;
348        final long n = (long) n1 + n2;
349
350        // https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Normal_approximation_and_tie_correction
351        final double e = n1n2 * 0.5;
352        final double variance = (n1n2 / 12.0) * ((n + 1.0) - c / n / (n - 1));
353
354        double z = u - e;
355        if (continuityCorrection) {
356            // +/- 0.5 is a continuity correction towards the expected.
357            if (alternative == AlternativeHypothesis.GREATER_THAN) {
358                z -= 0.5;
359            } else if (alternative == AlternativeHypothesis.LESS_THAN) {
360                z += 0.5;
361            } else {
362                // two-sided. Shift towards the expected of zero.
363                // Use of signum ignores x==0 (i.e. not copySign(0.5, z))
364                z -= Math.signum(z) * 0.5;
365            }
366        }
367        z /= Math.sqrt(variance);
368
369        final NormalDistribution standardNormal = NormalDistribution.of(0, 1);
370        if (alternative == AlternativeHypothesis.GREATER_THAN) {
371            return standardNormal.survivalProbability(z);
372        }
373        if (alternative == AlternativeHypothesis.LESS_THAN) {
374            return standardNormal.cumulativeProbability(z);
375        }
376        // two-sided
377        return 2 * standardNormal.survivalProbability(Math.abs(z));
378    }
379
380    /**
381     * Calculate the exact p-value. If the value cannot be computed this returns -1.
382     *
383     * <p>Note: Computation may run out of memory during array allocation, or method
384     * recursion.
385     *
386     * @param u Mann-Whitney U value.
387     * @param m Number of subjects in first sample.
388     * @param n Number of subjects in second sample.
389     * @param alternative Alternative hypothesis.
390     * @return exact p-value (or -1) (two-sided, greater, or less using the options)
391     */
392    // package-private for testing
393    static double calculateExactPValue(double u, int m, int n, AlternativeHypothesis alternative) {
394        // Check the computation can be attempted.
395        // u must be an integer
396        if ((int) u != u) {
397            return -1;
398        }
399        // Note: n+m will not overflow as we concatenated the samples to a single array.
400        final double binom = BinomialCoefficientDouble.value(n + m, m);
401        if (binom == Double.POSITIVE_INFINITY) {
402            return -1;
403        }
404
405        // Use u_min for the CDF.
406        final int u1 = (int) u;
407        final int u2 = (int) ((long) m * n - u1);
408        // Use m < n to support symmetry.
409        final int n1 = Math.min(m, n);
410        final int n2 = Math.max(m, n);
411
412        // Return the correct side:
413        if (alternative == AlternativeHypothesis.GREATER_THAN) {
414            // sf(u1 - 1)
415            return sf(u1 - 1, u2 + 1, n1, n2, binom);
416        }
417        if (alternative == AlternativeHypothesis.LESS_THAN) {
418            // cdf(u1)
419            return cdf(u1, u2, n1, n2, binom);
420        }
421        // two-sided: 2 * sf(max(u1, u2) - 1) or 2 * cdf(min(u1, u2))
422        final double p = 2 * computeCdf(Math.min(u1, u2), n1, n2, binom);
423        // Clip to range: [0, 1]
424        return Math.min(1, p);
425    }
426
427    /**
428     * Compute the cumulative density function of the Mann-Whitney U1 statistic.
429     * The U2 statistic is passed for convenience to exploit symmetry in the distribution.
430     *
431     * @param u1 Mann-Whitney U1 statistic
432     * @param u2 Mann-Whitney U2 statistic
433     * @param m First sample size.
434     * @param n Second sample size.
435     * @param binom binom(n+m, m) (must be finite)
436     * @return {@code Pr(X <= k)}
437     */
438    private static double cdf(int u1, int u2, int m, int n, double binom) {
439        // Exploit symmetry. Note the distribution is discrete thus requiring (u2 - 1).
440        return u2 > u1 ?
441            computeCdf(u1, m, n, binom) :
442            1 - computeCdf(u2 - 1, m, n, binom);
443    }
444
445    /**
446     * Compute the survival function of the Mann-Whitney U1 statistic.
447     * The U2 statistic is passed for convenience to exploit symmetry in the distribution.
448     *
449     * @param u1 Mann-Whitney U1 statistic
450     * @param u2 Mann-Whitney U2 statistic
451     * @param m First sample size.
452     * @param n Second sample size.
453     * @param binom binom(n+m, m) (must be finite)
454     * @return {@code Pr(X > k)}
455     */
456    private static double sf(int u1, int u2, int m, int n, double binom) {
457        // Opposite of the CDF
458        return u2 > u1 ?
459            1 - computeCdf(u1, m, n, binom) :
460            computeCdf(u2 - 1, m, n, binom);
461    }
462
463    /**
464     * Compute the cumulative density function of the Mann-Whitney U statistic.
465     *
466     * <p>This should be called with the lower of U1 or U2 for computational efficiency.
467     *
468     * <p>Uses the recursive formula provided in Bucchianico, A.D, (1999)
469     * Combinatorics, computer algebra and the Wilcoxon-Mann-Whitney test, Journal
470     * of Statistical Planning and Inference, Volume 79, Issue 2, 349-364.
471     *
472     * @param k Mann-Whitney U statistic
473     * @param m First sample size.
474     * @param n Second sample size.
475     * @param binom binom(n+m, m) (must be finite)
476     * @return {@code Pr(X <= k)}
477     */
478    private static double computeCdf(int k, int m, int n, double binom) {
479        // Theorem 2.5:
480        // f(m, n, k) = 0 if k < 0, m < 0, n < 0, k > nm
481        if (k < 0) {
482            return 0;
483        }
484        // Recursively compute f(m, n, k)
485        final double[][][] f = getF(m, n, k);
486
487        // P(X=k) = f(m, n, k) / binom(m+n, m)
488        // P(X<=k) = sum_0^k (P(X=i))
489
490        // Called with k = min(u1, u2) : max(p) ~ 0.5 so no need to clip to [0, 1]
491        return IntStream.rangeClosed(0, k).mapToDouble(i -> fmnk(f, m, n, i)).sum() / binom;
492    }
493
494    /**
495     * Gets the storage for f(m, n, k).
496     *
497     * <p>This may be cached for performance.
498     *
499     * @param m M.
500     * @param n N.
501     * @param k K.
502     * @return the storage for f
503     */
504    private static double[][][] getF(int m, int n, int k) {
505        // Obtain any previous computation of f and expand it if required.
506        // F is only modified within this synchronized block.
507        // Any concurrent threads using a reference returned by this method
508        // will not receive an index out-of-bounds as f is only ever expanded.
509        try {
510            LOCK.lock();
511            // Note: f(x<m, y<n, z<k) is always the same.
512            // Cache the array and re-use any previous computation.
513            double[][][] f = cacheF.get();
514
515            // Require:
516            // f = new double[m + 1][n + 1][k + 1]
517            // f(m, n, 0) == 1; otherwise -1 if not computed
518            // m+n <= 1029 for any m,n; k < mn/2 (due to symmetry using min(u1, u2))
519            // Size m=n=515: approximately 516^2 * 515^2/2 = 398868 doubles ~ 3.04 GiB
520            if (f == null) {
521                f = new double[m + 1][n + 1][k + 1];
522                for (final double[][] a : f) {
523                    for (final double[] b : a) {
524                        initialize(b);
525                    }
526                }
527                // Cache for reuse.
528                cacheF = new SoftReference<>(f);
529                return f;
530            }
531
532            // Grow if required: m1 < m+1 => m1-(m+1) < 0 => m1 - m < 1
533            final int m1 = f.length;
534            final int n1 = f[0].length;
535            final int k1 = f[0][0].length;
536            final boolean growM = m1 - m < 1;
537            final boolean growN = n1 - n < 1;
538            final boolean growK = k1 - k < 1;
539            if (growM | growN | growK) {
540                // Some part of the previous f is too small.
541                // Atomically grow without destroying the previous computation.
542                // Any other thread using the previous f will not go out of bounds
543                // by keeping the new f dimensions at least as large.
544                // Note: Doing this in-place allows the memory to be gradually
545                // increased rather than allocating a new [m + 1][n + 1][k + 1]
546                // and copying all old values.
547                final int sn = Math.max(n1, n + 1);
548                final int sk = Math.max(k1, k + 1);
549                if (growM) {
550                    // Entirely new region
551                    f = Arrays.copyOf(f, m + 1);
552                    for (int x = m1; x <= m; x++) {
553                        f[x] = new double[sn][sk];
554                        for (final double[] b : f[x]) {
555                            initialize(b);
556                        }
557                    }
558                }
559                // Expand previous in place if required
560                if (growN) {
561                    for (int x = 0; x < m1; x++) {
562                        f[x] = Arrays.copyOf(f[x], sn);
563                        for (int y = n1; y < sn; y++) {
564                            final double[] b = f[x][y] = new double[sk];
565                            initialize(b);
566                        }
567                    }
568                }
569                if (growK) {
570                    for (int x = 0; x < m1; x++) {
571                        for (int y = 0; y < n1; y++) {
572                            final double[] b = f[x][y] = Arrays.copyOf(f[x][y], sk);
573                            for (int z = k1; z < sk; z++) {
574                                b[z] = UNSET;
575                            }
576                        }
577                    }
578                }
579                // Avoided an OutOfMemoryError. Cache for reuse.
580                cacheF = new SoftReference<>(f);
581            }
582            return f;
583        } finally {
584            LOCK.unlock();
585        }
586    }
587
588    /**
589     * Initialize the array for f(m, n, x).
590     * Set value to 1 for x=0; otherwise {@link #UNSET}.
591     *
592     * @param fmn Array.
593     */
594    private static void initialize(double[] fmn) {
595        Arrays.fill(fmn, UNSET);
596        // f(m, n, 0) == 1 if m >= 0, n >= 0
597        fmn[0] = 1;
598    }
599
600    /**
601     * Compute f(m; n; k), the number of subsets of {0; 1; ...; n} with m elements such
602     * that the elements of this subset add up to k.
603     *
604     * <p>The function is computed recursively.
605     *
606     * @param f Tabulated values of f[m][n][k].
607     * @param m M
608     * @param n N
609     * @param k K
610     * @return f(m; n; k)
611     */
612    private static double fmnk(double[][][] f, int m, int n, int k) {
613        // Theorem 2.5:
614        // Omit conditions that will not be met: k > mn
615        // f(m, n, k) = 0 if k < 0, m < 0, n < 0
616        if ((k | m | n) < 0) {
617            return 0;
618        }
619        // Compute on demand
620        double fmnk = f[m][n][k];
621        if (fmnk < 0) {
622            // f(m, n, 0) == 1 if m >= 0, n >= 0
623            // This is already computed.
624
625            // Recursion from formula (3):
626            // f(m, n, k) = f(m-1, n, k-n) + f(m, n-1, k)
627            f[m][n][k] = fmnk = fmnk(f, m - 1, n, k - n) + fmnk(f, m, n - 1, k);
628        }
629        return fmnk;
630    }
631}