MannWhitneyUTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.math4.legacy.stat.inference;

  18. import org.apache.commons.statistics.distribution.NormalDistribution;
  19. import org.apache.commons.math4.legacy.exception.ConvergenceException;
  20. import org.apache.commons.math4.legacy.exception.MaxCountExceededException;
  21. import org.apache.commons.math4.legacy.exception.NoDataException;
  22. import org.apache.commons.math4.legacy.exception.NullArgumentException;
  23. import org.apache.commons.math4.legacy.stat.ranking.NaNStrategy;
  24. import org.apache.commons.math4.legacy.stat.ranking.NaturalRanking;
  25. import org.apache.commons.math4.legacy.stat.ranking.TiesStrategy;
  26. import org.apache.commons.math4.core.jdkmath.JdkMath;

  27. import java.util.stream.IntStream;

  28. /**
  29.  * An implementation of the Mann-Whitney U test (also called Wilcoxon rank-sum test).
  30.  *
  31.  */
  32. public class MannWhitneyUTest {

  33.     /** Ranking algorithm. */
  34.     private NaturalRanking naturalRanking;

  35.     /**
  36.      * Create a test instance using where NaN's are left in place and ties get
  37.      * the average of applicable ranks. Use this unless you are very sure of
  38.      * what you are doing.
  39.      */
  40.     public MannWhitneyUTest() {
  41.         naturalRanking = new NaturalRanking(NaNStrategy.FIXED,
  42.                 TiesStrategy.AVERAGE);
  43.     }

  44.     /**
  45.      * Create a test instance using the given strategies for NaN's and ties.
  46.      * Only use this if you are sure of what you are doing.
  47.      *
  48.      * @param nanStrategy
  49.      *            specifies the strategy that should be used for Double.NaN's
  50.      * @param tiesStrategy
  51.      *            specifies the strategy that should be used for ties
  52.      */
  53.     public MannWhitneyUTest(final NaNStrategy nanStrategy,
  54.                             final TiesStrategy tiesStrategy) {
  55.         naturalRanking = new NaturalRanking(nanStrategy, tiesStrategy);
  56.     }

  57.     /**
  58.      * Ensures that the provided arrays fulfills the assumptions.
  59.      *
  60.      * @param x first sample
  61.      * @param y second sample
  62.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  63.      * @throws NoDataException if {@code x} or {@code y} are zero-length.
  64.      */
  65.     private void ensureDataConformance(final double[] x, final double[] y)
  66.         throws NullArgumentException, NoDataException {

  67.         if (x == null ||
  68.             y == null) {
  69.             throw new NullArgumentException();
  70.         }
  71.         if (x.length == 0 ||
  72.             y.length == 0) {
  73.             throw new NoDataException();
  74.         }
  75.     }

  76.     /** Concatenate the samples into one array.
  77.      * @param x first sample
  78.      * @param y second sample
  79.      * @return concatenated array
  80.      */
  81.     private double[] concatenateSamples(final double[] x, final double[] y) {
  82.         final double[] z = new double[x.length + y.length];

  83.         System.arraycopy(x, 0, z, 0, x.length);
  84.         System.arraycopy(y, 0, z, x.length, y.length);

  85.         return z;
  86.     }

  87.     /**
  88.      * Computes the <a
  89.      * href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> Mann-Whitney
  90.      * U statistic</a> comparing mean for two independent samples possibly of
  91.      * different length.
  92.      * <p>
  93.      * This statistic can be used to perform a Mann-Whitney U test evaluating
  94.      * the null hypothesis that the two independent samples has equal mean.
  95.      * </p>
  96.      * <p>
  97.      * Let X<sub>i</sub> denote the i'th individual of the first sample and
  98.      * Y<sub>j</sub> the j'th individual in the second sample. Note that the
  99.      * samples would often have different length.
  100.      * </p>
  101.      * <p>
  102.      * <strong>Preconditions</strong>:
  103.      * <ul>
  104.      * <li>All observations in the two samples are independent.</li>
  105.      * <li>The observations are at least ordinal (continuous are also ordinal).</li>
  106.      * </ul>
  107.      *
  108.      * @param x the first sample
  109.      * @param y the second sample
  110.      * @return Mann-Whitney U statistic (minimum of U<sup>x</sup> and U<sup>y</sup>)
  111.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  112.      * @throws NoDataException if {@code x} or {@code y} are zero-length.
  113.      */
  114.     public double mannWhitneyU(final double[] x, final double[] y)
  115.         throws NullArgumentException, NoDataException {

  116.         ensureDataConformance(x, y);

  117.         final double[] z = concatenateSamples(x, y);
  118.         final double[] ranks = naturalRanking.rank(z);

  119.         double sumRankX = 0;

  120.         /*
  121.          * The ranks for x is in the first x.length entries in ranks because x
  122.          * is in the first x.length entries in z
  123.          */
  124.         sumRankX = IntStream.range(0, x.length).mapToDouble(i -> ranks[i]).sum();

  125.         /*
  126.          * U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
  127.          * e.g. x, n1 is the number of observations in sample 1.
  128.          */
  129.         final double u1 = sumRankX - ((long) x.length * (x.length + 1)) / 2;

  130.         /*
  131.          * It can be shown that U1 + U2 = n1 * n2
  132.          */
  133.         final double u2 = (long) x.length * y.length - u1;

  134.         return JdkMath.min(u1, u2);
  135.     }

  136.     /**
  137.      * @param umin smallest Mann-Whitney U value
  138.      * @param n1 number of subjects in first sample
  139.      * @param n2 number of subjects in second sample
  140.      * @return two-sided asymptotic p-value
  141.      * @throws ConvergenceException if the p-value can not be computed
  142.      * due to a convergence error
  143.      * @throws MaxCountExceededException if the maximum number of
  144.      * iterations is exceeded
  145.      */
  146.     private double calculateAsymptoticPValue(final double umin,
  147.                                              final int n1,
  148.                                              final int n2)
  149.         throws ConvergenceException, MaxCountExceededException {

  150.         /* long multiplication to avoid overflow (double not used due to efficiency
  151.          * and to avoid precision loss)
  152.          */
  153.         final long n1n2prod = (long) n1 * n2;

  154.         // http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U#Normal_approximation
  155.         final double eU = n1n2prod / 2.0;
  156.         final double varU = n1n2prod * (n1 + n2 + 1) / 12.0;

  157.         final double z = (umin - eU) / JdkMath.sqrt(varU);

  158.         // No try-catch or advertised exception because args are valid
  159.         // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
  160.         final NormalDistribution standardNormal = NormalDistribution.of(0, 1);

  161.         return 2 * standardNormal.cumulativeProbability(z);
  162.     }

  163.     /**
  164.      * Returns the asymptotic <i>observed significance level</i>, or <a href=
  165.      * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
  166.      * p-value</a>, associated with a <a
  167.      * href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> Mann-Whitney
  168.      * U statistic</a> comparing mean for two independent samples.
  169.      * <p>
  170.      * Let X<sub>i</sub> denote the i'th individual of the first sample and
  171.      * Y<sub>j</sub> the j'th individual in the second sample. Note that the
  172.      * samples would often have different length.
  173.      * </p>
  174.      * <p>
  175.      * <strong>Preconditions</strong>:
  176.      * <ul>
  177.      * <li>All observations in the two samples are independent.</li>
  178.      * <li>The observations are at least ordinal (continuous are also ordinal).</li>
  179.      * </ul><p>
  180.      * Ties give rise to biased variance at the moment. See e.g. <a
  181.      * href="http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf"
  182.      * >http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf</a>.</p>
  183.      *
  184.      * @param x the first sample
  185.      * @param y the second sample
  186.      * @return asymptotic p-value
  187.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  188.      * @throws NoDataException if {@code x} or {@code y} are zero-length.
  189.      * @throws ConvergenceException if the p-value can not be computed due to a
  190.      * convergence error
  191.      * @throws MaxCountExceededException if the maximum number of iterations
  192.      * is exceeded
  193.      */
  194.     public double mannWhitneyUTest(final double[] x, final double[] y)
  195.         throws NullArgumentException, NoDataException,
  196.         ConvergenceException, MaxCountExceededException {

  197.         ensureDataConformance(x, y);

  198.         final double uMin = mannWhitneyU(x, y);

  199.         return calculateAsymptoticPValue(uMin, x.length, y.length);
  200.     }
  201. }