PearsonsCorrelation.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.math4.legacy.stat.correlation;

  18. import org.apache.commons.statistics.distribution.TDistribution;
  19. import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
  20. import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
  21. import org.apache.commons.math4.legacy.exception.NullArgumentException;
  22. import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
  23. import org.apache.commons.math4.legacy.linear.BlockRealMatrix;
  24. import org.apache.commons.math4.legacy.linear.RealMatrix;
  25. import org.apache.commons.math4.legacy.stat.regression.SimpleRegression;
  26. import org.apache.commons.math4.core.jdkmath.JdkMath;

  27. /**
  28.  * Computes Pearson's product-moment correlation coefficients for pairs of arrays
  29.  * or columns of a matrix.
  30.  *
  31.  * <p>The constructors that take <code>RealMatrix</code> or
  32.  * <code>double[][]</code> arguments generate correlation matrices.  The
  33.  * columns of the input matrices are assumed to represent variable values.
  34.  * Correlations are given by the formula</p>
  35.  *
  36.  * <p><code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
  37.  * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
  38.  * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p>
  39.  *
  40.  * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
  41.  * to construct an instance with no data and then {@link #correlation(double[], double[])}.
  42.  * Correlation matrices can also be computed directly from an instance with no data using
  43.  * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()},
  44.  * {@link #getCorrelationPValues()},  or {@link #getCorrelationStandardErrors()}; however, one of the
  45.  * constructors supplying data or a covariance matrix must be used to create the instance.</p>
  46.  *
  47.  * @since 2.0
  48.  */
  49. public class PearsonsCorrelation {

  50.     /** correlation matrix. */
  51.     private final RealMatrix correlationMatrix;

  52.     /** number of observations. */
  53.     private final int nObs;

  54.     /**
  55.      * Create a PearsonsCorrelation instance without data.
  56.      */
  57.     public PearsonsCorrelation() {
  58.         super();
  59.         correlationMatrix = null;
  60.         nObs = 0;
  61.     }

  62.     /**
  63.      * Create a PearsonsCorrelation from a rectangular array
  64.      * whose columns represent values of variables to be correlated.
  65.      *
  66.      * Throws MathIllegalArgumentException if the input array does not have at least
  67.      * two columns and two rows.  Pairwise correlations are set to NaN if one
  68.      * of the correlates has zero variance.
  69.      *
  70.      * @param data rectangular array with columns representing variables
  71.      * @throws MathIllegalArgumentException if the input data array is not
  72.      * rectangular with at least two rows and two columns.
  73.      * @see #correlation(double[], double[])
  74.      */
  75.     public PearsonsCorrelation(double[][] data) {
  76.         this(new BlockRealMatrix(data));
  77.     }

  78.     /**
  79.      * Create a PearsonsCorrelation from a RealMatrix whose columns
  80.      * represent variables to be correlated.
  81.      *
  82.      * Throws MathIllegalArgumentException if the matrix does not have at least
  83.      * two columns and two rows.  Pairwise correlations are set to NaN if one
  84.      * of the correlates has zero variance.
  85.      *
  86.      * @param matrix matrix with columns representing variables to correlate
  87.      * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
  88.      * @see #correlation(double[], double[])
  89.      */
  90.     public PearsonsCorrelation(RealMatrix matrix) {
  91.         nObs = matrix.getRowDimension();
  92.         correlationMatrix = computeCorrelationMatrix(matrix);
  93.     }

  94.     /**
  95.      * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
  96.      * matrix is computed by scaling the Covariance's covariance matrix.
  97.      * The Covariance instance must have been created from a data matrix with
  98.      * columns representing variable values.
  99.      *
  100.      * @param covariance Covariance instance
  101.      */
  102.     public PearsonsCorrelation(Covariance covariance) {
  103.         RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
  104.         if (covarianceMatrix == null) {
  105.             throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
  106.         }
  107.         nObs = covariance.getN();
  108.         correlationMatrix = covarianceToCorrelation(covarianceMatrix);
  109.     }

  110.     /**
  111.      * Create a PearsonsCorrelation from a covariance matrix. The correlation
  112.      * matrix is computed by scaling the covariance matrix.
  113.      *
  114.      * @param covarianceMatrix covariance matrix
  115.      * @param numberOfObservations the number of observations in the dataset used to compute
  116.      * the covariance matrix
  117.      */
  118.     public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
  119.         nObs = numberOfObservations;
  120.         correlationMatrix = covarianceToCorrelation(covarianceMatrix);
  121.     }

  122.     /**
  123.      * Returns the correlation matrix.
  124.      *
  125.      * <p>This method will return null if the non-argument constructor was used
  126.      * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
  127.      * has been called before it is activated.</p>
  128.      *
  129.      * @return correlation matrix
  130.      */
  131.     public RealMatrix getCorrelationMatrix() {
  132.         return correlationMatrix;
  133.     }

  134.     /**
  135.      * Returns a matrix of standard errors associated with the estimates
  136.      * in the correlation matrix.<br>
  137.      * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
  138.      * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
  139.      *
  140.      * <p>The formula used to compute the standard error is <br>
  141.      * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
  142.      * where <code>r</code> is the estimated correlation coefficient and
  143.      * <code>n</code> is the number of observations in the source dataset.</p>
  144.      *
  145.      * <p>To use this method, one of the constructors that supply an input
  146.      * matrix must have been used to create this instance.</p>
  147.      *
  148.      * @return matrix of correlation standard errors
  149.      * @throws NullPointerException if this instance was created with no data.
  150.      */
  151.     public RealMatrix getCorrelationStandardErrors() {
  152.         int nVars = correlationMatrix.getColumnDimension();
  153.         double[][] out = new double[nVars][nVars];
  154.         for (int i = 0; i < nVars; i++) {
  155.             for (int j = 0; j < nVars; j++) {
  156.                 double r = correlationMatrix.getEntry(i, j);
  157.                 out[i][j] = JdkMath.sqrt((1 - r * r) /(nObs - 2));
  158.             }
  159.         }
  160.         return new BlockRealMatrix(out);
  161.     }

  162.     /**
  163.      * Returns a matrix of p-values associated with the (two-sided) null
  164.      * hypothesis that the corresponding correlation coefficient is zero.
  165.      *
  166.      * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
  167.      * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
  168.      * a value with absolute value greater than or equal to <br>
  169.      * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
  170.      *
  171.      * <p>The values in the matrix are sometimes referred to as the
  172.      * <i>significance</i> of the corresponding correlation coefficients.</p>
  173.      *
  174.      * <p>To use this method, one of the constructors that supply an input
  175.      * matrix must have been used to create this instance.</p>
  176.      *
  177.      * @return matrix of p-values
  178.      * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
  179.      * if an error occurs estimating probabilities
  180.      * @throws NullPointerException if this instance was created with no data.
  181.      */
  182.     public RealMatrix getCorrelationPValues() {
  183.         TDistribution tDistribution = TDistribution.of(nObs - 2);
  184.         int nVars = correlationMatrix.getColumnDimension();
  185.         double[][] out = new double[nVars][nVars];
  186.         for (int i = 0; i < nVars; i++) {
  187.             for (int j = 0; j < nVars; j++) {
  188.                 if (i == j) {
  189.                     out[i][j] = 0d;
  190.                 } else {
  191.                     double r = correlationMatrix.getEntry(i, j);
  192.                     double t = JdkMath.abs(r * JdkMath.sqrt((nObs - 2)/(1 - r * r)));
  193.                     out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
  194.                 }
  195.             }
  196.         }
  197.         return new BlockRealMatrix(out);
  198.     }


  199.     /**
  200.      * Computes the correlation matrix for the columns of the
  201.      * input matrix, using {@link #correlation(double[], double[])}.
  202.      *
  203.      * Throws MathIllegalArgumentException if the matrix does not have at least
  204.      * two columns and two rows.  Pairwise correlations are set to NaN if one
  205.      * of the correlates has zero variance.
  206.      *
  207.      * @param matrix matrix with columns representing variables to correlate
  208.      * @return correlation matrix
  209.      * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
  210.      * @see #correlation(double[], double[])
  211.      */
  212.     public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
  213.         checkSufficientData(matrix);
  214.         int nVars = matrix.getColumnDimension();
  215.         RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
  216.         for (int i = 0; i < nVars; i++) {
  217.             for (int j = 0; j < i; j++) {
  218.               double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
  219.               outMatrix.setEntry(i, j, corr);
  220.               outMatrix.setEntry(j, i, corr);
  221.             }
  222.             outMatrix.setEntry(i, i, 1d);
  223.         }
  224.         return outMatrix;
  225.     }

  226.     /**
  227.      * Computes the correlation matrix for the columns of the
  228.      * input rectangular array.  The columns of the array represent values
  229.      * of variables to be correlated.
  230.      *
  231.      * Throws MathIllegalArgumentException if the matrix does not have at least
  232.      * two columns and two rows or if the array is not rectangular. Pairwise
  233.      * correlations are set to NaN if one of the correlates has zero variance.
  234.      *
  235.      * @param data matrix with columns representing variables to correlate
  236.      * @return correlation matrix
  237.      * @throws MathIllegalArgumentException if the array does not contain sufficient data
  238.      * @see #correlation(double[], double[])
  239.      */
  240.     public RealMatrix computeCorrelationMatrix(double[][] data) {
  241.        return computeCorrelationMatrix(new BlockRealMatrix(data));
  242.     }

  243.     /**
  244.      * Computes the Pearson's product-moment correlation coefficient between two arrays.
  245.      *
  246.      * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
  247.      * or their common length is less than 2.  Returns {@code NaN} if either of the arrays
  248.      * has zero variance (i.e., if one of the arrays does not contain at least two distinct
  249.      * values).</p>
  250.      *
  251.      * @param xArray first data array
  252.      * @param yArray second data array
  253.      * @return Returns Pearson's correlation coefficient for the two arrays
  254.      * @throws DimensionMismatchException if the arrays lengths do not match
  255.      * @throws MathIllegalArgumentException if there is insufficient data
  256.      */
  257.     public double correlation(final double[] xArray, final double[] yArray) {
  258.         SimpleRegression regression = new SimpleRegression();
  259.         if (xArray.length != yArray.length) {
  260.             throw new DimensionMismatchException(xArray.length, yArray.length);
  261.         } else if (xArray.length < 2) {
  262.             throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION,
  263.                                                    xArray.length, 2);
  264.         } else {
  265.             for(int i=0; i<xArray.length; i++) {
  266.                 regression.addData(xArray[i], yArray[i]);
  267.             }
  268.             return regression.getR();
  269.         }
  270.     }

  271.     /**
  272.      * Derives a correlation matrix from a covariance matrix.
  273.      *
  274.      * <p>Uses the formula <br>
  275.      * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
  276.      * <code>r(&middot;,&middot;)</code> is the correlation coefficient and
  277.      * <code>s(&middot;)</code> means standard deviation.</p>
  278.      *
  279.      * @param covarianceMatrix the covariance matrix
  280.      * @return correlation matrix
  281.      */
  282.     public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
  283.         int nVars = covarianceMatrix.getColumnDimension();
  284.         RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
  285.         for (int i = 0; i < nVars; i++) {
  286.             double sigma = JdkMath.sqrt(covarianceMatrix.getEntry(i, i));
  287.             outMatrix.setEntry(i, i, 1d);
  288.             for (int j = 0; j < i; j++) {
  289.                 double entry = covarianceMatrix.getEntry(i, j) /
  290.                        (sigma * JdkMath.sqrt(covarianceMatrix.getEntry(j, j)));
  291.                 outMatrix.setEntry(i, j, entry);
  292.                 outMatrix.setEntry(j, i, entry);
  293.             }
  294.         }
  295.         return outMatrix;
  296.     }

  297.     /**
  298.      * Throws MathIllegalArgumentException if the matrix does not have at least
  299.      * two columns and two rows.
  300.      *
  301.      * @param matrix matrix to check for sufficiency
  302.      * @throws MathIllegalArgumentException if there is insufficient data
  303.      */
  304.     private void checkSufficientData(final RealMatrix matrix) {
  305.         int nRows = matrix.getRowDimension();
  306.         int nCols = matrix.getColumnDimension();
  307.         if (nRows < 2 || nCols < 2) {
  308.             throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
  309.                                                    nRows, nCols);
  310.         }
  311.     }
  312. }