001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.math3.stat.correlation; 018 019 import org.apache.commons.math3.distribution.TDistribution; 020 import org.apache.commons.math3.exception.util.LocalizedFormats; 021 import org.apache.commons.math3.exception.MathIllegalArgumentException; 022 import org.apache.commons.math3.exception.NullArgumentException; 023 import org.apache.commons.math3.exception.DimensionMismatchException; 024 import org.apache.commons.math3.linear.RealMatrix; 025 import org.apache.commons.math3.linear.BlockRealMatrix; 026 import org.apache.commons.math3.stat.regression.SimpleRegression; 027 import org.apache.commons.math3.util.FastMath; 028 029 /** 030 * Computes Pearson's product-moment correlation coefficients for pairs of arrays 031 * or columns of a matrix. 032 * 033 * <p>The constructors that take <code>RealMatrix</code> or 034 * <code>double[][]</code> arguments generate correlation matrices. The 035 * columns of the input matrices are assumed to represent variable values. 036 * Correlations are given by the formula</p> 037 * <code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> 038 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> 039 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations. 040 * 041 * @version $Id: PearsonsCorrelation.java 1416643 2012-12-03 19:37:14Z tn $ 042 * @since 2.0 043 */ 044 public class PearsonsCorrelation { 045 046 /** correlation matrix */ 047 private final RealMatrix correlationMatrix; 048 049 /** number of observations */ 050 private final int nObs; 051 052 /** 053 * Create a PearsonsCorrelation instance without data 054 */ 055 public PearsonsCorrelation() { 056 super(); 057 correlationMatrix = null; 058 nObs = 0; 059 } 060 061 /** 062 * Create a PearsonsCorrelation from a rectangular array 063 * whose columns represent values of variables to be correlated. 064 * 065 * @param data rectangular array with columns representing variables 066 * @throws IllegalArgumentException if the input data array is not 067 * rectangular with at least two rows and two columns. 068 */ 069 public PearsonsCorrelation(double[][] data) { 070 this(new BlockRealMatrix(data)); 071 } 072 073 /** 074 * Create a PearsonsCorrelation from a RealMatrix whose columns 075 * represent variables to be correlated. 076 * 077 * @param matrix matrix with columns representing variables to correlate 078 */ 079 public PearsonsCorrelation(RealMatrix matrix) { 080 checkSufficientData(matrix); 081 nObs = matrix.getRowDimension(); 082 correlationMatrix = computeCorrelationMatrix(matrix); 083 } 084 085 /** 086 * Create a PearsonsCorrelation from a {@link Covariance}. The correlation 087 * matrix is computed by scaling the Covariance's covariance matrix. 088 * The Covariance instance must have been created from a data matrix with 089 * columns representing variable values. 090 * 091 * @param covariance Covariance instance 092 */ 093 public PearsonsCorrelation(Covariance covariance) { 094 RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); 095 if (covarianceMatrix == null) { 096 throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX); 097 } 098 nObs = covariance.getN(); 099 correlationMatrix = covarianceToCorrelation(covarianceMatrix); 100 } 101 102 /** 103 * Create a PearsonsCorrelation from a covariance matrix. The correlation 104 * matrix is computed by scaling the covariance matrix. 105 * 106 * @param covarianceMatrix covariance matrix 107 * @param numberOfObservations the number of observations in the dataset used to compute 108 * the covariance matrix 109 */ 110 public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { 111 nObs = numberOfObservations; 112 correlationMatrix = covarianceToCorrelation(covarianceMatrix); 113 114 } 115 116 /** 117 * Returns the correlation matrix 118 * 119 * @return correlation matrix 120 */ 121 public RealMatrix getCorrelationMatrix() { 122 return correlationMatrix; 123 } 124 125 /** 126 * Returns a matrix of standard errors associated with the estimates 127 * in the correlation matrix.<br/> 128 * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard 129 * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> 130 * <p>The formula used to compute the standard error is <br/> 131 * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> 132 * where <code>r</code> is the estimated correlation coefficient and 133 * <code>n</code> is the number of observations in the source dataset.</p> 134 * 135 * @return matrix of correlation standard errors 136 */ 137 public RealMatrix getCorrelationStandardErrors() { 138 int nVars = correlationMatrix.getColumnDimension(); 139 double[][] out = new double[nVars][nVars]; 140 for (int i = 0; i < nVars; i++) { 141 for (int j = 0; j < nVars; j++) { 142 double r = correlationMatrix.getEntry(i, j); 143 out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2)); 144 } 145 } 146 return new BlockRealMatrix(out); 147 } 148 149 /** 150 * Returns a matrix of p-values associated with the (two-sided) null 151 * hypothesis that the corresponding correlation coefficient is zero. 152 * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability 153 * that a random variable distributed as <code>t<sub>n-2</sub></code> takes 154 * a value with absolute value greater than or equal to <br> 155 * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> 156 * <p>The values in the matrix are sometimes referred to as the 157 * <i>significance</i> of the corresponding correlation coefficients.</p> 158 * 159 * @return matrix of p-values 160 * @throws org.apache.commons.math3.exception.MaxCountExceededException 161 * if an error occurs estimating probabilities 162 */ 163 public RealMatrix getCorrelationPValues() { 164 TDistribution tDistribution = new TDistribution(nObs - 2); 165 int nVars = correlationMatrix.getColumnDimension(); 166 double[][] out = new double[nVars][nVars]; 167 for (int i = 0; i < nVars; i++) { 168 for (int j = 0; j < nVars; j++) { 169 if (i == j) { 170 out[i][j] = 0d; 171 } else { 172 double r = correlationMatrix.getEntry(i, j); 173 double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r))); 174 out[i][j] = 2 * tDistribution.cumulativeProbability(-t); 175 } 176 } 177 } 178 return new BlockRealMatrix(out); 179 } 180 181 182 /** 183 * Computes the correlation matrix for the columns of the 184 * input matrix. 185 * 186 * @param matrix matrix with columns representing variables to correlate 187 * @return correlation matrix 188 */ 189 public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { 190 int nVars = matrix.getColumnDimension(); 191 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); 192 for (int i = 0; i < nVars; i++) { 193 for (int j = 0; j < i; j++) { 194 double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); 195 outMatrix.setEntry(i, j, corr); 196 outMatrix.setEntry(j, i, corr); 197 } 198 outMatrix.setEntry(i, i, 1d); 199 } 200 return outMatrix; 201 } 202 203 /** 204 * Computes the correlation matrix for the columns of the 205 * input rectangular array. The colums of the array represent values 206 * of variables to be correlated. 207 * 208 * @param data matrix with columns representing variables to correlate 209 * @return correlation matrix 210 */ 211 public RealMatrix computeCorrelationMatrix(double[][] data) { 212 return computeCorrelationMatrix(new BlockRealMatrix(data)); 213 } 214 215 /** 216 * Computes the Pearson's product-moment correlation coefficient between the two arrays. 217 * 218 * </p>Throws IllegalArgumentException if the arrays do not have the same length 219 * or their common length is less than 2</p> 220 * 221 * @param xArray first data array 222 * @param yArray second data array 223 * @return Returns Pearson's correlation coefficient for the two arrays 224 * @throws DimensionMismatchException if the arrays lengths do not match 225 * @throws MathIllegalArgumentException if there is insufficient data 226 */ 227 public double correlation(final double[] xArray, final double[] yArray) { 228 SimpleRegression regression = new SimpleRegression(); 229 if (xArray.length != yArray.length) { 230 throw new DimensionMismatchException(xArray.length, yArray.length); 231 } else if (xArray.length < 2) { 232 throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION, 233 xArray.length, 2); 234 } else { 235 for(int i=0; i<xArray.length; i++) { 236 regression.addData(xArray[i], yArray[i]); 237 } 238 return regression.getR(); 239 } 240 } 241 242 /** 243 * Derives a correlation matrix from a covariance matrix. 244 * 245 * <p>Uses the formula <br/> 246 * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where 247 * <code>r(·,·)</code> is the correlation coefficient and 248 * <code>s(·)</code> means standard deviation.</p> 249 * 250 * @param covarianceMatrix the covariance matrix 251 * @return correlation matrix 252 */ 253 public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { 254 int nVars = covarianceMatrix.getColumnDimension(); 255 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); 256 for (int i = 0; i < nVars; i++) { 257 double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i)); 258 outMatrix.setEntry(i, i, 1d); 259 for (int j = 0; j < i; j++) { 260 double entry = covarianceMatrix.getEntry(i, j) / 261 (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j))); 262 outMatrix.setEntry(i, j, entry); 263 outMatrix.setEntry(j, i, entry); 264 } 265 } 266 return outMatrix; 267 } 268 269 /** 270 * Throws IllegalArgumentException of the matrix does not have at least 271 * two columns and two rows 272 * 273 * @param matrix matrix to check for sufficiency 274 * @throws MathIllegalArgumentException if there is insufficient data 275 */ 276 private void checkSufficientData(final RealMatrix matrix) { 277 int nRows = matrix.getRowDimension(); 278 int nCols = matrix.getColumnDimension(); 279 if (nRows < 2 || nCols < 2) { 280 throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, 281 nRows, nCols); 282 } 283 } 284 }