View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math4.legacy.stat.correlation;
18  
19  import org.apache.commons.statistics.distribution.TDistribution;
20  import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
21  import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
22  import org.apache.commons.math4.legacy.exception.NullArgumentException;
23  import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
24  import org.apache.commons.math4.legacy.linear.BlockRealMatrix;
25  import org.apache.commons.math4.legacy.linear.RealMatrix;
26  import org.apache.commons.math4.legacy.stat.regression.SimpleRegression;
27  import org.apache.commons.math4.core.jdkmath.JdkMath;
28  
29  /**
30   * Computes Pearson's product-moment correlation coefficients for pairs of arrays
31   * or columns of a matrix.
32   *
33   * <p>The constructors that take <code>RealMatrix</code> or
34   * <code>double[][]</code> arguments generate correlation matrices.  The
35   * columns of the input matrices are assumed to represent variable values.
36   * Correlations are given by the formula</p>
37   *
38   * <p><code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
39   * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
40   * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p>
41   *
42   * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
43   * to construct an instance with no data and then {@link #correlation(double[], double[])}.
44   * Correlation matrices can also be computed directly from an instance with no data using
45   * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()},
46   * {@link #getCorrelationPValues()},  or {@link #getCorrelationStandardErrors()}; however, one of the
47   * constructors supplying data or a covariance matrix must be used to create the instance.</p>
48   *
49   * @since 2.0
50   */
51  public class PearsonsCorrelation {
52  
53      /** correlation matrix. */
54      private final RealMatrix correlationMatrix;
55  
56      /** number of observations. */
57      private final int nObs;
58  
59      /**
60       * Create a PearsonsCorrelation instance without data.
61       */
62      public PearsonsCorrelation() {
63          super();
64          correlationMatrix = null;
65          nObs = 0;
66      }
67  
68      /**
69       * Create a PearsonsCorrelation from a rectangular array
70       * whose columns represent values of variables to be correlated.
71       *
72       * Throws MathIllegalArgumentException if the input array does not have at least
73       * two columns and two rows.  Pairwise correlations are set to NaN if one
74       * of the correlates has zero variance.
75       *
76       * @param data rectangular array with columns representing variables
77       * @throws MathIllegalArgumentException if the input data array is not
78       * rectangular with at least two rows and two columns.
79       * @see #correlation(double[], double[])
80       */
81      public PearsonsCorrelation(double[][] data) {
82          this(new BlockRealMatrix(data));
83      }
84  
85      /**
86       * Create a PearsonsCorrelation from a RealMatrix whose columns
87       * represent variables to be correlated.
88       *
89       * Throws MathIllegalArgumentException if the matrix does not have at least
90       * two columns and two rows.  Pairwise correlations are set to NaN if one
91       * of the correlates has zero variance.
92       *
93       * @param matrix matrix with columns representing variables to correlate
94       * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
95       * @see #correlation(double[], double[])
96       */
97      public PearsonsCorrelation(RealMatrix matrix) {
98          nObs = matrix.getRowDimension();
99          correlationMatrix = computeCorrelationMatrix(matrix);
100     }
101 
102     /**
103      * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
104      * matrix is computed by scaling the Covariance's covariance matrix.
105      * The Covariance instance must have been created from a data matrix with
106      * columns representing variable values.
107      *
108      * @param covariance Covariance instance
109      */
110     public PearsonsCorrelation(Covariance covariance) {
111         RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
112         if (covarianceMatrix == null) {
113             throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
114         }
115         nObs = covariance.getN();
116         correlationMatrix = covarianceToCorrelation(covarianceMatrix);
117     }
118 
119     /**
120      * Create a PearsonsCorrelation from a covariance matrix. The correlation
121      * matrix is computed by scaling the covariance matrix.
122      *
123      * @param covarianceMatrix covariance matrix
124      * @param numberOfObservations the number of observations in the dataset used to compute
125      * the covariance matrix
126      */
127     public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
128         nObs = numberOfObservations;
129         correlationMatrix = covarianceToCorrelation(covarianceMatrix);
130     }
131 
132     /**
133      * Returns the correlation matrix.
134      *
135      * <p>This method will return null if the non-argument constructor was used
136      * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
137      * has been called before it is activated.</p>
138      *
139      * @return correlation matrix
140      */
141     public RealMatrix getCorrelationMatrix() {
142         return correlationMatrix;
143     }
144 
145     /**
146      * Returns a matrix of standard errors associated with the estimates
147      * in the correlation matrix.<br>
148      * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
149      * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
150      *
151      * <p>The formula used to compute the standard error is <br>
152      * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
153      * where <code>r</code> is the estimated correlation coefficient and
154      * <code>n</code> is the number of observations in the source dataset.</p>
155      *
156      * <p>To use this method, one of the constructors that supply an input
157      * matrix must have been used to create this instance.</p>
158      *
159      * @return matrix of correlation standard errors
160      * @throws NullPointerException if this instance was created with no data.
161      */
162     public RealMatrix getCorrelationStandardErrors() {
163         int nVars = correlationMatrix.getColumnDimension();
164         double[][] out = new double[nVars][nVars];
165         for (int i = 0; i < nVars; i++) {
166             for (int j = 0; j < nVars; j++) {
167                 double r = correlationMatrix.getEntry(i, j);
168                 out[i][j] = JdkMath.sqrt((1 - r * r) /(nObs - 2));
169             }
170         }
171         return new BlockRealMatrix(out);
172     }
173 
174     /**
175      * Returns a matrix of p-values associated with the (two-sided) null
176      * hypothesis that the corresponding correlation coefficient is zero.
177      *
178      * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
179      * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
180      * a value with absolute value greater than or equal to <br>
181      * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
182      *
183      * <p>The values in the matrix are sometimes referred to as the
184      * <i>significance</i> of the corresponding correlation coefficients.</p>
185      *
186      * <p>To use this method, one of the constructors that supply an input
187      * matrix must have been used to create this instance.</p>
188      *
189      * @return matrix of p-values
190      * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
191      * if an error occurs estimating probabilities
192      * @throws NullPointerException if this instance was created with no data.
193      */
194     public RealMatrix getCorrelationPValues() {
195         TDistribution tDistribution = TDistribution.of(nObs - 2);
196         int nVars = correlationMatrix.getColumnDimension();
197         double[][] out = new double[nVars][nVars];
198         for (int i = 0; i < nVars; i++) {
199             for (int j = 0; j < nVars; j++) {
200                 if (i == j) {
201                     out[i][j] = 0d;
202                 } else {
203                     double r = correlationMatrix.getEntry(i, j);
204                     double t = JdkMath.abs(r * JdkMath.sqrt((nObs - 2)/(1 - r * r)));
205                     out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
206                 }
207             }
208         }
209         return new BlockRealMatrix(out);
210     }
211 
212 
213     /**
214      * Computes the correlation matrix for the columns of the
215      * input matrix, using {@link #correlation(double[], double[])}.
216      *
217      * Throws MathIllegalArgumentException if the matrix does not have at least
218      * two columns and two rows.  Pairwise correlations are set to NaN if one
219      * of the correlates has zero variance.
220      *
221      * @param matrix matrix with columns representing variables to correlate
222      * @return correlation matrix
223      * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
224      * @see #correlation(double[], double[])
225      */
226     public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
227         checkSufficientData(matrix);
228         int nVars = matrix.getColumnDimension();
229         RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
230         for (int i = 0; i < nVars; i++) {
231             for (int j = 0; j < i; j++) {
232               double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
233               outMatrix.setEntry(i, j, corr);
234               outMatrix.setEntry(j, i, corr);
235             }
236             outMatrix.setEntry(i, i, 1d);
237         }
238         return outMatrix;
239     }
240 
241     /**
242      * Computes the correlation matrix for the columns of the
243      * input rectangular array.  The columns of the array represent values
244      * of variables to be correlated.
245      *
246      * Throws MathIllegalArgumentException if the matrix does not have at least
247      * two columns and two rows or if the array is not rectangular. Pairwise
248      * correlations are set to NaN if one of the correlates has zero variance.
249      *
250      * @param data matrix with columns representing variables to correlate
251      * @return correlation matrix
252      * @throws MathIllegalArgumentException if the array does not contain sufficient data
253      * @see #correlation(double[], double[])
254      */
255     public RealMatrix computeCorrelationMatrix(double[][] data) {
256        return computeCorrelationMatrix(new BlockRealMatrix(data));
257     }
258 
259     /**
260      * Computes the Pearson's product-moment correlation coefficient between two arrays.
261      *
262      * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
263      * or their common length is less than 2.  Returns {@code NaN} if either of the arrays
264      * has zero variance (i.e., if one of the arrays does not contain at least two distinct
265      * values).</p>
266      *
267      * @param xArray first data array
268      * @param yArray second data array
269      * @return Returns Pearson's correlation coefficient for the two arrays
270      * @throws DimensionMismatchException if the arrays lengths do not match
271      * @throws MathIllegalArgumentException if there is insufficient data
272      */
273     public double correlation(final double[] xArray, final double[] yArray) {
274         SimpleRegression regression = new SimpleRegression();
275         if (xArray.length != yArray.length) {
276             throw new DimensionMismatchException(xArray.length, yArray.length);
277         } else if (xArray.length < 2) {
278             throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION,
279                                                    xArray.length, 2);
280         } else {
281             for(int i=0; i<xArray.length; i++) {
282                 regression.addData(xArray[i], yArray[i]);
283             }
284             return regression.getR();
285         }
286     }
287 
288     /**
289      * Derives a correlation matrix from a covariance matrix.
290      *
291      * <p>Uses the formula <br>
292      * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
293      * <code>r(&middot;,&middot;)</code> is the correlation coefficient and
294      * <code>s(&middot;)</code> means standard deviation.</p>
295      *
296      * @param covarianceMatrix the covariance matrix
297      * @return correlation matrix
298      */
299     public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
300         int nVars = covarianceMatrix.getColumnDimension();
301         RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
302         for (int i = 0; i < nVars; i++) {
303             double sigma = JdkMath.sqrt(covarianceMatrix.getEntry(i, i));
304             outMatrix.setEntry(i, i, 1d);
305             for (int j = 0; j < i; j++) {
306                 double entry = covarianceMatrix.getEntry(i, j) /
307                        (sigma * JdkMath.sqrt(covarianceMatrix.getEntry(j, j)));
308                 outMatrix.setEntry(i, j, entry);
309                 outMatrix.setEntry(j, i, entry);
310             }
311         }
312         return outMatrix;
313     }
314 
315     /**
316      * Throws MathIllegalArgumentException if the matrix does not have at least
317      * two columns and two rows.
318      *
319      * @param matrix matrix to check for sufficiency
320      * @throws MathIllegalArgumentException if there is insufficient data
321      */
322     private void checkSufficientData(final RealMatrix matrix) {
323         int nRows = matrix.getRowDimension();
324         int nCols = matrix.getColumnDimension();
325         if (nRows < 2 || nCols < 2) {
326             throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
327                                                    nRows, nCols);
328         }
329     }
330 }