001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math3.stat.correlation;
018
019import org.apache.commons.math3.distribution.TDistribution;
020import org.apache.commons.math3.exception.util.LocalizedFormats;
021import org.apache.commons.math3.exception.MathIllegalArgumentException;
022import org.apache.commons.math3.exception.NullArgumentException;
023import org.apache.commons.math3.exception.DimensionMismatchException;
024import org.apache.commons.math3.linear.RealMatrix;
025import org.apache.commons.math3.linear.BlockRealMatrix;
026import org.apache.commons.math3.stat.regression.SimpleRegression;
027import org.apache.commons.math3.util.FastMath;
028
029/**
030 * Computes Pearson's product-moment correlation coefficients for pairs of arrays
031 * or columns of a matrix.
032 *
033 * <p>The constructors that take <code>RealMatrix</code> or
034 * <code>double[][]</code> arguments generate correlation matrices.  The
035 * columns of the input matrices are assumed to represent variable values.
036 * Correlations are given by the formula</p>
037 *
038 * <p><code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
039 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
040 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p>
041 *
042 * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
043 * to construct an instance with no data and then {@link #correlation(double[], double[])}.
044 * Correlation matrices can also be computed directly from an instance with no data using
045 * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()},
046 * {@link #getCorrelationPValues()},  or {@link #getCorrelationStandardErrors()}; however, one of the
047 * constructors supplying data or a covariance matrix must be used to create the instance.</p>
048 *
049 * @version $Id: PearsonsCorrelation.java 1540395 2013-11-09 21:32:06Z psteitz $
050 * @since 2.0
051 */
052public class PearsonsCorrelation {
053
054    /** correlation matrix */
055    private final RealMatrix correlationMatrix;
056
057    /** number of observations */
058    private final int nObs;
059
060    /**
061     * Create a PearsonsCorrelation instance without data.
062     */
063    public PearsonsCorrelation() {
064        super();
065        correlationMatrix = null;
066        nObs = 0;
067    }
068
069    /**
070     * Create a PearsonsCorrelation from a rectangular array
071     * whose columns represent values of variables to be correlated.
072     *
073     * Throws MathIllegalArgumentException if the input array does not have at least
074     * two columns and two rows.  Pairwise correlations are set to NaN if one
075     * of the correlates has zero variance.
076     *
077     * @param data rectangular array with columns representing variables
078     * @throws MathIllegalArgumentException if the input data array is not
079     * rectangular with at least two rows and two columns.
080     * @see #correlation(double[], double[])
081     */
082    public PearsonsCorrelation(double[][] data) {
083        this(new BlockRealMatrix(data));
084    }
085
086    /**
087     * Create a PearsonsCorrelation from a RealMatrix whose columns
088     * represent variables to be correlated.
089     *
090     * Throws MathIllegalArgumentException if the matrix does not have at least
091     * two columns and two rows.  Pairwise correlations are set to NaN if one
092     * of the correlates has zero variance.
093     *
094     * @param matrix matrix with columns representing variables to correlate
095     * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
096     * @see #correlation(double[], double[])
097     */
098    public PearsonsCorrelation(RealMatrix matrix) {
099        nObs = matrix.getRowDimension();
100        correlationMatrix = computeCorrelationMatrix(matrix);
101    }
102
103    /**
104     * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
105     * matrix is computed by scaling the Covariance's covariance matrix.
106     * The Covariance instance must have been created from a data matrix with
107     * columns representing variable values.
108     *
109     * @param covariance Covariance instance
110     */
111    public PearsonsCorrelation(Covariance covariance) {
112        RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
113        if (covarianceMatrix == null) {
114            throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
115        }
116        nObs = covariance.getN();
117        correlationMatrix = covarianceToCorrelation(covarianceMatrix);
118    }
119
120    /**
121     * Create a PearsonsCorrelation from a covariance matrix. The correlation
122     * matrix is computed by scaling the covariance matrix.
123     *
124     * @param covarianceMatrix covariance matrix
125     * @param numberOfObservations the number of observations in the dataset used to compute
126     * the covariance matrix
127     */
128    public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
129        nObs = numberOfObservations;
130        correlationMatrix = covarianceToCorrelation(covarianceMatrix);
131    }
132
133    /**
134     * Returns the correlation matrix.
135     *
136     * <p>This method will return null if the argumentless constructor was used
137     * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
138     * has been called before it is activated.</p>
139     *
140     * @return correlation matrix
141     */
142    public RealMatrix getCorrelationMatrix() {
143        return correlationMatrix;
144    }
145
146    /**
147     * Returns a matrix of standard errors associated with the estimates
148     * in the correlation matrix.<br/>
149     * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
150     * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
151     *
152     * <p>The formula used to compute the standard error is <br/>
153     * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
154     * where <code>r</code> is the estimated correlation coefficient and
155     * <code>n</code> is the number of observations in the source dataset.</p>
156     *
157     * <p>To use this method, one of the constructors that supply an input
158     * matrix must have been used to create this instance.</p>
159     *
160     * @return matrix of correlation standard errors
161     * @throws NullPointerException if this instance was created with no data
162     */
163    public RealMatrix getCorrelationStandardErrors() {
164        int nVars = correlationMatrix.getColumnDimension();
165        double[][] out = new double[nVars][nVars];
166        for (int i = 0; i < nVars; i++) {
167            for (int j = 0; j < nVars; j++) {
168                double r = correlationMatrix.getEntry(i, j);
169                out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
170            }
171        }
172        return new BlockRealMatrix(out);
173    }
174
175    /**
176     * Returns a matrix of p-values associated with the (two-sided) null
177     * hypothesis that the corresponding correlation coefficient is zero.
178     *
179     * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
180     * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
181     * a value with absolute value greater than or equal to <br>
182     * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
183     *
184     * <p>The values in the matrix are sometimes referred to as the
185     * <i>significance</i> of the corresponding correlation coefficients.</p>
186     *
187     * <p>To use this method, one of the constructors that supply an input
188     * matrix must have been used to create this instance.</p>
189     *
190     * @return matrix of p-values
191     * @throws org.apache.commons.math3.exception.MaxCountExceededException
192     * if an error occurs estimating probabilities
193     * @throws NullPointerException if this instance was created with no data
194     */
195    public RealMatrix getCorrelationPValues() {
196        TDistribution tDistribution = new TDistribution(nObs - 2);
197        int nVars = correlationMatrix.getColumnDimension();
198        double[][] out = new double[nVars][nVars];
199        for (int i = 0; i < nVars; i++) {
200            for (int j = 0; j < nVars; j++) {
201                if (i == j) {
202                    out[i][j] = 0d;
203                } else {
204                    double r = correlationMatrix.getEntry(i, j);
205                    double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
206                    out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
207                }
208            }
209        }
210        return new BlockRealMatrix(out);
211    }
212
213
214    /**
215     * Computes the correlation matrix for the columns of the
216     * input matrix, using {@link #correlation(double[], double[])}.
217     *
218     * Throws MathIllegalArgumentException if the matrix does not have at least
219     * two columns and two rows.  Pairwise correlations are set to NaN if one
220     * of the correlates has zero variance.
221     *
222     * @param matrix matrix with columns representing variables to correlate
223     * @return correlation matrix
224     * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
225     * @see #correlation(double[], double[])
226     */
227    public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
228        checkSufficientData(matrix);
229        int nVars = matrix.getColumnDimension();
230        RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
231        for (int i = 0; i < nVars; i++) {
232            for (int j = 0; j < i; j++) {
233              double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
234              outMatrix.setEntry(i, j, corr);
235              outMatrix.setEntry(j, i, corr);
236            }
237            outMatrix.setEntry(i, i, 1d);
238        }
239        return outMatrix;
240    }
241
242    /**
243     * Computes the correlation matrix for the columns of the
244     * input rectangular array.  The columns of the array represent values
245     * of variables to be correlated.
246     *
247     * Throws MathIllegalArgumentException if the matrix does not have at least
248     * two columns and two rows or if the array is not rectangular. Pairwise
249     * correlations are set to NaN if one of the correlates has zero variance.
250     *
251     * @param data matrix with columns representing variables to correlate
252     * @return correlation matrix
253     * @throws MathIllegalArgumentException if the array does not contain sufficient data
254     * @see #correlation(double[], double[])
255     */
256    public RealMatrix computeCorrelationMatrix(double[][] data) {
257       return computeCorrelationMatrix(new BlockRealMatrix(data));
258    }
259
260    /**
261     * Computes the Pearson's product-moment correlation coefficient between two arrays.
262     *
263     * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
264     * or their common length is less than 2.  Returns {@code NaN} if either of the arrays
265     * has zero variance (i.e., if one of the arrays does not contain at least two distinct
266     * values).</p>
267     *
268     * @param xArray first data array
269     * @param yArray second data array
270     * @return Returns Pearson's correlation coefficient for the two arrays
271     * @throws DimensionMismatchException if the arrays lengths do not match
272     * @throws MathIllegalArgumentException if there is insufficient data
273     */
274    public double correlation(final double[] xArray, final double[] yArray) {
275        SimpleRegression regression = new SimpleRegression();
276        if (xArray.length != yArray.length) {
277            throw new DimensionMismatchException(xArray.length, yArray.length);
278        } else if (xArray.length < 2) {
279            throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION,
280                                                   xArray.length, 2);
281        } else {
282            for(int i=0; i<xArray.length; i++) {
283                regression.addData(xArray[i], yArray[i]);
284            }
285            return regression.getR();
286        }
287    }
288
289    /**
290     * Derives a correlation matrix from a covariance matrix.
291     *
292     * <p>Uses the formula <br/>
293     * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
294     * <code>r(&middot,&middot;)</code> is the correlation coefficient and
295     * <code>s(&middot;)</code> means standard deviation.</p>
296     *
297     * @param covarianceMatrix the covariance matrix
298     * @return correlation matrix
299     */
300    public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
301        int nVars = covarianceMatrix.getColumnDimension();
302        RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
303        for (int i = 0; i < nVars; i++) {
304            double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
305            outMatrix.setEntry(i, i, 1d);
306            for (int j = 0; j < i; j++) {
307                double entry = covarianceMatrix.getEntry(i, j) /
308                       (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
309                outMatrix.setEntry(i, j, entry);
310                outMatrix.setEntry(j, i, entry);
311            }
312        }
313        return outMatrix;
314    }
315
316    /**
317     * Throws MathIllegalArgumentException if the matrix does not have at least
318     * two columns and two rows.
319     *
320     * @param matrix matrix to check for sufficiency
321     * @throws MathIllegalArgumentException if there is insufficient data
322     */
323    private void checkSufficientData(final RealMatrix matrix) {
324        int nRows = matrix.getRowDimension();
325        int nCols = matrix.getColumnDimension();
326        if (nRows < 2 || nCols < 2) {
327            throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
328                                                   nRows, nCols);
329        }
330    }
331}