001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math3.stat.correlation;
018
019import org.apache.commons.math3.distribution.TDistribution;
020import org.apache.commons.math3.exception.util.LocalizedFormats;
021import org.apache.commons.math3.exception.MathIllegalArgumentException;
022import org.apache.commons.math3.exception.NullArgumentException;
023import org.apache.commons.math3.exception.DimensionMismatchException;
024import org.apache.commons.math3.linear.RealMatrix;
025import org.apache.commons.math3.linear.BlockRealMatrix;
026import org.apache.commons.math3.stat.regression.SimpleRegression;
027import org.apache.commons.math3.util.FastMath;
028
029/**
030 * Computes Pearson's product-moment correlation coefficients for pairs of arrays
031 * or columns of a matrix.
032 *
033 * <p>The constructors that take <code>RealMatrix</code> or
034 * <code>double[][]</code> arguments generate correlation matrices.  The
035 * columns of the input matrices are assumed to represent variable values.
036 * Correlations are given by the formula</p>
037 * <code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
038 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
039 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
040 *
041 * @version $Id: PearsonsCorrelation.java 1416643 2012-12-03 19:37:14Z tn $
042 * @since 2.0
043 */
044public class PearsonsCorrelation {
045
046    /** correlation matrix */
047    private final RealMatrix correlationMatrix;
048
049    /** number of observations */
050    private final int nObs;
051
052    /**
053     * Create a PearsonsCorrelation instance without data
054     */
055    public PearsonsCorrelation() {
056        super();
057        correlationMatrix = null;
058        nObs = 0;
059    }
060
061    /**
062     * Create a PearsonsCorrelation from a rectangular array
063     * whose columns represent values of variables to be correlated.
064     *
065     * @param data rectangular array with columns representing variables
066     * @throws IllegalArgumentException if the input data array is not
067     * rectangular with at least two rows and two columns.
068     */
069    public PearsonsCorrelation(double[][] data) {
070        this(new BlockRealMatrix(data));
071    }
072
073    /**
074     * Create a PearsonsCorrelation from a RealMatrix whose columns
075     * represent variables to be correlated.
076     *
077     * @param matrix matrix with columns representing variables to correlate
078     */
079    public PearsonsCorrelation(RealMatrix matrix) {
080        checkSufficientData(matrix);
081        nObs = matrix.getRowDimension();
082        correlationMatrix = computeCorrelationMatrix(matrix);
083    }
084
085    /**
086     * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
087     * matrix is computed by scaling the Covariance's covariance matrix.
088     * The Covariance instance must have been created from a data matrix with
089     * columns representing variable values.
090     *
091     * @param covariance Covariance instance
092     */
093    public PearsonsCorrelation(Covariance covariance) {
094        RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
095        if (covarianceMatrix == null) {
096            throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
097        }
098        nObs = covariance.getN();
099        correlationMatrix = covarianceToCorrelation(covarianceMatrix);
100    }
101
102    /**
103     * Create a PearsonsCorrelation from a covariance matrix.  The correlation
104     * matrix is computed by scaling the covariance matrix.
105     *
106     * @param covarianceMatrix covariance matrix
107     * @param numberOfObservations the number of observations in the dataset used to compute
108     * the covariance matrix
109     */
110    public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
111        nObs = numberOfObservations;
112        correlationMatrix = covarianceToCorrelation(covarianceMatrix);
113
114    }
115
116    /**
117     * Returns the correlation matrix
118     *
119     * @return correlation matrix
120     */
121    public RealMatrix getCorrelationMatrix() {
122        return correlationMatrix;
123    }
124
125    /**
126     * Returns a matrix of standard errors associated with the estimates
127     * in the correlation matrix.<br/>
128     * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
129     * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
130     * <p>The formula used to compute the standard error is <br/>
131     * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
132     * where <code>r</code> is the estimated correlation coefficient and
133     * <code>n</code> is the number of observations in the source dataset.</p>
134     *
135     * @return matrix of correlation standard errors
136     */
137    public RealMatrix getCorrelationStandardErrors() {
138        int nVars = correlationMatrix.getColumnDimension();
139        double[][] out = new double[nVars][nVars];
140        for (int i = 0; i < nVars; i++) {
141            for (int j = 0; j < nVars; j++) {
142                double r = correlationMatrix.getEntry(i, j);
143                out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
144            }
145        }
146        return new BlockRealMatrix(out);
147    }
148
149    /**
150     * Returns a matrix of p-values associated with the (two-sided) null
151     * hypothesis that the corresponding correlation coefficient is zero.
152     * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
153     * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
154     * a value with absolute value greater than or equal to <br>
155     * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
156     * <p>The values in the matrix are sometimes referred to as the
157     * <i>significance</i> of the corresponding correlation coefficients.</p>
158     *
159     * @return matrix of p-values
160     * @throws org.apache.commons.math3.exception.MaxCountExceededException
161     * if an error occurs estimating probabilities
162     */
163    public RealMatrix getCorrelationPValues() {
164        TDistribution tDistribution = new TDistribution(nObs - 2);
165        int nVars = correlationMatrix.getColumnDimension();
166        double[][] out = new double[nVars][nVars];
167        for (int i = 0; i < nVars; i++) {
168            for (int j = 0; j < nVars; j++) {
169                if (i == j) {
170                    out[i][j] = 0d;
171                } else {
172                    double r = correlationMatrix.getEntry(i, j);
173                    double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
174                    out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
175                }
176            }
177        }
178        return new BlockRealMatrix(out);
179    }
180
181
182    /**
183     * Computes the correlation matrix for the columns of the
184     * input matrix.
185     *
186     * @param matrix matrix with columns representing variables to correlate
187     * @return correlation matrix
188     */
189    public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
190        int nVars = matrix.getColumnDimension();
191        RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
192        for (int i = 0; i < nVars; i++) {
193            for (int j = 0; j < i; j++) {
194              double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
195              outMatrix.setEntry(i, j, corr);
196              outMatrix.setEntry(j, i, corr);
197            }
198            outMatrix.setEntry(i, i, 1d);
199        }
200        return outMatrix;
201    }
202
203    /**
204     * Computes the correlation matrix for the columns of the
205     * input rectangular array.  The colums of the array represent values
206     * of variables to be correlated.
207     *
208     * @param data matrix with columns representing variables to correlate
209     * @return correlation matrix
210     */
211    public RealMatrix computeCorrelationMatrix(double[][] data) {
212       return computeCorrelationMatrix(new BlockRealMatrix(data));
213    }
214
215    /**
216     * Computes the Pearson's product-moment correlation coefficient between the two arrays.
217     *
218     * </p>Throws IllegalArgumentException if the arrays do not have the same length
219     * or their common length is less than 2</p>
220     *
221     * @param xArray first data array
222     * @param yArray second data array
223     * @return Returns Pearson's correlation coefficient for the two arrays
224     * @throws DimensionMismatchException if the arrays lengths do not match
225     * @throws MathIllegalArgumentException if there is insufficient data
226     */
227    public double correlation(final double[] xArray, final double[] yArray) {
228        SimpleRegression regression = new SimpleRegression();
229        if (xArray.length != yArray.length) {
230            throw new DimensionMismatchException(xArray.length, yArray.length);
231        } else if (xArray.length < 2) {
232            throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION,
233                                                   xArray.length, 2);
234        } else {
235            for(int i=0; i<xArray.length; i++) {
236                regression.addData(xArray[i], yArray[i]);
237            }
238            return regression.getR();
239        }
240    }
241
242    /**
243     * Derives a correlation matrix from a covariance matrix.
244     *
245     * <p>Uses the formula <br/>
246     * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
247     * <code>r(&middot,&middot;)</code> is the correlation coefficient and
248     * <code>s(&middot;)</code> means standard deviation.</p>
249     *
250     * @param covarianceMatrix the covariance matrix
251     * @return correlation matrix
252     */
253    public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
254        int nVars = covarianceMatrix.getColumnDimension();
255        RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
256        for (int i = 0; i < nVars; i++) {
257            double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
258            outMatrix.setEntry(i, i, 1d);
259            for (int j = 0; j < i; j++) {
260                double entry = covarianceMatrix.getEntry(i, j) /
261                       (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
262                outMatrix.setEntry(i, j, entry);
263                outMatrix.setEntry(j, i, entry);
264            }
265        }
266        return outMatrix;
267    }
268
269    /**
270     * Throws IllegalArgumentException of the matrix does not have at least
271     * two columns and two rows
272     *
273     * @param matrix matrix to check for sufficiency
274     * @throws MathIllegalArgumentException if there is insufficient data
275     */
276    private void checkSufficientData(final RealMatrix matrix) {
277        int nRows = matrix.getRowDimension();
278        int nCols = matrix.getColumnDimension();
279        if (nRows < 2 || nCols < 2) {
280            throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
281                                                   nRows, nCols);
282        }
283    }
284}