001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math3.stat.correlation;
018    
019    import org.apache.commons.math3.distribution.TDistribution;
020    import org.apache.commons.math3.exception.util.LocalizedFormats;
021    import org.apache.commons.math3.exception.MathIllegalArgumentException;
022    import org.apache.commons.math3.exception.NullArgumentException;
023    import org.apache.commons.math3.exception.DimensionMismatchException;
024    import org.apache.commons.math3.linear.RealMatrix;
025    import org.apache.commons.math3.linear.BlockRealMatrix;
026    import org.apache.commons.math3.stat.regression.SimpleRegression;
027    import org.apache.commons.math3.util.FastMath;
028    
029    /**
030     * Computes Pearson's product-moment correlation coefficients for pairs of arrays
031     * or columns of a matrix.
032     *
033     * <p>The constructors that take <code>RealMatrix</code> or
034     * <code>double[][]</code> arguments generate correlation matrices.  The
035     * columns of the input matrices are assumed to represent variable values.
036     * Correlations are given by the formula</p>
037     * <code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
038     * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
039     * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
040     *
041     * @version $Id: PearsonsCorrelation.java 1416643 2012-12-03 19:37:14Z tn $
042     * @since 2.0
043     */
044    public class PearsonsCorrelation {
045    
046        /** correlation matrix */
047        private final RealMatrix correlationMatrix;
048    
049        /** number of observations */
050        private final int nObs;
051    
052        /**
053         * Create a PearsonsCorrelation instance without data
054         */
055        public PearsonsCorrelation() {
056            super();
057            correlationMatrix = null;
058            nObs = 0;
059        }
060    
061        /**
062         * Create a PearsonsCorrelation from a rectangular array
063         * whose columns represent values of variables to be correlated.
064         *
065         * @param data rectangular array with columns representing variables
066         * @throws IllegalArgumentException if the input data array is not
067         * rectangular with at least two rows and two columns.
068         */
069        public PearsonsCorrelation(double[][] data) {
070            this(new BlockRealMatrix(data));
071        }
072    
073        /**
074         * Create a PearsonsCorrelation from a RealMatrix whose columns
075         * represent variables to be correlated.
076         *
077         * @param matrix matrix with columns representing variables to correlate
078         */
079        public PearsonsCorrelation(RealMatrix matrix) {
080            checkSufficientData(matrix);
081            nObs = matrix.getRowDimension();
082            correlationMatrix = computeCorrelationMatrix(matrix);
083        }
084    
085        /**
086         * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
087         * matrix is computed by scaling the Covariance's covariance matrix.
088         * The Covariance instance must have been created from a data matrix with
089         * columns representing variable values.
090         *
091         * @param covariance Covariance instance
092         */
093        public PearsonsCorrelation(Covariance covariance) {
094            RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
095            if (covarianceMatrix == null) {
096                throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
097            }
098            nObs = covariance.getN();
099            correlationMatrix = covarianceToCorrelation(covarianceMatrix);
100        }
101    
102        /**
103         * Create a PearsonsCorrelation from a covariance matrix.  The correlation
104         * matrix is computed by scaling the covariance matrix.
105         *
106         * @param covarianceMatrix covariance matrix
107         * @param numberOfObservations the number of observations in the dataset used to compute
108         * the covariance matrix
109         */
110        public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
111            nObs = numberOfObservations;
112            correlationMatrix = covarianceToCorrelation(covarianceMatrix);
113    
114        }
115    
116        /**
117         * Returns the correlation matrix
118         *
119         * @return correlation matrix
120         */
121        public RealMatrix getCorrelationMatrix() {
122            return correlationMatrix;
123        }
124    
125        /**
126         * Returns a matrix of standard errors associated with the estimates
127         * in the correlation matrix.<br/>
128         * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
129         * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
130         * <p>The formula used to compute the standard error is <br/>
131         * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
132         * where <code>r</code> is the estimated correlation coefficient and
133         * <code>n</code> is the number of observations in the source dataset.</p>
134         *
135         * @return matrix of correlation standard errors
136         */
137        public RealMatrix getCorrelationStandardErrors() {
138            int nVars = correlationMatrix.getColumnDimension();
139            double[][] out = new double[nVars][nVars];
140            for (int i = 0; i < nVars; i++) {
141                for (int j = 0; j < nVars; j++) {
142                    double r = correlationMatrix.getEntry(i, j);
143                    out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
144                }
145            }
146            return new BlockRealMatrix(out);
147        }
148    
149        /**
150         * Returns a matrix of p-values associated with the (two-sided) null
151         * hypothesis that the corresponding correlation coefficient is zero.
152         * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
153         * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
154         * a value with absolute value greater than or equal to <br>
155         * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
156         * <p>The values in the matrix are sometimes referred to as the
157         * <i>significance</i> of the corresponding correlation coefficients.</p>
158         *
159         * @return matrix of p-values
160         * @throws org.apache.commons.math3.exception.MaxCountExceededException
161         * if an error occurs estimating probabilities
162         */
163        public RealMatrix getCorrelationPValues() {
164            TDistribution tDistribution = new TDistribution(nObs - 2);
165            int nVars = correlationMatrix.getColumnDimension();
166            double[][] out = new double[nVars][nVars];
167            for (int i = 0; i < nVars; i++) {
168                for (int j = 0; j < nVars; j++) {
169                    if (i == j) {
170                        out[i][j] = 0d;
171                    } else {
172                        double r = correlationMatrix.getEntry(i, j);
173                        double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
174                        out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
175                    }
176                }
177            }
178            return new BlockRealMatrix(out);
179        }
180    
181    
182        /**
183         * Computes the correlation matrix for the columns of the
184         * input matrix.
185         *
186         * @param matrix matrix with columns representing variables to correlate
187         * @return correlation matrix
188         */
189        public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
190            int nVars = matrix.getColumnDimension();
191            RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
192            for (int i = 0; i < nVars; i++) {
193                for (int j = 0; j < i; j++) {
194                  double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
195                  outMatrix.setEntry(i, j, corr);
196                  outMatrix.setEntry(j, i, corr);
197                }
198                outMatrix.setEntry(i, i, 1d);
199            }
200            return outMatrix;
201        }
202    
203        /**
204         * Computes the correlation matrix for the columns of the
205         * input rectangular array.  The colums of the array represent values
206         * of variables to be correlated.
207         *
208         * @param data matrix with columns representing variables to correlate
209         * @return correlation matrix
210         */
211        public RealMatrix computeCorrelationMatrix(double[][] data) {
212           return computeCorrelationMatrix(new BlockRealMatrix(data));
213        }
214    
215        /**
216         * Computes the Pearson's product-moment correlation coefficient between the two arrays.
217         *
218         * </p>Throws IllegalArgumentException if the arrays do not have the same length
219         * or their common length is less than 2</p>
220         *
221         * @param xArray first data array
222         * @param yArray second data array
223         * @return Returns Pearson's correlation coefficient for the two arrays
224         * @throws DimensionMismatchException if the arrays lengths do not match
225         * @throws MathIllegalArgumentException if there is insufficient data
226         */
227        public double correlation(final double[] xArray, final double[] yArray) {
228            SimpleRegression regression = new SimpleRegression();
229            if (xArray.length != yArray.length) {
230                throw new DimensionMismatchException(xArray.length, yArray.length);
231            } else if (xArray.length < 2) {
232                throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION,
233                                                       xArray.length, 2);
234            } else {
235                for(int i=0; i<xArray.length; i++) {
236                    regression.addData(xArray[i], yArray[i]);
237                }
238                return regression.getR();
239            }
240        }
241    
242        /**
243         * Derives a correlation matrix from a covariance matrix.
244         *
245         * <p>Uses the formula <br/>
246         * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
247         * <code>r(&middot,&middot;)</code> is the correlation coefficient and
248         * <code>s(&middot;)</code> means standard deviation.</p>
249         *
250         * @param covarianceMatrix the covariance matrix
251         * @return correlation matrix
252         */
253        public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
254            int nVars = covarianceMatrix.getColumnDimension();
255            RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
256            for (int i = 0; i < nVars; i++) {
257                double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
258                outMatrix.setEntry(i, i, 1d);
259                for (int j = 0; j < i; j++) {
260                    double entry = covarianceMatrix.getEntry(i, j) /
261                           (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
262                    outMatrix.setEntry(i, j, entry);
263                    outMatrix.setEntry(j, i, entry);
264                }
265            }
266            return outMatrix;
267        }
268    
269        /**
270         * Throws IllegalArgumentException of the matrix does not have at least
271         * two columns and two rows
272         *
273         * @param matrix matrix to check for sufficiency
274         * @throws MathIllegalArgumentException if there is insufficient data
275         */
276        private void checkSufficientData(final RealMatrix matrix) {
277            int nRows = matrix.getRowDimension();
278            int nCols = matrix.getColumnDimension();
279            if (nRows < 2 || nCols < 2) {
280                throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
281                                                       nRows, nCols);
282            }
283        }
284    }