001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive;
018    
019    import java.io.Serializable;
020    import java.util.Arrays;
021    
022    import org.apache.commons.math.MathRuntimeException;
023    import org.apache.commons.math.exception.util.LocalizedFormats;
024    import org.apache.commons.math.exception.DimensionMismatchException;
025    import org.apache.commons.math.linear.RealMatrix;
026    import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
027    import org.apache.commons.math.stat.descriptive.moment.Mean;
028    import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
029    import org.apache.commons.math.stat.descriptive.rank.Max;
030    import org.apache.commons.math.stat.descriptive.rank.Min;
031    import org.apache.commons.math.stat.descriptive.summary.Sum;
032    import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
033    import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
034    import org.apache.commons.math.util.MathUtils;
035    import org.apache.commons.math.util.MathArrays;
036    import org.apache.commons.math.util.Precision;
037    import org.apache.commons.math.util.FastMath;
038    
039    /**
040     * <p>Computes summary statistics for a stream of n-tuples added using the
041     * {@link #addValue(double[]) addValue} method. The data values are not stored
042     * in memory, so this class can be used to compute statistics for very large
043     * n-tuple streams.</p>
044     *
045     * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
046     * summary state and compute statistics are configurable via setters.
047     * For example, the default implementation for the mean can be overridden by
048     * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
049     * parameters to these methods must implement the
050     * {@link StorelessUnivariateStatistic} interface and configuration must be
051     * completed before <code>addValue</code> is called. No configuration is
052     * necessary to use the default, commons-math provided implementations.</p>
053     *
054     * <p>To compute statistics for a stream of n-tuples, construct a
055     * MultivariateStatistics instance with dimension n and then use
056     * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
057     * methods where Xxx is a statistic return an array of <code>double</code>
058     * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
059     * value of the given statistic for data range consisting of the i<sup>th</sup> element of
060     * each of the input n-tuples.  For example, if <code>addValue</code> is called
061     * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
062     * <code>getSum</code> will return a three-element array with values
063     * {0+3+6, 1+4+7, 2+5+8}</p>
064     *
065     * <p>Note: This class is not thread-safe. Use
066     * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
067     * threads is required.</p>
068     *
069     * @since 1.2
070     * @version $Id: MultivariateSummaryStatistics.java 1182137 2011-10-11 23:10:46Z erans $
071     */
072    public class MultivariateSummaryStatistics
073        implements StatisticalMultivariateSummary, Serializable {
074    
075        /** Serialization UID */
076        private static final long serialVersionUID = 2271900808994826718L;
077    
078        /** Dimension of the data. */
079        private int k;
080    
081        /** Count of values that have been added */
082        private long n = 0;
083    
084        /** Sum statistic implementation - can be reset by setter. */
085        private StorelessUnivariateStatistic[] sumImpl;
086    
087        /** Sum of squares statistic implementation - can be reset by setter. */
088        private StorelessUnivariateStatistic[] sumSqImpl;
089    
090        /** Minimum statistic implementation - can be reset by setter. */
091        private StorelessUnivariateStatistic[] minImpl;
092    
093        /** Maximum statistic implementation - can be reset by setter. */
094        private StorelessUnivariateStatistic[] maxImpl;
095    
096        /** Sum of log statistic implementation - can be reset by setter. */
097        private StorelessUnivariateStatistic[] sumLogImpl;
098    
099        /** Geometric mean statistic implementation - can be reset by setter. */
100        private StorelessUnivariateStatistic[] geoMeanImpl;
101    
102        /** Mean statistic implementation - can be reset by setter. */
103        private StorelessUnivariateStatistic[] meanImpl;
104    
105        /** Covariance statistic implementation - cannot be reset. */
106        private VectorialCovariance covarianceImpl;
107    
108        /**
109         * Construct a MultivariateSummaryStatistics instance
110         * @param k dimension of the data
111         * @param isCovarianceBiasCorrected if true, the unbiased sample
112         * covariance is computed, otherwise the biased population covariance
113         * is computed
114         */
115        public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
116            this.k = k;
117    
118            sumImpl     = new StorelessUnivariateStatistic[k];
119            sumSqImpl   = new StorelessUnivariateStatistic[k];
120            minImpl     = new StorelessUnivariateStatistic[k];
121            maxImpl     = new StorelessUnivariateStatistic[k];
122            sumLogImpl  = new StorelessUnivariateStatistic[k];
123            geoMeanImpl = new StorelessUnivariateStatistic[k];
124            meanImpl    = new StorelessUnivariateStatistic[k];
125    
126            for (int i = 0; i < k; ++i) {
127                sumImpl[i]     = new Sum();
128                sumSqImpl[i]   = new SumOfSquares();
129                minImpl[i]     = new Min();
130                maxImpl[i]     = new Max();
131                sumLogImpl[i]  = new SumOfLogs();
132                geoMeanImpl[i] = new GeometricMean();
133                meanImpl[i]    = new Mean();
134            }
135    
136            covarianceImpl =
137                new VectorialCovariance(k, isCovarianceBiasCorrected);
138    
139        }
140    
141        /**
142         * Add an n-tuple to the data
143         *
144         * @param value  the n-tuple to add
145         * @throws DimensionMismatchException if the length of the array
146         * does not match the one used at construction
147         */
148        public void addValue(double[] value) {
149            checkDimension(value.length);
150            for (int i = 0; i < k; ++i) {
151                double v = value[i];
152                sumImpl[i].increment(v);
153                sumSqImpl[i].increment(v);
154                minImpl[i].increment(v);
155                maxImpl[i].increment(v);
156                sumLogImpl[i].increment(v);
157                geoMeanImpl[i].increment(v);
158                meanImpl[i].increment(v);
159            }
160            covarianceImpl.increment(value);
161            n++;
162        }
163    
164        /**
165         * Returns the dimension of the data
166         * @return The dimension of the data
167         */
168        public int getDimension() {
169            return k;
170        }
171    
172        /**
173         * Returns the number of available values
174         * @return The number of available values
175         */
176        public long getN() {
177            return n;
178        }
179    
180        /**
181         * Returns an array of the results of a statistic.
182         * @param stats univariate statistic array
183         * @return results array
184         */
185        private double[] getResults(StorelessUnivariateStatistic[] stats) {
186            double[] results = new double[stats.length];
187            for (int i = 0; i < results.length; ++i) {
188                results[i] = stats[i].getResult();
189            }
190            return results;
191        }
192    
193        /**
194         * Returns an array whose i<sup>th</sup> entry is the sum of the
195         * i<sup>th</sup> entries of the arrays that have been added using
196         * {@link #addValue(double[])}
197         *
198         * @return the array of component sums
199         */
200        public double[] getSum() {
201            return getResults(sumImpl);
202        }
203    
204        /**
205         * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
206         * i<sup>th</sup> entries of the arrays that have been added using
207         * {@link #addValue(double[])}
208         *
209         * @return the array of component sums of squares
210         */
211        public double[] getSumSq() {
212            return getResults(sumSqImpl);
213        }
214    
215        /**
216         * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
217         * i<sup>th</sup> entries of the arrays that have been added using
218         * {@link #addValue(double[])}
219         *
220         * @return the array of component log sums
221         */
222        public double[] getSumLog() {
223            return getResults(sumLogImpl);
224        }
225    
226        /**
227         * Returns an array whose i<sup>th</sup> entry is the mean of the
228         * i<sup>th</sup> entries of the arrays that have been added using
229         * {@link #addValue(double[])}
230         *
231         * @return the array of component means
232         */
233        public double[] getMean() {
234            return getResults(meanImpl);
235        }
236    
237        /**
238         * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
239         * i<sup>th</sup> entries of the arrays that have been added using
240         * {@link #addValue(double[])}
241         *
242         * @return the array of component standard deviations
243         */
244        public double[] getStandardDeviation() {
245            double[] stdDev = new double[k];
246            if (getN() < 1) {
247                Arrays.fill(stdDev, Double.NaN);
248            } else if (getN() < 2) {
249                Arrays.fill(stdDev, 0.0);
250            } else {
251                RealMatrix matrix = covarianceImpl.getResult();
252                for (int i = 0; i < k; ++i) {
253                    stdDev[i] = FastMath.sqrt(matrix.getEntry(i, i));
254                }
255            }
256            return stdDev;
257        }
258    
259        /**
260         * Returns the covariance matrix of the values that have been added.
261         *
262         * @return the covariance matrix
263         */
264        public RealMatrix getCovariance() {
265            return covarianceImpl.getResult();
266        }
267    
268        /**
269         * Returns an array whose i<sup>th</sup> entry is the maximum of the
270         * i<sup>th</sup> entries of the arrays that have been added using
271         * {@link #addValue(double[])}
272         *
273         * @return the array of component maxima
274         */
275        public double[] getMax() {
276            return getResults(maxImpl);
277        }
278    
279        /**
280         * Returns an array whose i<sup>th</sup> entry is the minimum of the
281         * i<sup>th</sup> entries of the arrays that have been added using
282         * {@link #addValue(double[])}
283         *
284         * @return the array of component minima
285         */
286        public double[] getMin() {
287            return getResults(minImpl);
288        }
289    
290        /**
291         * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
292         * i<sup>th</sup> entries of the arrays that have been added using
293         * {@link #addValue(double[])}
294         *
295         * @return the array of component geometric means
296         */
297        public double[] getGeometricMean() {
298            return getResults(geoMeanImpl);
299        }
300    
301        /**
302         * Generates a text report displaying
303         * summary statistics from values that
304         * have been added.
305         * @return String with line feeds displaying statistics
306         */
307        @Override
308        public String toString() {
309            final String separator = ", ";
310            final String suffix = System.getProperty("line.separator");
311            StringBuilder outBuffer = new StringBuilder();
312            outBuffer.append("MultivariateSummaryStatistics:" + suffix);
313            outBuffer.append("n: " + getN() + suffix);
314            append(outBuffer, getMin(), "min: ", separator, suffix);
315            append(outBuffer, getMax(), "max: ", separator, suffix);
316            append(outBuffer, getMean(), "mean: ", separator, suffix);
317            append(outBuffer, getGeometricMean(), "geometric mean: ", separator, suffix);
318            append(outBuffer, getSumSq(), "sum of squares: ", separator, suffix);
319            append(outBuffer, getSumLog(), "sum of logarithms: ", separator, suffix);
320            append(outBuffer, getStandardDeviation(), "standard deviation: ", separator, suffix);
321            outBuffer.append("covariance: " + getCovariance().toString() + suffix);
322            return outBuffer.toString();
323        }
324    
325        /**
326         * Append a text representation of an array to a buffer.
327         * @param buffer buffer to fill
328         * @param data data array
329         * @param prefix text prefix
330         * @param separator elements separator
331         * @param suffix text suffix
332         */
333        private void append(StringBuilder buffer, double[] data,
334                            String prefix, String separator, String suffix) {
335            buffer.append(prefix);
336            for (int i = 0; i < data.length; ++i) {
337                if (i > 0) {
338                    buffer.append(separator);
339                }
340                buffer.append(data[i]);
341            }
342            buffer.append(suffix);
343        }
344    
345        /**
346         * Resets all statistics and storage
347         */
348        public void clear() {
349            this.n = 0;
350            for (int i = 0; i < k; ++i) {
351                minImpl[i].clear();
352                maxImpl[i].clear();
353                sumImpl[i].clear();
354                sumLogImpl[i].clear();
355                sumSqImpl[i].clear();
356                geoMeanImpl[i].clear();
357                meanImpl[i].clear();
358            }
359            covarianceImpl.clear();
360        }
361    
362        /**
363         * Returns true iff <code>object</code> is a <code>MultivariateSummaryStatistics</code>
364         * instance and all statistics have the same values as this.
365         * @param object the object to test equality against.
366         * @return true if object equals this
367         */
368        @Override
369        public boolean equals(Object object) {
370            if (object == this ) {
371                return true;
372            }
373            if (object instanceof MultivariateSummaryStatistics == false) {
374                return false;
375            }
376            MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
377            return MathArrays.equalsIncludingNaN(stat.getGeometricMean(), getGeometricMean()) &&
378                   MathArrays.equalsIncludingNaN(stat.getMax(),           getMax())           &&
379                   MathArrays.equalsIncludingNaN(stat.getMean(),          getMean())          &&
380                   MathArrays.equalsIncludingNaN(stat.getMin(),           getMin())           &&
381                   Precision.equalsIncludingNaN(stat.getN(),             getN())             &&
382                   MathArrays.equalsIncludingNaN(stat.getSum(),           getSum())           &&
383                   MathArrays.equalsIncludingNaN(stat.getSumSq(),         getSumSq())         &&
384                   MathArrays.equalsIncludingNaN(stat.getSumLog(),        getSumLog())        &&
385                   stat.getCovariance().equals( getCovariance());
386        }
387    
388        /**
389         * Returns hash code based on values of statistics
390         *
391         * @return hash code
392         */
393        @Override
394        public int hashCode() {
395            int result = 31 + MathUtils.hash(getGeometricMean());
396            result = result * 31 + MathUtils.hash(getGeometricMean());
397            result = result * 31 + MathUtils.hash(getMax());
398            result = result * 31 + MathUtils.hash(getMean());
399            result = result * 31 + MathUtils.hash(getMin());
400            result = result * 31 + MathUtils.hash(getN());
401            result = result * 31 + MathUtils.hash(getSum());
402            result = result * 31 + MathUtils.hash(getSumSq());
403            result = result * 31 + MathUtils.hash(getSumLog());
404            result = result * 31 + getCovariance().hashCode();
405            return result;
406        }
407    
408        // Getters and setters for statistics implementations
409        /**
410         * Sets statistics implementations.
411         * @param newImpl new implementations for statistics
412         * @param oldImpl old implementations for statistics
413         * @throws DimensionMismatchException if the array dimension
414         * does not match the one used at construction
415         * @throws IllegalStateException if data has already been added
416         *  (i.e if n > 0)
417         */
418        private void setImpl(StorelessUnivariateStatistic[] newImpl,
419                             StorelessUnivariateStatistic[] oldImpl) {
420            checkEmpty();
421            checkDimension(newImpl.length);
422            System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
423        }
424    
425        /**
426         * Returns the currently configured Sum implementation
427         *
428         * @return the StorelessUnivariateStatistic implementing the sum
429         */
430        public StorelessUnivariateStatistic[] getSumImpl() {
431            return sumImpl.clone();
432        }
433    
434        /**
435         * <p>Sets the implementation for the Sum.</p>
436         * <p>This method must be activated before any data has been added - i.e.,
437         * before {@link #addValue(double[]) addValue} has been used to add data;
438         * otherwise an IllegalStateException will be thrown.</p>
439         *
440         * @param sumImpl the StorelessUnivariateStatistic instance to use
441         * for computing the Sum
442         * @throws DimensionMismatchException if the array dimension
443         * does not match the one used at construction
444         * @throws IllegalStateException if data has already been added
445         *  (i.e if n > 0)
446         */
447        public void setSumImpl(StorelessUnivariateStatistic[] sumImpl) {
448            setImpl(sumImpl, this.sumImpl);
449        }
450    
451        /**
452         * Returns the currently configured sum of squares implementation
453         *
454         * @return the StorelessUnivariateStatistic implementing the sum of squares
455         */
456        public StorelessUnivariateStatistic[] getSumsqImpl() {
457            return sumSqImpl.clone();
458        }
459    
460        /**
461         * <p>Sets the implementation for the sum of squares.</p>
462         * <p>This method must be activated before any data has been added - i.e.,
463         * before {@link #addValue(double[]) addValue} has been used to add data;
464         * otherwise an IllegalStateException will be thrown.</p>
465         *
466         * @param sumsqImpl the StorelessUnivariateStatistic instance to use
467         * for computing the sum of squares
468         * @throws DimensionMismatchException if the array dimension
469         * does not match the one used at construction
470         * @throws IllegalStateException if data has already been added
471         *  (i.e if n > 0)
472         */
473        public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl) {
474            setImpl(sumsqImpl, this.sumSqImpl);
475        }
476    
477        /**
478         * Returns the currently configured minimum implementation
479         *
480         * @return the StorelessUnivariateStatistic implementing the minimum
481         */
482        public StorelessUnivariateStatistic[] getMinImpl() {
483            return minImpl.clone();
484        }
485    
486        /**
487         * <p>Sets the implementation for the minimum.</p>
488         * <p>This method must be activated before any data has been added - i.e.,
489         * before {@link #addValue(double[]) addValue} has been used to add data;
490         * otherwise an IllegalStateException will be thrown.</p>
491         *
492         * @param minImpl the StorelessUnivariateStatistic instance to use
493         * for computing the minimum
494         * @throws DimensionMismatchException if the array dimension
495         * does not match the one used at construction
496         * @throws IllegalStateException if data has already been added
497         *  (i.e if n > 0)
498         */
499        public void setMinImpl(StorelessUnivariateStatistic[] minImpl) {
500            setImpl(minImpl, this.minImpl);
501        }
502    
503        /**
504         * Returns the currently configured maximum implementation
505         *
506         * @return the StorelessUnivariateStatistic implementing the maximum
507         */
508        public StorelessUnivariateStatistic[] getMaxImpl() {
509            return maxImpl.clone();
510        }
511    
512        /**
513         * <p>Sets the implementation for the maximum.</p>
514         * <p>This method must be activated before any data has been added - i.e.,
515         * before {@link #addValue(double[]) addValue} has been used to add data;
516         * otherwise an IllegalStateException will be thrown.</p>
517         *
518         * @param maxImpl the StorelessUnivariateStatistic instance to use
519         * for computing the maximum
520         * @throws DimensionMismatchException if the array dimension
521         * does not match the one used at construction
522         * @throws IllegalStateException if data has already been added
523         *  (i.e if n > 0)
524         */
525        public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl) {
526            setImpl(maxImpl, this.maxImpl);
527        }
528    
529        /**
530         * Returns the currently configured sum of logs implementation
531         *
532         * @return the StorelessUnivariateStatistic implementing the log sum
533         */
534        public StorelessUnivariateStatistic[] getSumLogImpl() {
535            return sumLogImpl.clone();
536        }
537    
538        /**
539         * <p>Sets the implementation for the sum of logs.</p>
540         * <p>This method must be activated before any data has been added - i.e.,
541         * before {@link #addValue(double[]) addValue} has been used to add data;
542         * otherwise an IllegalStateException will be thrown.</p>
543         *
544         * @param sumLogImpl the StorelessUnivariateStatistic instance to use
545         * for computing the log sum
546         * @throws DimensionMismatchException if the array dimension
547         * does not match the one used at construction
548         * @throws IllegalStateException if data has already been added
549         *  (i.e if n > 0)
550         */
551        public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl) {
552            setImpl(sumLogImpl, this.sumLogImpl);
553        }
554    
555        /**
556         * Returns the currently configured geometric mean implementation
557         *
558         * @return the StorelessUnivariateStatistic implementing the geometric mean
559         */
560        public StorelessUnivariateStatistic[] getGeoMeanImpl() {
561            return geoMeanImpl.clone();
562        }
563    
564        /**
565         * <p>Sets the implementation for the geometric mean.</p>
566         * <p>This method must be activated before any data has been added - i.e.,
567         * before {@link #addValue(double[]) addValue} has been used to add data;
568         * otherwise an IllegalStateException will be thrown.</p>
569         *
570         * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
571         * for computing the geometric mean
572         * @throws DimensionMismatchException if the array dimension
573         * does not match the one used at construction
574         * @throws IllegalStateException if data has already been added
575         *  (i.e if n > 0)
576         */
577        public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl) {
578            setImpl(geoMeanImpl, this.geoMeanImpl);
579        }
580    
581        /**
582         * Returns the currently configured mean implementation
583         *
584         * @return the StorelessUnivariateStatistic implementing the mean
585         */
586        public StorelessUnivariateStatistic[] getMeanImpl() {
587            return meanImpl.clone();
588        }
589    
590        /**
591         * <p>Sets the implementation for the mean.</p>
592         * <p>This method must be activated before any data has been added - i.e.,
593         * before {@link #addValue(double[]) addValue} has been used to add data;
594         * otherwise an IllegalStateException will be thrown.</p>
595         *
596         * @param meanImpl the StorelessUnivariateStatistic instance to use
597         * for computing the mean
598         * @throws DimensionMismatchException if the array dimension
599         * does not match the one used at construction
600         * @throws IllegalStateException if data has already been added
601         *  (i.e if n > 0)
602         */
603        public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl) {
604            setImpl(meanImpl, this.meanImpl);
605        }
606    
607        /**
608         * Throws IllegalStateException if n > 0.
609         */
610        private void checkEmpty() {
611            if (n > 0) {
612                throw MathRuntimeException.createIllegalStateException(
613                        LocalizedFormats.VALUES_ADDED_BEFORE_CONFIGURING_STATISTIC,
614                        n);
615            }
616        }
617    
618        /**
619         * Throws DimensionMismatchException if dimension != k.
620         * @param dimension dimension to check
621         * @throws DimensionMismatchException if dimension != k
622         */
623        private void checkDimension(int dimension) {
624            if (dimension != k) {
625                throw new DimensionMismatchException(dimension, k);
626            }
627        }
628    }