001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive.moment;
018    
019    import java.io.Serializable;
020    
021    import org.apache.commons.math.exception.NullArgumentException;
022    import org.apache.commons.math.exception.util.LocalizedFormats;
023    import org.apache.commons.math.stat.descriptive.WeightedEvaluation;
024    import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic;
025    import org.apache.commons.math.util.MathUtils;
026    
027    /**
028     * Computes the variance of the available values.  By default, the unbiased
029     * "sample variance" definitional formula is used:
030     * <p>
031     * variance = sum((x_i - mean)^2) / (n - 1) </p>
032     * <p>
033     * where mean is the {@link Mean} and <code>n</code> is the number
034     * of sample observations.</p>
035     * <p>
036     * The definitional formula does not have good numerical properties, so
037     * this implementation does not compute the statistic using the definitional
038     * formula. <ul>
039     * <li> The <code>getResult</code> method computes the variance using
040     * updating formulas based on West's algorithm, as described in
041     * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
042     * J. G. Lewis 1979, <i>Communications of the ACM</i>,
043     * vol. 22 no. 9, pp. 526-531.</a></li>
044     * <li> The <code>evaluate</code> methods leverage the fact that they have the
045     * full array of values in memory to execute a two-pass algorithm.
046     * Specifically, these methods use the "corrected two-pass algorithm" from
047     * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
048     * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
049     * Note that adding values using <code>increment</code> or
050     * <code>incrementAll</code> and then executing <code>getResult</code> will
051     * sometimes give a different, less accurate, result than executing
052     * <code>evaluate</code> with the full array of values. The former approach
053     * should only be used when the full array of values is not available.</p>
054     * <p>
055     * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
056     * be computed using this statistic.  The <code>isBiasCorrected</code>
057     * property determines whether the "population" or "sample" value is
058     * returned by the <code>evaluate</code> and <code>getResult</code> methods.
059     * To compute population variances, set this property to <code>false.</code>
060     * </p>
061     * <p>
062     * <strong>Note that this implementation is not synchronized.</strong> If
063     * multiple threads access an instance of this class concurrently, and at least
064     * one of the threads invokes the <code>increment()</code> or
065     * <code>clear()</code> method, it must be synchronized externally.</p>
066     *
067     * @version $Id: Variance.java 1182147 2011-10-11 23:37:57Z erans $
068     */
069    public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
070    
071        /** Serializable version identifier */
072        private static final long serialVersionUID = -9111962718267217978L;
073    
074        /** SecondMoment is used in incremental calculation of Variance*/
075        protected SecondMoment moment = null;
076    
077        /**
078         * Boolean test to determine if this Variance should also increment
079         * the second moment, this evaluates to false when this Variance is
080         * constructed with an external SecondMoment as a parameter.
081         */
082        protected boolean incMoment = true;
083    
084        /**
085         * Determines whether or not bias correction is applied when computing the
086         * value of the statisic.  True means that bias is corrected.  See
087         * {@link Variance} for details on the formula.
088         */
089        private boolean isBiasCorrected = true;
090    
091        /**
092         * Constructs a Variance with default (true) <code>isBiasCorrected</code>
093         * property.
094         */
095        public Variance() {
096            moment = new SecondMoment();
097        }
098    
099        /**
100         * Constructs a Variance based on an external second moment.
101         *
102         * @param m2 the SecondMoment (Third or Fourth moments work
103         * here as well.)
104         */
105        public Variance(final SecondMoment m2) {
106            incMoment = false;
107            this.moment = m2;
108        }
109    
110        /**
111         * Constructs a Variance with the specified <code>isBiasCorrected</code>
112         * property
113         *
114         * @param isBiasCorrected  setting for bias correction - true means
115         * bias will be corrected and is equivalent to using the argumentless
116         * constructor
117         */
118        public Variance(boolean isBiasCorrected) {
119            moment = new SecondMoment();
120            this.isBiasCorrected = isBiasCorrected;
121        }
122    
123        /**
124         * Constructs a Variance with the specified <code>isBiasCorrected</code>
125         * property and the supplied external second moment.
126         *
127         * @param isBiasCorrected  setting for bias correction - true means
128         * bias will be corrected
129         * @param m2 the SecondMoment (Third or Fourth moments work
130         * here as well.)
131         */
132        public Variance(boolean isBiasCorrected, SecondMoment m2) {
133            incMoment = false;
134            this.moment = m2;
135            this.isBiasCorrected = isBiasCorrected;
136        }
137    
138        /**
139         * Copy constructor, creates a new {@code Variance} identical
140         * to the {@code original}
141         *
142         * @param original the {@code Variance} instance to copy
143         */
144        public Variance(Variance original) {
145            copy(original, this);
146        }
147    
148        /**
149         * {@inheritDoc}
150         * <p>If all values are available, it is more accurate to use
151         * {@link #evaluate(double[])} rather than adding values one at a time
152         * using this method and then executing {@link #getResult}, since
153         * <code>evaluate</code> leverages the fact that is has the full
154         * list of values together to execute a two-pass algorithm.
155         * See {@link Variance}.</p>
156         */
157        @Override
158        public void increment(final double d) {
159            if (incMoment) {
160                moment.increment(d);
161            }
162        }
163    
164        /**
165         * {@inheritDoc}
166         */
167        @Override
168        public double getResult() {
169                if (moment.n == 0) {
170                    return Double.NaN;
171                } else if (moment.n == 1) {
172                    return 0d;
173                } else {
174                    if (isBiasCorrected) {
175                        return moment.m2 / (moment.n - 1d);
176                    } else {
177                        return moment.m2 / (moment.n);
178                    }
179                }
180        }
181    
182        /**
183         * {@inheritDoc}
184         */
185        public long getN() {
186            return moment.getN();
187        }
188    
189        /**
190         * {@inheritDoc}
191         */
192        @Override
193        public void clear() {
194            if (incMoment) {
195                moment.clear();
196            }
197        }
198    
199        /**
200         * Returns the variance of the entries in the input array, or
201         * <code>Double.NaN</code> if the array is empty.
202         * <p>
203         * See {@link Variance} for details on the computing algorithm.</p>
204         * <p>
205         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
206         * <p>
207         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
208         * <p>
209         * Does not change the internal state of the statistic.</p>
210         *
211         * @param values the input array
212         * @return the variance of the values or Double.NaN if length = 0
213         * @throws IllegalArgumentException if the array is null
214         */
215        @Override
216        public double evaluate(final double[] values) {
217            if (values == null) {
218                throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
219            }
220            return evaluate(values, 0, values.length);
221        }
222    
223        /**
224         * Returns the variance of the entries in the specified portion of
225         * the input array, or <code>Double.NaN</code> if the designated subarray
226         * is empty.
227         * <p>
228         * See {@link Variance} for details on the computing algorithm.</p>
229         * <p>
230         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
231         * <p>
232         * Does not change the internal state of the statistic.</p>
233         * <p>
234         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
235         *
236         * @param values the input array
237         * @param begin index of the first array element to include
238         * @param length the number of elements to include
239         * @return the variance of the values or Double.NaN if length = 0
240         * @throws IllegalArgumentException if the array is null or the array index
241         *  parameters are not valid
242         */
243        @Override
244        public double evaluate(final double[] values, final int begin, final int length) {
245    
246            double var = Double.NaN;
247    
248            if (test(values, begin, length)) {
249                clear();
250                if (length == 1) {
251                    var = 0.0;
252                } else if (length > 1) {
253                    Mean mean = new Mean();
254                    double m = mean.evaluate(values, begin, length);
255                    var = evaluate(values, m, begin, length);
256                }
257            }
258            return var;
259        }
260    
261        /**
262         * <p>Returns the weighted variance of the entries in the specified portion of
263         * the input array, or <code>Double.NaN</code> if the designated subarray
264         * is empty.</p>
265         * <p>
266         * Uses the formula <pre>
267         *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
268         * </pre>
269         * where weightedMean is the weighted mean</p>
270         * <p>
271         * This formula will not return the same result as the unweighted variance when all
272         * weights are equal, unless all weights are equal to 1. The formula assumes that
273         * weights are to be treated as "expansion values," as will be the case if for example
274         * the weights represent frequency counts. To normalize weights so that the denominator
275         * in the variance computation equals the length of the input vector minus one, use <pre>
276         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
277         * </pre>
278         * <p>
279         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
280         * <p>
281         * Throws <code>IllegalArgumentException</code> if any of the following are true:
282         * <ul><li>the values array is null</li>
283         *     <li>the weights array is null</li>
284         *     <li>the weights array does not have the same length as the values array</li>
285         *     <li>the weights array contains one or more infinite values</li>
286         *     <li>the weights array contains one or more NaN values</li>
287         *     <li>the weights array contains negative values</li>
288         *     <li>the start and length arguments do not determine a valid array</li>
289         * </ul></p>
290         * <p>
291         * Does not change the internal state of the statistic.</p>
292         * <p>
293         * Throws <code>IllegalArgumentException</code> if either array is null.</p>
294         *
295         * @param values the input array
296         * @param weights the weights array
297         * @param begin index of the first array element to include
298         * @param length the number of elements to include
299         * @return the weighted variance of the values or Double.NaN if length = 0
300         * @throws IllegalArgumentException if the parameters are not valid
301         * @since 2.1
302         */
303        public double evaluate(final double[] values, final double[] weights,
304                               final int begin, final int length) {
305    
306            double var = Double.NaN;
307    
308            if (test(values, weights,begin, length)) {
309                clear();
310                if (length == 1) {
311                    var = 0.0;
312                } else if (length > 1) {
313                    Mean mean = new Mean();
314                    double m = mean.evaluate(values, weights, begin, length);
315                    var = evaluate(values, weights, m, begin, length);
316                }
317            }
318            return var;
319        }
320    
321        /**
322         * <p>
323         * Returns the weighted variance of the entries in the the input array.</p>
324         * <p>
325         * Uses the formula <pre>
326         *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
327         * </pre>
328         * where weightedMean is the weighted mean</p>
329         * <p>
330         * This formula will not return the same result as the unweighted variance when all
331         * weights are equal, unless all weights are equal to 1. The formula assumes that
332         * weights are to be treated as "expansion values," as will be the case if for example
333         * the weights represent frequency counts. To normalize weights so that the denominator
334         * in the variance computation equals the length of the input vector minus one, use <pre>
335         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
336         * </pre>
337         * <p>
338         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
339         * <p>
340         * Throws <code>IllegalArgumentException</code> if any of the following are true:
341         * <ul><li>the values array is null</li>
342         *     <li>the weights array is null</li>
343         *     <li>the weights array does not have the same length as the values array</li>
344         *     <li>the weights array contains one or more infinite values</li>
345         *     <li>the weights array contains one or more NaN values</li>
346         *     <li>the weights array contains negative values</li>
347         * </ul></p>
348         * <p>
349         * Does not change the internal state of the statistic.</p>
350         * <p>
351         * Throws <code>IllegalArgumentException</code> if either array is null.</p>
352         *
353         * @param values the input array
354         * @param weights the weights array
355         * @return the weighted variance of the values
356         * @throws IllegalArgumentException if the parameters are not valid
357         * @since 2.1
358         */
359        public double evaluate(final double[] values, final double[] weights) {
360            return evaluate(values, weights, 0, values.length);
361        }
362    
363        /**
364         * Returns the variance of the entries in the specified portion of
365         * the input array, using the precomputed mean value.  Returns
366         * <code>Double.NaN</code> if the designated subarray is empty.
367         * <p>
368         * See {@link Variance} for details on the computing algorithm.</p>
369         * <p>
370         * The formula used assumes that the supplied mean value is the arithmetic
371         * mean of the sample data, not a known population parameter.  This method
372         * is supplied only to save computation when the mean has already been
373         * computed.</p>
374         * <p>
375         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
376         * <p>
377         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
378         * <p>
379         * Does not change the internal state of the statistic.</p>
380         *
381         * @param values the input array
382         * @param mean the precomputed mean value
383         * @param begin index of the first array element to include
384         * @param length the number of elements to include
385         * @return the variance of the values or Double.NaN if length = 0
386         * @throws IllegalArgumentException if the array is null or the array index
387         *  parameters are not valid
388         */
389        public double evaluate(final double[] values, final double mean,
390                final int begin, final int length) {
391    
392            double var = Double.NaN;
393    
394            if (test(values, begin, length)) {
395                if (length == 1) {
396                    var = 0.0;
397                } else if (length > 1) {
398                    double accum = 0.0;
399                    double dev = 0.0;
400                    double accum2 = 0.0;
401                    for (int i = begin; i < begin + length; i++) {
402                        dev = values[i] - mean;
403                        accum += dev * dev;
404                        accum2 += dev;
405                    }
406                    double len = length;
407                    if (isBiasCorrected) {
408                        var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
409                    } else {
410                        var = (accum - (accum2 * accum2 / len)) / len;
411                    }
412                }
413            }
414            return var;
415        }
416    
417        /**
418         * Returns the variance of the entries in the input array, using the
419         * precomputed mean value.  Returns <code>Double.NaN</code> if the array
420         * is empty.
421         * <p>
422         * See {@link Variance} for details on the computing algorithm.</p>
423         * <p>
424         * If <code>isBiasCorrected</code> is <code>true</code> the formula used
425         * assumes that the supplied mean value is the arithmetic mean of the
426         * sample data, not a known population parameter.  If the mean is a known
427         * population parameter, or if the "population" version of the variance is
428         * desired, set <code>isBiasCorrected</code> to <code>false</code> before
429         * invoking this method.</p>
430         * <p>
431         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
432         * <p>
433         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
434         * <p>
435         * Does not change the internal state of the statistic.</p>
436         *
437         * @param values the input array
438         * @param mean the precomputed mean value
439         * @return the variance of the values or Double.NaN if the array is empty
440         * @throws IllegalArgumentException if the array is null
441         */
442        public double evaluate(final double[] values, final double mean) {
443            return evaluate(values, mean, 0, values.length);
444        }
445    
446        /**
447         * Returns the weighted variance of the entries in the specified portion of
448         * the input array, using the precomputed weighted mean value.  Returns
449         * <code>Double.NaN</code> if the designated subarray is empty.
450         * <p>
451         * Uses the formula <pre>
452         *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
453         * </pre></p>
454         * <p>
455         * The formula used assumes that the supplied mean value is the weighted arithmetic
456         * mean of the sample data, not a known population parameter. This method
457         * is supplied only to save computation when the mean has already been
458         * computed.</p>
459         * <p>
460         * This formula will not return the same result as the unweighted variance when all
461         * weights are equal, unless all weights are equal to 1. The formula assumes that
462         * weights are to be treated as "expansion values," as will be the case if for example
463         * the weights represent frequency counts. To normalize weights so that the denominator
464         * in the variance computation equals the length of the input vector minus one, use <pre>
465         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
466         * </pre>
467         * <p>
468         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
469         * <p>
470         * Throws <code>IllegalArgumentException</code> if any of the following are true:
471         * <ul><li>the values array is null</li>
472         *     <li>the weights array is null</li>
473         *     <li>the weights array does not have the same length as the values array</li>
474         *     <li>the weights array contains one or more infinite values</li>
475         *     <li>the weights array contains one or more NaN values</li>
476         *     <li>the weights array contains negative values</li>
477         *     <li>the start and length arguments do not determine a valid array</li>
478         * </ul></p>
479         * <p>
480         * Does not change the internal state of the statistic.</p>
481         *
482         * @param values the input array
483         * @param weights the weights array
484         * @param mean the precomputed weighted mean value
485         * @param begin index of the first array element to include
486         * @param length the number of elements to include
487         * @return the variance of the values or Double.NaN if length = 0
488         * @throws IllegalArgumentException if the parameters are not valid
489         * @since 2.1
490         */
491        public double evaluate(final double[] values, final double[] weights,
492                               final double mean, final int begin, final int length) {
493    
494            double var = Double.NaN;
495    
496            if (test(values, weights, begin, length)) {
497                if (length == 1) {
498                    var = 0.0;
499                } else if (length > 1) {
500                    double accum = 0.0;
501                    double dev = 0.0;
502                    double accum2 = 0.0;
503                    for (int i = begin; i < begin + length; i++) {
504                        dev = values[i] - mean;
505                        accum += weights[i] * (dev * dev);
506                        accum2 += weights[i] * dev;
507                    }
508    
509                    double sumWts = 0;
510                    for (int i = 0; i < weights.length; i++) {
511                        sumWts += weights[i];
512                    }
513    
514                    if (isBiasCorrected) {
515                        var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
516                    } else {
517                        var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
518                    }
519                }
520            }
521            return var;
522        }
523    
524        /**
525         * <p>Returns the weighted variance of the values in the input array, using
526         * the precomputed weighted mean value.</p>
527         * <p>
528         * Uses the formula <pre>
529         *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
530         * </pre></p>
531         * <p>
532         * The formula used assumes that the supplied mean value is the weighted arithmetic
533         * mean of the sample data, not a known population parameter. This method
534         * is supplied only to save computation when the mean has already been
535         * computed.</p>
536         * <p>
537         * This formula will not return the same result as the unweighted variance when all
538         * weights are equal, unless all weights are equal to 1. The formula assumes that
539         * weights are to be treated as "expansion values," as will be the case if for example
540         * the weights represent frequency counts. To normalize weights so that the denominator
541         * in the variance computation equals the length of the input vector minus one, use <pre>
542         *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
543         * </pre>
544         * <p>
545         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
546         * <p>
547         * Throws <code>IllegalArgumentException</code> if any of the following are true:
548         * <ul><li>the values array is null</li>
549         *     <li>the weights array is null</li>
550         *     <li>the weights array does not have the same length as the values array</li>
551         *     <li>the weights array contains one or more infinite values</li>
552         *     <li>the weights array contains one or more NaN values</li>
553         *     <li>the weights array contains negative values</li>
554         * </ul></p>
555         * <p>
556         * Does not change the internal state of the statistic.</p>
557         *
558         * @param values the input array
559         * @param weights the weights array
560         * @param mean the precomputed weighted mean value
561         * @return the variance of the values or Double.NaN if length = 0
562         * @throws IllegalArgumentException if the parameters are not valid
563         * @since 2.1
564         */
565        public double evaluate(final double[] values, final double[] weights, final double mean) {
566            return evaluate(values, weights, mean, 0, values.length);
567        }
568    
569        /**
570         * @return Returns the isBiasCorrected.
571         */
572        public boolean isBiasCorrected() {
573            return isBiasCorrected;
574        }
575    
576        /**
577         * @param biasCorrected The isBiasCorrected to set.
578         */
579        public void setBiasCorrected(boolean biasCorrected) {
580            this.isBiasCorrected = biasCorrected;
581        }
582    
583        /**
584         * {@inheritDoc}
585         */
586        @Override
587        public Variance copy() {
588            Variance result = new Variance();
589            copy(this, result);
590            return result;
591        }
592    
593        /**
594         * Copies source to dest.
595         * <p>Neither source nor dest can be null.</p>
596         *
597         * @param source Variance to copy
598         * @param dest Variance to copy to
599         * @throws NullArgumentException if either source or dest is null
600         */
601        public static void copy(Variance source, Variance dest)
602            throws NullArgumentException {
603            MathUtils.checkNotNull(source);
604            MathUtils.checkNotNull(dest);
605            dest.setData(source.getDataRef());
606            dest.moment = source.moment.copy();
607            dest.isBiasCorrected = source.isBiasCorrected;
608            dest.incMoment = source.incMoment;
609        }
610    }