View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math4.legacy.stat.descriptive.moment;
18  
19  import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
20  import org.apache.commons.math4.legacy.exception.NullArgumentException;
21  import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
22  import org.apache.commons.math4.legacy.stat.descriptive.AbstractStorelessUnivariateStatistic;
23  import org.apache.commons.math4.legacy.stat.descriptive.WeightedEvaluation;
24  import org.apache.commons.math4.legacy.core.MathArrays;
25  
26  /**
27   * Computes the variance of the available values.  By default, the unbiased
28   * "sample variance" definitional formula is used:
29   * <p>
30   * variance = sum((x_i - mean)^2) / (n - 1) </p>
31   * <p>
32   * where mean is the {@link Mean} and <code>n</code> is the number
33   * of sample observations.</p>
34   * <p>
35   * The definitional formula does not have good numerical properties, so
36   * this implementation does not compute the statistic using the definitional
37   * formula. <ul>
38   * <li> The <code>getResult</code> method computes the variance using
39   * updating formulas based on West's algorithm, as described in
40   * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
41   * J. G. Lewis 1979, <i>Communications of the ACM</i>,
42   * vol. 22 no. 9, pp. 526-531.</a></li>
43   * <li> The <code>evaluate</code> methods leverage the fact that they have the
44   * full array of values in memory to execute a two-pass algorithm.
45   * Specifically, these methods use the "corrected two-pass algorithm" from
46   * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
47   * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
48   * Note that adding values using <code>increment</code> or
49   * <code>incrementAll</code> and then executing <code>getResult</code> will
50   * sometimes give a different, less accurate, result than executing
51   * <code>evaluate</code> with the full array of values. The former approach
52   * should only be used when the full array of values is not available.
53   * <p>
54   * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
55   * be computed using this statistic.  The <code>isBiasCorrected</code>
56   * property determines whether the "population" or "sample" value is
57   * returned by the <code>evaluate</code> and <code>getResult</code> methods.
58   * To compute population variances, set this property to <code>false.</code>
59   * </p>
60   * <p>
61   * <strong>Note that this implementation is not synchronized.</strong> If
62   * multiple threads access an instance of this class concurrently, and at least
63   * one of the threads invokes the <code>increment()</code> or
64   * <code>clear()</code> method, it must be synchronized externally.</p>
65   */
66  public class Variance extends AbstractStorelessUnivariateStatistic implements WeightedEvaluation {
67      /** SecondMoment is used in incremental calculation of Variance. */
68      protected SecondMoment moment;
69  
70      /**
71       * Whether or not {@link #increment(double)} should increment
72       * the internal second moment. When a Variance is constructed with an
73       * external SecondMoment as a constructor parameter, this property is
74       * set to false and increments must be applied to the second moment
75       * directly.
76       */
77      protected boolean incMoment = true;
78  
79      /**
80       * Whether or not bias correction is applied when computing the
81       * value of the statistic. True means that bias is corrected.  See
82       * {@link Variance} for details on the formula.
83       */
84      private boolean isBiasCorrected = true;
85  
86      /**
87       * Constructs a Variance with default (true) <code>isBiasCorrected</code>
88       * property.
89       */
90      public Variance() {
91          moment = new SecondMoment();
92      }
93  
94      /**
95       * Constructs a Variance based on an external second moment.
96       * <p>
97       * When this constructor is used, the statistic may only be
98       * incremented via the moment, i.e., {@link #increment(double)}
99       * does nothing; whereas {@code m2.increment(value)} increments
100      * both {@code m2} and the Variance instance constructed from it.
101      *
102      * @param m2 the SecondMoment (Third or Fourth moments work here as well.)
103      */
104     public Variance(final SecondMoment m2) {
105         incMoment = false;
106         this.moment = m2;
107     }
108 
109     /**
110      * Constructs a Variance with the specified <code>isBiasCorrected</code>
111      * property.
112      *
113      * @param isBiasCorrected  setting for bias correction - true means
114      * bias will be corrected and is equivalent to using the "no arg"
115      * constructor
116      */
117     public Variance(boolean isBiasCorrected) {
118         moment = new SecondMoment();
119         this.isBiasCorrected = isBiasCorrected;
120     }
121 
122     /**
123      * Constructs a Variance with the specified <code>isBiasCorrected</code>
124      * property and the supplied external second moment.
125      *
126      * @param isBiasCorrected  setting for bias correction - true means
127      * bias will be corrected
128      * @param m2 the SecondMoment (Third or Fourth moments work
129      * here as well.)
130      */
131     public Variance(boolean isBiasCorrected, SecondMoment m2) {
132         incMoment = false;
133         this.moment = m2;
134         this.isBiasCorrected = isBiasCorrected;
135     }
136 
137     /**
138      * Copy constructor, creates a new {@code Variance} identical
139      * to the {@code original}.
140      *
141      * @param original the {@code Variance} instance to copy
142      * @throws NullArgumentException if original is null
143      */
144     public Variance(Variance original) throws NullArgumentException {
145         copy(original, this);
146     }
147 
148     /**
149      * {@inheritDoc}
150      * <p>If all values are available, it is more accurate to use
151      * {@link #evaluate(double[])} rather than adding values one at a time
152      * using this method and then executing {@link #getResult}, since
153      * <code>evaluate</code> leverages the fact that is has the full
154      * list of values together to execute a two-pass algorithm.
155      * See {@link Variance}.</p>
156      *
157      * <p>Note also that when {@link #Variance(SecondMoment)} is used to
158      * create a Variance, this method does nothing. In that case, the
159      * SecondMoment should be incremented directly.</p>
160      */
161     @Override
162     public void increment(final double d) {
163         if (incMoment) {
164             moment.increment(d);
165         }
166     }
167 
168     /**
169      * {@inheritDoc}
170      */
171     @Override
172     public double getResult() {
173         if (moment.n == 0) {
174             return Double.NaN;
175         } else if (moment.n == 1) {
176             return 0d;
177         } else {
178             if (isBiasCorrected) {
179                 return moment.m2 / (moment.n - 1d);
180             } else {
181                 return moment.m2 / (moment.n);
182             }
183         }
184     }
185 
186     /**
187      * {@inheritDoc}
188      */
189     @Override
190     public long getN() {
191         return moment.getN();
192     }
193 
194     /**
195      * {@inheritDoc}
196      */
197     @Override
198     public void clear() {
199         if (incMoment) {
200             moment.clear();
201         }
202     }
203 
204     /**
205      * Returns the variance of the entries in the input array, or
206      * <code>Double.NaN</code> if the array is empty.
207      * <p>
208      * See {@link Variance} for details on the computing algorithm.</p>
209      * <p>
210      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
211      * <p>
212      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
213      * <p>
214      * Does not change the internal state of the statistic.</p>
215      *
216      * @param values the input array
217      * @return the variance of the values or Double.NaN if length = 0
218      * @throws MathIllegalArgumentException if the array is null
219      */
220     @Override
221     public double evaluate(final double[] values) throws MathIllegalArgumentException {
222         if (values == null) {
223             throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
224         }
225         return evaluate(values, 0, values.length);
226     }
227 
228     /**
229      * Returns the variance of the entries in the specified portion of
230      * the input array, or <code>Double.NaN</code> if the designated subarray
231      * is empty.  Note that Double.NaN may also be returned if the input
232      * includes NaN and / or infinite values.
233      * <p>
234      * See {@link Variance} for details on the computing algorithm.</p>
235      * <p>
236      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
237      * <p>
238      * Does not change the internal state of the statistic.</p>
239      * <p>
240      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
241      *
242      * @param values the input array
243      * @param begin index of the first array element to include
244      * @param length the number of elements to include
245      * @return the variance of the values or Double.NaN if length = 0
246      * @throws MathIllegalArgumentException if the array is null or the array index
247      *  parameters are not valid
248      */
249     @Override
250     public double evaluate(final double[] values, final int begin, final int length)
251         throws MathIllegalArgumentException {
252 
253         double var = Double.NaN;
254 
255         if (MathArrays.verifyValues(values, begin, length)) {
256             if (length == 1) {
257                 var = 0.0;
258             } else if (length > 1) {
259                 Mean mean = new Mean();
260                 double m = mean.evaluate(values, begin, length);
261                 var = evaluate(values, m, begin, length);
262             }
263         }
264         return var;
265     }
266 
267     /**
268      * <p>Returns the weighted variance of the entries in the specified portion of
269      * the input array, or <code>Double.NaN</code> if the designated subarray
270      * is empty.</p>
271      * <p>
272      * Uses the formula <div style="white-space: pre"><code>
273      *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
274      * </code></div>
275      * where weightedMean is the weighted mean
276      * <p>
277      * This formula will not return the same result as the unweighted variance when all
278      * weights are equal, unless all weights are equal to 1. The formula assumes that
279      * weights are to be treated as "expansion values," as will be the case if for example
280      * the weights represent frequency counts. To normalize weights so that the denominator
281      * in the variance computation equals the length of the input vector minus one, use <pre>
282      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
283      * </pre>
284      * <p>
285      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
286      * <p>
287      * Throws <code>IllegalArgumentException</code> if any of the following are true:
288      * <ul><li>the values array is null</li>
289      *     <li>the weights array is null</li>
290      *     <li>the weights array does not have the same length as the values array</li>
291      *     <li>the weights array contains one or more infinite values</li>
292      *     <li>the weights array contains one or more NaN values</li>
293      *     <li>the weights array contains negative values</li>
294      *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
295      *     <li>the start and length arguments do not determine a valid array</li>
296      * </ul>
297      * <p>
298      * Does not change the internal state of the statistic.</p>
299      * <p>
300      * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
301      *
302      * @param values the input array
303      * @param weights the weights array
304      * @param begin index of the first array element to include
305      * @param length the number of elements to include
306      * @return the weighted variance of the values or Double.NaN if length = 0
307      * @throws MathIllegalArgumentException if the parameters are not valid
308      * @since 2.1
309      */
310     @Override
311     public double evaluate(final double[] values, final double[] weights,
312                            final int begin, final int length) throws MathIllegalArgumentException {
313 
314         double var = Double.NaN;
315 
316         if (MathArrays.verifyValues(values, weights, begin, length)) {
317             if (length == 1) {
318                 var = 0.0;
319             } else if (length > 1) {
320                 Mean mean = new Mean();
321                 double m = mean.evaluate(values, weights, begin, length);
322                 var = evaluate(values, weights, m, begin, length);
323             }
324         }
325         return var;
326     }
327 
328     /**
329      * <p>
330      * Returns the weighted variance of the entries in the input array.</p>
331      * <p>
332      * Uses the formula <div style="white-space:pre"><code>
333      *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
334      * </code></div>
335      * where weightedMean is the weighted mean
336      * <p>
337      * This formula will not return the same result as the unweighted variance when all
338      * weights are equal, unless all weights are equal to 1. The formula assumes that
339      * weights are to be treated as "expansion values," as will be the case if for example
340      * the weights represent frequency counts. To normalize weights so that the denominator
341      * in the variance computation equals the length of the input vector minus one, use <pre>
342      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
343      * </pre>
344      * <p>
345      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
346      * <p>
347      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
348      * <ul><li>the values array is null</li>
349      *     <li>the weights array is null</li>
350      *     <li>the weights array does not have the same length as the values array</li>
351      *     <li>the weights array contains one or more infinite values</li>
352      *     <li>the weights array contains one or more NaN values</li>
353      *     <li>the weights array contains negative values</li>
354      *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
355      * </ul>
356      * <p>
357      * Does not change the internal state of the statistic.</p>
358      * <p>
359      * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
360      *
361      * @param values the input array
362      * @param weights the weights array
363      * @return the weighted variance of the values
364      * @throws MathIllegalArgumentException if the parameters are not valid
365      * @since 2.1
366      */
367     @Override
368     public double evaluate(final double[] values, final double[] weights)
369         throws MathIllegalArgumentException {
370         return evaluate(values, weights, 0, values.length);
371     }
372 
373     /**
374      * Returns the variance of the entries in the specified portion of
375      * the input array, using the precomputed mean value.  Returns
376      * <code>Double.NaN</code> if the designated subarray is empty.
377      * <p>
378      * See {@link Variance} for details on the computing algorithm.</p>
379      * <p>
380      * The formula used assumes that the supplied mean value is the arithmetic
381      * mean of the sample data, not a known population parameter.  This method
382      * is supplied only to save computation when the mean has already been
383      * computed.</p>
384      * <p>
385      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
386      * <p>
387      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
388      * <p>
389      * Does not change the internal state of the statistic.</p>
390      *
391      * @param values the input array
392      * @param mean the precomputed mean value
393      * @param begin index of the first array element to include
394      * @param length the number of elements to include
395      * @return the variance of the values or Double.NaN if length = 0
396      * @throws MathIllegalArgumentException if the array is null or the array index
397      *  parameters are not valid
398      */
399     public double evaluate(final double[] values, final double mean,
400                            final int begin, final int length) throws MathIllegalArgumentException {
401 
402         double var = Double.NaN;
403 
404         if (MathArrays.verifyValues(values, begin, length)) {
405             if (length == 1) {
406                 var = 0.0;
407             } else if (length > 1) {
408                 double accum = 0.0;
409                 double dev = 0.0;
410                 double accum2 = 0.0;
411                 for (int i = begin; i < begin + length; i++) {
412                     dev = values[i] - mean;
413                     accum += dev * dev;
414                     accum2 += dev;
415                 }
416                 double len = length;
417                 if (isBiasCorrected) {
418                     var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
419                 } else {
420                     var = (accum - (accum2 * accum2 / len)) / len;
421                 }
422             }
423         }
424         return var;
425     }
426 
427     /**
428      * Returns the variance of the entries in the input array, using the
429      * precomputed mean value.  Returns <code>Double.NaN</code> if the array
430      * is empty.
431      * <p>
432      * See {@link Variance} for details on the computing algorithm.</p>
433      * <p>
434      * If <code>isBiasCorrected</code> is <code>true</code> the formula used
435      * assumes that the supplied mean value is the arithmetic mean of the
436      * sample data, not a known population parameter.  If the mean is a known
437      * population parameter, or if the "population" version of the variance is
438      * desired, set <code>isBiasCorrected</code> to <code>false</code> before
439      * invoking this method.</p>
440      * <p>
441      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
442      * <p>
443      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
444      * <p>
445      * Does not change the internal state of the statistic.</p>
446      *
447      * @param values the input array
448      * @param mean the precomputed mean value
449      * @return the variance of the values or Double.NaN if the array is empty
450      * @throws MathIllegalArgumentException if the array is null
451      */
452     public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
453         return evaluate(values, mean, 0, values.length);
454     }
455 
456     /**
457      * Returns the weighted variance of the entries in the specified portion of
458      * the input array, using the precomputed weighted mean value.  Returns
459      * <code>Double.NaN</code> if the designated subarray is empty.
460      * <p>
461      * Uses the formula <div style="white-space:pre"><code>
462      *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
463      * </code></div>
464      * <p>
465      * The formula used assumes that the supplied mean value is the weighted arithmetic
466      * mean of the sample data, not a known population parameter. This method
467      * is supplied only to save computation when the mean has already been
468      * computed.</p>
469      * <p>
470      * This formula will not return the same result as the unweighted variance when all
471      * weights are equal, unless all weights are equal to 1. The formula assumes that
472      * weights are to be treated as "expansion values," as will be the case if for example
473      * the weights represent frequency counts. To normalize weights so that the denominator
474      * in the variance computation equals the length of the input vector minus one, use <pre>
475      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
476      * </pre>
477      * <p>
478      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
479      * <p>
480      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
481      * <ul><li>the values array is null</li>
482      *     <li>the weights array is null</li>
483      *     <li>the weights array does not have the same length as the values array</li>
484      *     <li>the weights array contains one or more infinite values</li>
485      *     <li>the weights array contains one or more NaN values</li>
486      *     <li>the weights array contains negative values</li>
487      *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
488      *     <li>the start and length arguments do not determine a valid array</li>
489      * </ul>
490      * <p>
491      * Does not change the internal state of the statistic.</p>
492      *
493      * @param values the input array
494      * @param weights the weights array
495      * @param mean the precomputed weighted mean value
496      * @param begin index of the first array element to include
497      * @param length the number of elements to include
498      * @return the variance of the values or Double.NaN if length = 0
499      * @throws MathIllegalArgumentException if the parameters are not valid
500      * @since 2.1
501      */
502     public double evaluate(final double[] values, final double[] weights,
503                            final double mean, final int begin, final int length)
504         throws MathIllegalArgumentException {
505 
506         double var = Double.NaN;
507 
508         if (MathArrays.verifyValues(values, weights, begin, length)) {
509             if (length == 1) {
510                 var = 0.0;
511             } else if (length > 1) {
512                 double accum = 0.0;
513                 double dev = 0.0;
514                 double accum2 = 0.0;
515                 for (int i = begin; i < begin + length; i++) {
516                     dev = values[i] - mean;
517                     accum += weights[i] * (dev * dev);
518                     accum2 += weights[i] * dev;
519                 }
520 
521                 double sumWts = 0;
522                 for (int i = begin; i < begin + length; i++) {
523                     sumWts += weights[i];
524                 }
525 
526                 if (isBiasCorrected) {
527                     // Note: For this to be valid the weights should correspond to counts
528                     // of each observation where the weights are positive integers; the
529                     // sum of the weights is the total number of observations and should
530                     // be at least 2.
531                     var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
532                 } else {
533                     var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
534                 }
535             }
536         }
537         return var;
538     }
539 
540     /**
541      * <p>Returns the weighted variance of the values in the input array, using
542      * the precomputed weighted mean value.</p>
543      * <p>
544      * Uses the formula <div style="white-space:pre"><code>
545      *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
546      * </code></div>
547      * <p>
548      * The formula used assumes that the supplied mean value is the weighted arithmetic
549      * mean of the sample data, not a known population parameter. This method
550      * is supplied only to save computation when the mean has already been
551      * computed.</p>
552      * <p>
553      * This formula will not return the same result as the unweighted variance when all
554      * weights are equal, unless all weights are equal to 1. The formula assumes that
555      * weights are to be treated as "expansion values," as will be the case if for example
556      * the weights represent frequency counts. To normalize weights so that the denominator
557      * in the variance computation equals the length of the input vector minus one, use <pre>
558      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
559      * </pre>
560      * <p>
561      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
562      * <p>
563      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
564      * <ul><li>the values array is null</li>
565      *     <li>the weights array is null</li>
566      *     <li>the weights array does not have the same length as the values array</li>
567      *     <li>the weights array contains one or more infinite values</li>
568      *     <li>the weights array contains one or more NaN values</li>
569      *     <li>the weights array contains negative values</li>
570      *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
571      * </ul>
572      * <p>
573      * Does not change the internal state of the statistic.</p>
574      *
575      * @param values the input array
576      * @param weights the weights array
577      * @param mean the precomputed weighted mean value
578      * @return the variance of the values or Double.NaN if length = 0
579      * @throws MathIllegalArgumentException if the parameters are not valid
580      * @since 2.1
581      */
582     public double evaluate(final double[] values, final double[] weights, final double mean)
583         throws MathIllegalArgumentException {
584         return evaluate(values, weights, mean, 0, values.length);
585     }
586 
587     /**
588      * @return Returns the isBiasCorrected.
589      */
590     public boolean isBiasCorrected() {
591         return isBiasCorrected;
592     }
593 
594     /**
595      * @param biasCorrected The isBiasCorrected to set.
596      */
597     public void setBiasCorrected(boolean biasCorrected) {
598         this.isBiasCorrected = biasCorrected;
599     }
600 
601     /**
602      * {@inheritDoc}
603      */
604     @Override
605     public Variance copy() {
606         Variance result = new Variance();
607         // No try-catch or advertised exception because parameters are guaranteed non-null
608         copy(this, result);
609         return result;
610     }
611 
612     /**
613      * Copies source to dest.
614      * <p>Neither source nor dest can be null.</p>
615      *
616      * @param source Variance to copy
617      * @param dest Variance to copy to
618      * @throws NullArgumentException if either source or dest is null
619      */
620     public static void copy(Variance source, Variance dest)
621         throws NullArgumentException {
622         NullArgumentException.check(source);
623         NullArgumentException.check(dest);
624         dest.moment = source.moment.copy();
625         dest.isBiasCorrected = source.isBiasCorrected;
626         dest.incMoment = source.incMoment;
627     }
628 }