View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math3.stat.descriptive.moment;
18  
19  import java.io.Serializable;
20  
21  import org.apache.commons.math3.exception.MathIllegalArgumentException;
22  import org.apache.commons.math3.exception.NullArgumentException;
23  import org.apache.commons.math3.exception.util.LocalizedFormats;
24  import org.apache.commons.math3.stat.descriptive.WeightedEvaluation;
25  import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic;
26  import org.apache.commons.math3.util.MathUtils;
27  
28  /**
29   * Computes the variance of the available values.  By default, the unbiased
30   * "sample variance" definitional formula is used:
31   * <p>
32   * variance = sum((x_i - mean)^2) / (n - 1) </p>
33   * <p>
34   * where mean is the {@link Mean} and <code>n</code> is the number
35   * of sample observations.</p>
36   * <p>
37   * The definitional formula does not have good numerical properties, so
38   * this implementation does not compute the statistic using the definitional
39   * formula. <ul>
40   * <li> The <code>getResult</code> method computes the variance using
41   * updating formulas based on West's algorithm, as described in
42   * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
43   * J. G. Lewis 1979, <i>Communications of the ACM</i>,
44   * vol. 22 no. 9, pp. 526-531.</a></li>
45   * <li> The <code>evaluate</code> methods leverage the fact that they have the
46   * full array of values in memory to execute a two-pass algorithm.
47   * Specifically, these methods use the "corrected two-pass algorithm" from
48   * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
49   * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
50   * Note that adding values using <code>increment</code> or
51   * <code>incrementAll</code> and then executing <code>getResult</code> will
52   * sometimes give a different, less accurate, result than executing
53   * <code>evaluate</code> with the full array of values. The former approach
54   * should only be used when the full array of values is not available.</p>
55   * <p>
56   * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
57   * be computed using this statistic.  The <code>isBiasCorrected</code>
58   * property determines whether the "population" or "sample" value is
59   * returned by the <code>evaluate</code> and <code>getResult</code> methods.
60   * To compute population variances, set this property to <code>false.</code>
61   * </p>
62   * <p>
63   * <strong>Note that this implementation is not synchronized.</strong> If
64   * multiple threads access an instance of this class concurrently, and at least
65   * one of the threads invokes the <code>increment()</code> or
66   * <code>clear()</code> method, it must be synchronized externally.</p>
67   *
68   * @version $Id: Variance.java 1416643 2012-12-03 19:37:14Z tn $
69   */
70  public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
71  
72      /** Serializable version identifier */
73      private static final long serialVersionUID = -9111962718267217978L;
74  
75      /** SecondMoment is used in incremental calculation of Variance*/
76      protected SecondMoment moment = null;
77  
78      /**
79       * Whether or not {@link #increment(double)} should increment
80       * the internal second moment. When a Variance is constructed with an
81       * external SecondMoment as a constructor parameter, this property is
82       * set to false and increments must be applied to the second moment
83       * directly.
84       */
85      protected boolean incMoment = true;
86  
87      /**
88       * Whether or not bias correction is applied when computing the
89       * value of the statistic. True means that bias is corrected.  See
90       * {@link Variance} for details on the formula.
91       */
92      private boolean isBiasCorrected = true;
93  
94      /**
95       * Constructs a Variance with default (true) <code>isBiasCorrected</code>
96       * property.
97       */
98      public Variance() {
99          moment = new SecondMoment();
100     }
101 
102     /**
103      * Constructs a Variance based on an external second moment.
104      * When this constructor is used, the statistic may only be
105      * incremented via the moment, i.e., {@link #increment(double)}
106      * does nothing; whereas {@code m2.increment(value)} increments
107      * both {@code m2} and the Variance instance constructed from it.
108      *
109      * @param m2 the SecondMoment (Third or Fourth moments work
110      * here as well.)
111      */
112     public Variance(final SecondMoment m2) {
113         incMoment = false;
114         this.moment = m2;
115     }
116 
117     /**
118      * Constructs a Variance with the specified <code>isBiasCorrected</code>
119      * property
120      *
121      * @param isBiasCorrected  setting for bias correction - true means
122      * bias will be corrected and is equivalent to using the argumentless
123      * constructor
124      */
125     public Variance(boolean isBiasCorrected) {
126         moment = new SecondMoment();
127         this.isBiasCorrected = isBiasCorrected;
128     }
129 
130     /**
131      * Constructs a Variance with the specified <code>isBiasCorrected</code>
132      * property and the supplied external second moment.
133      *
134      * @param isBiasCorrected  setting for bias correction - true means
135      * bias will be corrected
136      * @param m2 the SecondMoment (Third or Fourth moments work
137      * here as well.)
138      */
139     public Variance(boolean isBiasCorrected, SecondMoment m2) {
140         incMoment = false;
141         this.moment = m2;
142         this.isBiasCorrected = isBiasCorrected;
143     }
144 
145     /**
146      * Copy constructor, creates a new {@code Variance} identical
147      * to the {@code original}
148      *
149      * @param original the {@code Variance} instance to copy
150      * @throws NullArgumentException if original is null
151      */
152     public Variance(Variance original) throws NullArgumentException {
153         copy(original, this);
154     }
155 
156     /**
157      * {@inheritDoc}
158      * <p>If all values are available, it is more accurate to use
159      * {@link #evaluate(double[])} rather than adding values one at a time
160      * using this method and then executing {@link #getResult}, since
161      * <code>evaluate</code> leverages the fact that is has the full
162      * list of values together to execute a two-pass algorithm.
163      * See {@link Variance}.</p>
164      *
165      * <p>Note also that when {@link #Variance(SecondMoment)} is used to
166      * create a Variance, this method does nothing. In that case, the
167      * SecondMoment should be incremented directly.</p>
168      */
169     @Override
170     public void increment(final double d) {
171         if (incMoment) {
172             moment.increment(d);
173         }
174     }
175 
176     /**
177      * {@inheritDoc}
178      */
179     @Override
180     public double getResult() {
181             if (moment.n == 0) {
182                 return Double.NaN;
183             } else if (moment.n == 1) {
184                 return 0d;
185             } else {
186                 if (isBiasCorrected) {
187                     return moment.m2 / (moment.n - 1d);
188                 } else {
189                     return moment.m2 / (moment.n);
190                 }
191             }
192     }
193 
194     /**
195      * {@inheritDoc}
196      */
197     public long getN() {
198         return moment.getN();
199     }
200 
201     /**
202      * {@inheritDoc}
203      */
204     @Override
205     public void clear() {
206         if (incMoment) {
207             moment.clear();
208         }
209     }
210 
211     /**
212      * Returns the variance of the entries in the input array, or
213      * <code>Double.NaN</code> if the array is empty.
214      * <p>
215      * See {@link Variance} for details on the computing algorithm.</p>
216      * <p>
217      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
218      * <p>
219      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
220      * <p>
221      * Does not change the internal state of the statistic.</p>
222      *
223      * @param values the input array
224      * @return the variance of the values or Double.NaN if length = 0
225      * @throws MathIllegalArgumentException if the array is null
226      */
227     @Override
228     public double evaluate(final double[] values) throws MathIllegalArgumentException {
229         if (values == null) {
230             throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
231         }
232         return evaluate(values, 0, values.length);
233     }
234 
235     /**
236      * Returns the variance of the entries in the specified portion of
237      * the input array, or <code>Double.NaN</code> if the designated subarray
238      * is empty.
239      * <p>
240      * See {@link Variance} for details on the computing algorithm.</p>
241      * <p>
242      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
243      * <p>
244      * Does not change the internal state of the statistic.</p>
245      * <p>
246      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
247      *
248      * @param values the input array
249      * @param begin index of the first array element to include
250      * @param length the number of elements to include
251      * @return the variance of the values or Double.NaN if length = 0
252      * @throws MathIllegalArgumentException if the array is null or the array index
253      *  parameters are not valid
254      */
255     @Override
256     public double evaluate(final double[] values, final int begin, final int length)
257     throws MathIllegalArgumentException {
258 
259         double var = Double.NaN;
260 
261         if (test(values, begin, length)) {
262             clear();
263             if (length == 1) {
264                 var = 0.0;
265             } else if (length > 1) {
266                 Mean mean = new Mean();
267                 double m = mean.evaluate(values, begin, length);
268                 var = evaluate(values, m, begin, length);
269             }
270         }
271         return var;
272     }
273 
274     /**
275      * <p>Returns the weighted variance of the entries in the specified portion of
276      * the input array, or <code>Double.NaN</code> if the designated subarray
277      * is empty.</p>
278      * <p>
279      * Uses the formula <pre>
280      *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
281      * </pre>
282      * where weightedMean is the weighted mean</p>
283      * <p>
284      * This formula will not return the same result as the unweighted variance when all
285      * weights are equal, unless all weights are equal to 1. The formula assumes that
286      * weights are to be treated as "expansion values," as will be the case if for example
287      * the weights represent frequency counts. To normalize weights so that the denominator
288      * in the variance computation equals the length of the input vector minus one, use <pre>
289      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
290      * </pre>
291      * <p>
292      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
293      * <p>
294      * Throws <code>IllegalArgumentException</code> if any of the following are true:
295      * <ul><li>the values array is null</li>
296      *     <li>the weights array is null</li>
297      *     <li>the weights array does not have the same length as the values array</li>
298      *     <li>the weights array contains one or more infinite values</li>
299      *     <li>the weights array contains one or more NaN values</li>
300      *     <li>the weights array contains negative values</li>
301      *     <li>the start and length arguments do not determine a valid array</li>
302      * </ul></p>
303      * <p>
304      * Does not change the internal state of the statistic.</p>
305      * <p>
306      * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
307      *
308      * @param values the input array
309      * @param weights the weights array
310      * @param begin index of the first array element to include
311      * @param length the number of elements to include
312      * @return the weighted variance of the values or Double.NaN if length = 0
313      * @throws MathIllegalArgumentException if the parameters are not valid
314      * @since 2.1
315      */
316     public double evaluate(final double[] values, final double[] weights,
317                            final int begin, final int length) throws MathIllegalArgumentException {
318 
319         double var = Double.NaN;
320 
321         if (test(values, weights,begin, length)) {
322             clear();
323             if (length == 1) {
324                 var = 0.0;
325             } else if (length > 1) {
326                 Mean mean = new Mean();
327                 double m = mean.evaluate(values, weights, begin, length);
328                 var = evaluate(values, weights, m, begin, length);
329             }
330         }
331         return var;
332     }
333 
334     /**
335      * <p>
336      * Returns the weighted variance of the entries in the the input array.</p>
337      * <p>
338      * Uses the formula <pre>
339      *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
340      * </pre>
341      * where weightedMean is the weighted mean</p>
342      * <p>
343      * This formula will not return the same result as the unweighted variance when all
344      * weights are equal, unless all weights are equal to 1. The formula assumes that
345      * weights are to be treated as "expansion values," as will be the case if for example
346      * the weights represent frequency counts. To normalize weights so that the denominator
347      * in the variance computation equals the length of the input vector minus one, use <pre>
348      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
349      * </pre>
350      * <p>
351      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
352      * <p>
353      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
354      * <ul><li>the values array is null</li>
355      *     <li>the weights array is null</li>
356      *     <li>the weights array does not have the same length as the values array</li>
357      *     <li>the weights array contains one or more infinite values</li>
358      *     <li>the weights array contains one or more NaN values</li>
359      *     <li>the weights array contains negative values</li>
360      * </ul></p>
361      * <p>
362      * Does not change the internal state of the statistic.</p>
363      * <p>
364      * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
365      *
366      * @param values the input array
367      * @param weights the weights array
368      * @return the weighted variance of the values
369      * @throws MathIllegalArgumentException if the parameters are not valid
370      * @since 2.1
371      */
372     public double evaluate(final double[] values, final double[] weights)
373     throws MathIllegalArgumentException {
374         return evaluate(values, weights, 0, values.length);
375     }
376 
377     /**
378      * Returns the variance of the entries in the specified portion of
379      * the input array, using the precomputed mean value.  Returns
380      * <code>Double.NaN</code> if the designated subarray is empty.
381      * <p>
382      * See {@link Variance} for details on the computing algorithm.</p>
383      * <p>
384      * The formula used assumes that the supplied mean value is the arithmetic
385      * mean of the sample data, not a known population parameter.  This method
386      * is supplied only to save computation when the mean has already been
387      * computed.</p>
388      * <p>
389      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
390      * <p>
391      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
392      * <p>
393      * Does not change the internal state of the statistic.</p>
394      *
395      * @param values the input array
396      * @param mean the precomputed mean value
397      * @param begin index of the first array element to include
398      * @param length the number of elements to include
399      * @return the variance of the values or Double.NaN if length = 0
400      * @throws MathIllegalArgumentException if the array is null or the array index
401      *  parameters are not valid
402      */
403     public double evaluate(final double[] values, final double mean,
404             final int begin, final int length) throws MathIllegalArgumentException {
405 
406         double var = Double.NaN;
407 
408         if (test(values, begin, length)) {
409             if (length == 1) {
410                 var = 0.0;
411             } else if (length > 1) {
412                 double accum = 0.0;
413                 double dev = 0.0;
414                 double accum2 = 0.0;
415                 for (int i = begin; i < begin + length; i++) {
416                     dev = values[i] - mean;
417                     accum += dev * dev;
418                     accum2 += dev;
419                 }
420                 double len = length;
421                 if (isBiasCorrected) {
422                     var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
423                 } else {
424                     var = (accum - (accum2 * accum2 / len)) / len;
425                 }
426             }
427         }
428         return var;
429     }
430 
431     /**
432      * Returns the variance of the entries in the input array, using the
433      * precomputed mean value.  Returns <code>Double.NaN</code> if the array
434      * is empty.
435      * <p>
436      * See {@link Variance} for details on the computing algorithm.</p>
437      * <p>
438      * If <code>isBiasCorrected</code> is <code>true</code> the formula used
439      * assumes that the supplied mean value is the arithmetic mean of the
440      * sample data, not a known population parameter.  If the mean is a known
441      * population parameter, or if the "population" version of the variance is
442      * desired, set <code>isBiasCorrected</code> to <code>false</code> before
443      * invoking this method.</p>
444      * <p>
445      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
446      * <p>
447      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
448      * <p>
449      * Does not change the internal state of the statistic.</p>
450      *
451      * @param values the input array
452      * @param mean the precomputed mean value
453      * @return the variance of the values or Double.NaN if the array is empty
454      * @throws MathIllegalArgumentException if the array is null
455      */
456     public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
457         return evaluate(values, mean, 0, values.length);
458     }
459 
460     /**
461      * Returns the weighted variance of the entries in the specified portion of
462      * the input array, using the precomputed weighted mean value.  Returns
463      * <code>Double.NaN</code> if the designated subarray is empty.
464      * <p>
465      * Uses the formula <pre>
466      *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
467      * </pre></p>
468      * <p>
469      * The formula used assumes that the supplied mean value is the weighted arithmetic
470      * mean of the sample data, not a known population parameter. This method
471      * is supplied only to save computation when the mean has already been
472      * computed.</p>
473      * <p>
474      * This formula will not return the same result as the unweighted variance when all
475      * weights are equal, unless all weights are equal to 1. The formula assumes that
476      * weights are to be treated as "expansion values," as will be the case if for example
477      * the weights represent frequency counts. To normalize weights so that the denominator
478      * in the variance computation equals the length of the input vector minus one, use <pre>
479      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
480      * </pre>
481      * <p>
482      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
483      * <p>
484      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
485      * <ul><li>the values array is null</li>
486      *     <li>the weights array is null</li>
487      *     <li>the weights array does not have the same length as the values array</li>
488      *     <li>the weights array contains one or more infinite values</li>
489      *     <li>the weights array contains one or more NaN values</li>
490      *     <li>the weights array contains negative values</li>
491      *     <li>the start and length arguments do not determine a valid array</li>
492      * </ul></p>
493      * <p>
494      * Does not change the internal state of the statistic.</p>
495      *
496      * @param values the input array
497      * @param weights the weights array
498      * @param mean the precomputed weighted mean value
499      * @param begin index of the first array element to include
500      * @param length the number of elements to include
501      * @return the variance of the values or Double.NaN if length = 0
502      * @throws MathIllegalArgumentException if the parameters are not valid
503      * @since 2.1
504      */
505     public double evaluate(final double[] values, final double[] weights,
506     final double mean, final int begin, final int length)
507     throws MathIllegalArgumentException {
508 
509         double var = Double.NaN;
510 
511         if (test(values, weights, begin, length)) {
512             if (length == 1) {
513                 var = 0.0;
514             } else if (length > 1) {
515                 double accum = 0.0;
516                 double dev = 0.0;
517                 double accum2 = 0.0;
518                 for (int i = begin; i < begin + length; i++) {
519                     dev = values[i] - mean;
520                     accum += weights[i] * (dev * dev);
521                     accum2 += weights[i] * dev;
522                 }
523 
524                 double sumWts = 0;
525                 for (int i = begin; i < begin + length; i++) {
526                     sumWts += weights[i];
527                 }
528 
529                 if (isBiasCorrected) {
530                     var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
531                 } else {
532                     var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
533                 }
534             }
535         }
536         return var;
537     }
538 
539     /**
540      * <p>Returns the weighted variance of the values in the input array, using
541      * the precomputed weighted mean value.</p>
542      * <p>
543      * Uses the formula <pre>
544      *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
545      * </pre></p>
546      * <p>
547      * The formula used assumes that the supplied mean value is the weighted arithmetic
548      * mean of the sample data, not a known population parameter. This method
549      * is supplied only to save computation when the mean has already been
550      * computed.</p>
551      * <p>
552      * This formula will not return the same result as the unweighted variance when all
553      * weights are equal, unless all weights are equal to 1. The formula assumes that
554      * weights are to be treated as "expansion values," as will be the case if for example
555      * the weights represent frequency counts. To normalize weights so that the denominator
556      * in the variance computation equals the length of the input vector minus one, use <pre>
557      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
558      * </pre>
559      * <p>
560      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
561      * <p>
562      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
563      * <ul><li>the values array is null</li>
564      *     <li>the weights array is null</li>
565      *     <li>the weights array does not have the same length as the values array</li>
566      *     <li>the weights array contains one or more infinite values</li>
567      *     <li>the weights array contains one or more NaN values</li>
568      *     <li>the weights array contains negative values</li>
569      * </ul></p>
570      * <p>
571      * Does not change the internal state of the statistic.</p>
572      *
573      * @param values the input array
574      * @param weights the weights array
575      * @param mean the precomputed weighted mean value
576      * @return the variance of the values or Double.NaN if length = 0
577      * @throws MathIllegalArgumentException if the parameters are not valid
578      * @since 2.1
579      */
580     public double evaluate(final double[] values, final double[] weights, final double mean)
581     throws MathIllegalArgumentException {
582         return evaluate(values, weights, mean, 0, values.length);
583     }
584 
585     /**
586      * @return Returns the isBiasCorrected.
587      */
588     public boolean isBiasCorrected() {
589         return isBiasCorrected;
590     }
591 
592     /**
593      * @param biasCorrected The isBiasCorrected to set.
594      */
595     public void setBiasCorrected(boolean biasCorrected) {
596         this.isBiasCorrected = biasCorrected;
597     }
598 
599     /**
600      * {@inheritDoc}
601      */
602     @Override
603     public Variance copy() {
604         Variance result = new Variance();
605         // No try-catch or advertised exception because parameters are guaranteed non-null
606         copy(this, result);
607         return result;
608     }
609 
610     /**
611      * Copies source to dest.
612      * <p>Neither source nor dest can be null.</p>
613      *
614      * @param source Variance to copy
615      * @param dest Variance to copy to
616      * @throws NullArgumentException if either source or dest is null
617      */
618     public static void copy(Variance source, Variance dest)
619         throws NullArgumentException {
620         MathUtils.checkNotNull(source);
621         MathUtils.checkNotNull(dest);
622         dest.setData(source.getDataRef());
623         dest.moment = source.moment.copy();
624         dest.isBiasCorrected = source.isBiasCorrected;
625         dest.incMoment = source.incMoment;
626     }
627 }