001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math3.stat.descriptive.moment;
018
019import java.io.Serializable;
020
021import org.apache.commons.math3.exception.MathIllegalArgumentException;
022import org.apache.commons.math3.exception.NullArgumentException;
023import org.apache.commons.math3.exception.util.LocalizedFormats;
024import org.apache.commons.math3.stat.descriptive.WeightedEvaluation;
025import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic;
026import org.apache.commons.math3.util.MathUtils;
027
028/**
029 * Computes the variance of the available values.  By default, the unbiased
030 * "sample variance" definitional formula is used:
031 * <p>
032 * variance = sum((x_i - mean)^2) / (n - 1) </p>
033 * <p>
034 * where mean is the {@link Mean} and <code>n</code> is the number
035 * of sample observations.</p>
036 * <p>
037 * The definitional formula does not have good numerical properties, so
038 * this implementation does not compute the statistic using the definitional
039 * formula. <ul>
040 * <li> The <code>getResult</code> method computes the variance using
041 * updating formulas based on West's algorithm, as described in
042 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
043 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
044 * vol. 22 no. 9, pp. 526-531.</a></li>
045 * <li> The <code>evaluate</code> methods leverage the fact that they have the
046 * full array of values in memory to execute a two-pass algorithm.
047 * Specifically, these methods use the "corrected two-pass algorithm" from
048 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
049 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
050 * Note that adding values using <code>increment</code> or
051 * <code>incrementAll</code> and then executing <code>getResult</code> will
052 * sometimes give a different, less accurate, result than executing
053 * <code>evaluate</code> with the full array of values. The former approach
054 * should only be used when the full array of values is not available.</p>
055 * <p>
056 * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
057 * be computed using this statistic.  The <code>isBiasCorrected</code>
058 * property determines whether the "population" or "sample" value is
059 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
060 * To compute population variances, set this property to <code>false.</code>
061 * </p>
062 * <p>
063 * <strong>Note that this implementation is not synchronized.</strong> If
064 * multiple threads access an instance of this class concurrently, and at least
065 * one of the threads invokes the <code>increment()</code> or
066 * <code>clear()</code> method, it must be synchronized externally.</p>
067 *
068 */
069public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
070
071    /** Serializable version identifier */
072    private static final long serialVersionUID = -9111962718267217978L;
073
074    /** SecondMoment is used in incremental calculation of Variance*/
075    protected SecondMoment moment = null;
076
077    /**
078     * Whether or not {@link #increment(double)} should increment
079     * the internal second moment. When a Variance is constructed with an
080     * external SecondMoment as a constructor parameter, this property is
081     * set to false and increments must be applied to the second moment
082     * directly.
083     */
084    protected boolean incMoment = true;
085
086    /**
087     * Whether or not bias correction is applied when computing the
088     * value of the statistic. True means that bias is corrected.  See
089     * {@link Variance} for details on the formula.
090     */
091    private boolean isBiasCorrected = true;
092
093    /**
094     * Constructs a Variance with default (true) <code>isBiasCorrected</code>
095     * property.
096     */
097    public Variance() {
098        moment = new SecondMoment();
099    }
100
101    /**
102     * Constructs a Variance based on an external second moment.
103     * When this constructor is used, the statistic may only be
104     * incremented via the moment, i.e., {@link #increment(double)}
105     * does nothing; whereas {@code m2.increment(value)} increments
106     * both {@code m2} and the Variance instance constructed from it.
107     *
108     * @param m2 the SecondMoment (Third or Fourth moments work
109     * here as well.)
110     */
111    public Variance(final SecondMoment m2) {
112        incMoment = false;
113        this.moment = m2;
114    }
115
116    /**
117     * Constructs a Variance with the specified <code>isBiasCorrected</code>
118     * property
119     *
120     * @param isBiasCorrected  setting for bias correction - true means
121     * bias will be corrected and is equivalent to using the argumentless
122     * constructor
123     */
124    public Variance(boolean isBiasCorrected) {
125        moment = new SecondMoment();
126        this.isBiasCorrected = isBiasCorrected;
127    }
128
129    /**
130     * Constructs a Variance with the specified <code>isBiasCorrected</code>
131     * property and the supplied external second moment.
132     *
133     * @param isBiasCorrected  setting for bias correction - true means
134     * bias will be corrected
135     * @param m2 the SecondMoment (Third or Fourth moments work
136     * here as well.)
137     */
138    public Variance(boolean isBiasCorrected, SecondMoment m2) {
139        incMoment = false;
140        this.moment = m2;
141        this.isBiasCorrected = isBiasCorrected;
142    }
143
144    /**
145     * Copy constructor, creates a new {@code Variance} identical
146     * to the {@code original}
147     *
148     * @param original the {@code Variance} instance to copy
149     * @throws NullArgumentException if original is null
150     */
151    public Variance(Variance original) throws NullArgumentException {
152        copy(original, this);
153    }
154
155    /**
156     * {@inheritDoc}
157     * <p>If all values are available, it is more accurate to use
158     * {@link #evaluate(double[])} rather than adding values one at a time
159     * using this method and then executing {@link #getResult}, since
160     * <code>evaluate</code> leverages the fact that is has the full
161     * list of values together to execute a two-pass algorithm.
162     * See {@link Variance}.</p>
163     *
164     * <p>Note also that when {@link #Variance(SecondMoment)} is used to
165     * create a Variance, this method does nothing. In that case, the
166     * SecondMoment should be incremented directly.</p>
167     */
168    @Override
169    public void increment(final double d) {
170        if (incMoment) {
171            moment.increment(d);
172        }
173    }
174
175    /**
176     * {@inheritDoc}
177     */
178    @Override
179    public double getResult() {
180            if (moment.n == 0) {
181                return Double.NaN;
182            } else if (moment.n == 1) {
183                return 0d;
184            } else {
185                if (isBiasCorrected) {
186                    return moment.m2 / (moment.n - 1d);
187                } else {
188                    return moment.m2 / (moment.n);
189                }
190            }
191    }
192
193    /**
194     * {@inheritDoc}
195     */
196    public long getN() {
197        return moment.getN();
198    }
199
200    /**
201     * {@inheritDoc}
202     */
203    @Override
204    public void clear() {
205        if (incMoment) {
206            moment.clear();
207        }
208    }
209
210    /**
211     * Returns the variance of the entries in the input array, or
212     * <code>Double.NaN</code> if the array is empty.
213     * <p>
214     * See {@link Variance} for details on the computing algorithm.</p>
215     * <p>
216     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
217     * <p>
218     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
219     * <p>
220     * Does not change the internal state of the statistic.</p>
221     *
222     * @param values the input array
223     * @return the variance of the values or Double.NaN if length = 0
224     * @throws MathIllegalArgumentException if the array is null
225     */
226    @Override
227    public double evaluate(final double[] values) throws MathIllegalArgumentException {
228        if (values == null) {
229            throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
230        }
231        return evaluate(values, 0, values.length);
232    }
233
234    /**
235     * Returns the variance of the entries in the specified portion of
236     * the input array, or <code>Double.NaN</code> if the designated subarray
237     * is empty.  Note that Double.NaN may also be returned if the input
238     * includes NaN and / or infinite values.
239     * <p>
240     * See {@link Variance} for details on the computing algorithm.</p>
241     * <p>
242     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
243     * <p>
244     * Does not change the internal state of the statistic.</p>
245     * <p>
246     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
247     *
248     * @param values the input array
249     * @param begin index of the first array element to include
250     * @param length the number of elements to include
251     * @return the variance of the values or Double.NaN if length = 0
252     * @throws MathIllegalArgumentException if the array is null or the array index
253     *  parameters are not valid
254     */
255    @Override
256    public double evaluate(final double[] values, final int begin, final int length)
257    throws MathIllegalArgumentException {
258
259        double var = Double.NaN;
260
261        if (test(values, begin, length)) {
262            clear();
263            if (length == 1) {
264                var = 0.0;
265            } else if (length > 1) {
266                Mean mean = new Mean();
267                double m = mean.evaluate(values, begin, length);
268                var = evaluate(values, m, begin, length);
269            }
270        }
271        return var;
272    }
273
274    /**
275     * <p>Returns the weighted variance of the entries in the specified portion of
276     * the input array, or <code>Double.NaN</code> if the designated subarray
277     * is empty.</p>
278     * <p>
279     * Uses the formula <pre>
280     *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
281     * </pre>
282     * where weightedMean is the weighted mean</p>
283     * <p>
284     * This formula will not return the same result as the unweighted variance when all
285     * weights are equal, unless all weights are equal to 1. The formula assumes that
286     * weights are to be treated as "expansion values," as will be the case if for example
287     * the weights represent frequency counts. To normalize weights so that the denominator
288     * in the variance computation equals the length of the input vector minus one, use <pre>
289     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
290     * </pre>
291     * <p>
292     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
293     * <p>
294     * Throws <code>IllegalArgumentException</code> if any of the following are true:
295     * <ul><li>the values array is null</li>
296     *     <li>the weights array is null</li>
297     *     <li>the weights array does not have the same length as the values array</li>
298     *     <li>the weights array contains one or more infinite values</li>
299     *     <li>the weights array contains one or more NaN values</li>
300     *     <li>the weights array contains negative values</li>
301     *     <li>the start and length arguments do not determine a valid array</li>
302     * </ul></p>
303     * <p>
304     * Does not change the internal state of the statistic.</p>
305     * <p>
306     * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
307     *
308     * @param values the input array
309     * @param weights the weights array
310     * @param begin index of the first array element to include
311     * @param length the number of elements to include
312     * @return the weighted variance of the values or Double.NaN if length = 0
313     * @throws MathIllegalArgumentException if the parameters are not valid
314     * @since 2.1
315     */
316    public double evaluate(final double[] values, final double[] weights,
317                           final int begin, final int length) throws MathIllegalArgumentException {
318
319        double var = Double.NaN;
320
321        if (test(values, weights,begin, length)) {
322            clear();
323            if (length == 1) {
324                var = 0.0;
325            } else if (length > 1) {
326                Mean mean = new Mean();
327                double m = mean.evaluate(values, weights, begin, length);
328                var = evaluate(values, weights, m, begin, length);
329            }
330        }
331        return var;
332    }
333
334    /**
335     * <p>
336     * Returns the weighted variance of the entries in the the input array.</p>
337     * <p>
338     * Uses the formula <pre>
339     *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
340     * </pre>
341     * where weightedMean is the weighted mean</p>
342     * <p>
343     * This formula will not return the same result as the unweighted variance when all
344     * weights are equal, unless all weights are equal to 1. The formula assumes that
345     * weights are to be treated as "expansion values," as will be the case if for example
346     * the weights represent frequency counts. To normalize weights so that the denominator
347     * in the variance computation equals the length of the input vector minus one, use <pre>
348     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
349     * </pre>
350     * <p>
351     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
352     * <p>
353     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
354     * <ul><li>the values array is null</li>
355     *     <li>the weights array is null</li>
356     *     <li>the weights array does not have the same length as the values array</li>
357     *     <li>the weights array contains one or more infinite values</li>
358     *     <li>the weights array contains one or more NaN values</li>
359     *     <li>the weights array contains negative values</li>
360     * </ul></p>
361     * <p>
362     * Does not change the internal state of the statistic.</p>
363     * <p>
364     * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
365     *
366     * @param values the input array
367     * @param weights the weights array
368     * @return the weighted variance of the values
369     * @throws MathIllegalArgumentException if the parameters are not valid
370     * @since 2.1
371     */
372    public double evaluate(final double[] values, final double[] weights)
373    throws MathIllegalArgumentException {
374        return evaluate(values, weights, 0, values.length);
375    }
376
377    /**
378     * Returns the variance of the entries in the specified portion of
379     * the input array, using the precomputed mean value.  Returns
380     * <code>Double.NaN</code> if the designated subarray is empty.
381     * <p>
382     * See {@link Variance} for details on the computing algorithm.</p>
383     * <p>
384     * The formula used assumes that the supplied mean value is the arithmetic
385     * mean of the sample data, not a known population parameter.  This method
386     * is supplied only to save computation when the mean has already been
387     * computed.</p>
388     * <p>
389     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
390     * <p>
391     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
392     * <p>
393     * Does not change the internal state of the statistic.</p>
394     *
395     * @param values the input array
396     * @param mean the precomputed mean value
397     * @param begin index of the first array element to include
398     * @param length the number of elements to include
399     * @return the variance of the values or Double.NaN if length = 0
400     * @throws MathIllegalArgumentException if the array is null or the array index
401     *  parameters are not valid
402     */
403    public double evaluate(final double[] values, final double mean,
404            final int begin, final int length) throws MathIllegalArgumentException {
405
406        double var = Double.NaN;
407
408        if (test(values, begin, length)) {
409            if (length == 1) {
410                var = 0.0;
411            } else if (length > 1) {
412                double accum = 0.0;
413                double dev = 0.0;
414                double accum2 = 0.0;
415                for (int i = begin; i < begin + length; i++) {
416                    dev = values[i] - mean;
417                    accum += dev * dev;
418                    accum2 += dev;
419                }
420                double len = length;
421                if (isBiasCorrected) {
422                    var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
423                } else {
424                    var = (accum - (accum2 * accum2 / len)) / len;
425                }
426            }
427        }
428        return var;
429    }
430
431    /**
432     * Returns the variance of the entries in the input array, using the
433     * precomputed mean value.  Returns <code>Double.NaN</code> if the array
434     * is empty.
435     * <p>
436     * See {@link Variance} for details on the computing algorithm.</p>
437     * <p>
438     * If <code>isBiasCorrected</code> is <code>true</code> the formula used
439     * assumes that the supplied mean value is the arithmetic mean of the
440     * sample data, not a known population parameter.  If the mean is a known
441     * population parameter, or if the "population" version of the variance is
442     * desired, set <code>isBiasCorrected</code> to <code>false</code> before
443     * invoking this method.</p>
444     * <p>
445     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
446     * <p>
447     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
448     * <p>
449     * Does not change the internal state of the statistic.</p>
450     *
451     * @param values the input array
452     * @param mean the precomputed mean value
453     * @return the variance of the values or Double.NaN if the array is empty
454     * @throws MathIllegalArgumentException if the array is null
455     */
456    public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
457        return evaluate(values, mean, 0, values.length);
458    }
459
460    /**
461     * Returns the weighted variance of the entries in the specified portion of
462     * the input array, using the precomputed weighted mean value.  Returns
463     * <code>Double.NaN</code> if the designated subarray is empty.
464     * <p>
465     * Uses the formula <pre>
466     *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
467     * </pre></p>
468     * <p>
469     * The formula used assumes that the supplied mean value is the weighted arithmetic
470     * mean of the sample data, not a known population parameter. This method
471     * is supplied only to save computation when the mean has already been
472     * computed.</p>
473     * <p>
474     * This formula will not return the same result as the unweighted variance when all
475     * weights are equal, unless all weights are equal to 1. The formula assumes that
476     * weights are to be treated as "expansion values," as will be the case if for example
477     * the weights represent frequency counts. To normalize weights so that the denominator
478     * in the variance computation equals the length of the input vector minus one, use <pre>
479     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
480     * </pre>
481     * <p>
482     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
483     * <p>
484     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
485     * <ul><li>the values array is null</li>
486     *     <li>the weights array is null</li>
487     *     <li>the weights array does not have the same length as the values array</li>
488     *     <li>the weights array contains one or more infinite values</li>
489     *     <li>the weights array contains one or more NaN values</li>
490     *     <li>the weights array contains negative values</li>
491     *     <li>the start and length arguments do not determine a valid array</li>
492     * </ul></p>
493     * <p>
494     * Does not change the internal state of the statistic.</p>
495     *
496     * @param values the input array
497     * @param weights the weights array
498     * @param mean the precomputed weighted mean value
499     * @param begin index of the first array element to include
500     * @param length the number of elements to include
501     * @return the variance of the values or Double.NaN if length = 0
502     * @throws MathIllegalArgumentException if the parameters are not valid
503     * @since 2.1
504     */
505    public double evaluate(final double[] values, final double[] weights,
506    final double mean, final int begin, final int length)
507    throws MathIllegalArgumentException {
508
509        double var = Double.NaN;
510
511        if (test(values, weights, begin, length)) {
512            if (length == 1) {
513                var = 0.0;
514            } else if (length > 1) {
515                double accum = 0.0;
516                double dev = 0.0;
517                double accum2 = 0.0;
518                for (int i = begin; i < begin + length; i++) {
519                    dev = values[i] - mean;
520                    accum += weights[i] * (dev * dev);
521                    accum2 += weights[i] * dev;
522                }
523
524                double sumWts = 0;
525                for (int i = begin; i < begin + length; i++) {
526                    sumWts += weights[i];
527                }
528
529                if (isBiasCorrected) {
530                    var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
531                } else {
532                    var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
533                }
534            }
535        }
536        return var;
537    }
538
539    /**
540     * <p>Returns the weighted variance of the values in the input array, using
541     * the precomputed weighted mean value.</p>
542     * <p>
543     * Uses the formula <pre>
544     *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
545     * </pre></p>
546     * <p>
547     * The formula used assumes that the supplied mean value is the weighted arithmetic
548     * mean of the sample data, not a known population parameter. This method
549     * is supplied only to save computation when the mean has already been
550     * computed.</p>
551     * <p>
552     * This formula will not return the same result as the unweighted variance when all
553     * weights are equal, unless all weights are equal to 1. The formula assumes that
554     * weights are to be treated as "expansion values," as will be the case if for example
555     * the weights represent frequency counts. To normalize weights so that the denominator
556     * in the variance computation equals the length of the input vector minus one, use <pre>
557     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
558     * </pre>
559     * <p>
560     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
561     * <p>
562     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
563     * <ul><li>the values array is null</li>
564     *     <li>the weights array is null</li>
565     *     <li>the weights array does not have the same length as the values array</li>
566     *     <li>the weights array contains one or more infinite values</li>
567     *     <li>the weights array contains one or more NaN values</li>
568     *     <li>the weights array contains negative values</li>
569     * </ul></p>
570     * <p>
571     * Does not change the internal state of the statistic.</p>
572     *
573     * @param values the input array
574     * @param weights the weights array
575     * @param mean the precomputed weighted mean value
576     * @return the variance of the values or Double.NaN if length = 0
577     * @throws MathIllegalArgumentException if the parameters are not valid
578     * @since 2.1
579     */
580    public double evaluate(final double[] values, final double[] weights, final double mean)
581    throws MathIllegalArgumentException {
582        return evaluate(values, weights, mean, 0, values.length);
583    }
584
585    /**
586     * @return Returns the isBiasCorrected.
587     */
588    public boolean isBiasCorrected() {
589        return isBiasCorrected;
590    }
591
592    /**
593     * @param biasCorrected The isBiasCorrected to set.
594     */
595    public void setBiasCorrected(boolean biasCorrected) {
596        this.isBiasCorrected = biasCorrected;
597    }
598
599    /**
600     * {@inheritDoc}
601     */
602    @Override
603    public Variance copy() {
604        Variance result = new Variance();
605        // No try-catch or advertised exception because parameters are guaranteed non-null
606        copy(this, result);
607        return result;
608    }
609
610    /**
611     * Copies source to dest.
612     * <p>Neither source nor dest can be null.</p>
613     *
614     * @param source Variance to copy
615     * @param dest Variance to copy to
616     * @throws NullArgumentException if either source or dest is null
617     */
618    public static void copy(Variance source, Variance dest)
619        throws NullArgumentException {
620        MathUtils.checkNotNull(source);
621        MathUtils.checkNotNull(dest);
622        dest.setData(source.getDataRef());
623        dest.moment = source.moment.copy();
624        dest.isBiasCorrected = source.isBiasCorrected;
625        dest.incMoment = source.incMoment;
626    }
627}