001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math4.stat.descriptive.moment;
018
019import java.io.Serializable;
020
021import org.apache.commons.math4.exception.MathIllegalArgumentException;
022import org.apache.commons.math4.exception.NullArgumentException;
023import org.apache.commons.math4.exception.util.LocalizedFormats;
024import org.apache.commons.math4.stat.descriptive.AbstractStorelessUnivariateStatistic;
025import org.apache.commons.math4.stat.descriptive.WeightedEvaluation;
026import org.apache.commons.math4.util.MathArrays;
027import org.apache.commons.math4.util.MathUtils;
028
029/**
030 * Computes the variance of the available values.  By default, the unbiased
031 * "sample variance" definitional formula is used:
032 * <p>
033 * variance = sum((x_i - mean)^2) / (n - 1) </p>
034 * <p>
035 * where mean is the {@link Mean} and <code>n</code> is the number
036 * of sample observations.</p>
037 * <p>
038 * The definitional formula does not have good numerical properties, so
039 * this implementation does not compute the statistic using the definitional
040 * formula. <ul>
041 * <li> The <code>getResult</code> method computes the variance using
042 * updating formulas based on West's algorithm, as described in
043 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
044 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
045 * vol. 22 no. 9, pp. 526-531.</a></li>
046 * <li> The <code>evaluate</code> methods leverage the fact that they have the
047 * full array of values in memory to execute a two-pass algorithm.
048 * Specifically, these methods use the "corrected two-pass algorithm" from
049 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
050 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
051 * Note that adding values using <code>increment</code> or
052 * <code>incrementAll</code> and then executing <code>getResult</code> will
053 * sometimes give a different, less accurate, result than executing
054 * <code>evaluate</code> with the full array of values. The former approach
055 * should only be used when the full array of values is not available.
056 * <p>
057 * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
058 * be computed using this statistic.  The <code>isBiasCorrected</code>
059 * property determines whether the "population" or "sample" value is
060 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
061 * To compute population variances, set this property to <code>false.</code>
062 * </p>
063 * <p>
064 * <strong>Note that this implementation is not synchronized.</strong> If
065 * multiple threads access an instance of this class concurrently, and at least
066 * one of the threads invokes the <code>increment()</code> or
067 * <code>clear()</code> method, it must be synchronized externally.</p>
068 */
069public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
070
071    /** Serializable version identifier */
072    private static final long serialVersionUID = 20150412L;
073
074    /** SecondMoment is used in incremental calculation of Variance*/
075    protected SecondMoment moment;
076
077    /**
078     * Whether or not {@link #increment(double)} should increment
079     * the internal second moment. When a Variance is constructed with an
080     * external SecondMoment as a constructor parameter, this property is
081     * set to false and increments must be applied to the second moment
082     * directly.
083     */
084    protected boolean incMoment = true;
085
086    /**
087     * Whether or not bias correction is applied when computing the
088     * value of the statistic. True means that bias is corrected.  See
089     * {@link Variance} for details on the formula.
090     */
091    private boolean isBiasCorrected = true;
092
093    /**
094     * Constructs a Variance with default (true) <code>isBiasCorrected</code>
095     * property.
096     */
097    public Variance() {
098        moment = new SecondMoment();
099    }
100
101    /**
102     * Constructs a Variance based on an external second moment.
103     * <p>
104     * When this constructor is used, the statistic may only be
105     * incremented via the moment, i.e., {@link #increment(double)}
106     * does nothing; whereas {@code m2.increment(value)} increments
107     * both {@code m2} and the Variance instance constructed from it.
108     *
109     * @param m2 the SecondMoment (Third or Fourth moments work here as well.)
110     */
111    public Variance(final SecondMoment m2) {
112        incMoment = false;
113        this.moment = m2;
114    }
115
116    /**
117     * Constructs a Variance with the specified <code>isBiasCorrected</code>
118     * property.
119     *
120     * @param isBiasCorrected  setting for bias correction - true means
121     * bias will be corrected and is equivalent to using the "no arg"
122     * constructor
123     */
124    public Variance(boolean isBiasCorrected) {
125        moment = new SecondMoment();
126        this.isBiasCorrected = isBiasCorrected;
127    }
128
129    /**
130     * Constructs a Variance with the specified <code>isBiasCorrected</code>
131     * property and the supplied external second moment.
132     *
133     * @param isBiasCorrected  setting for bias correction - true means
134     * bias will be corrected
135     * @param m2 the SecondMoment (Third or Fourth moments work
136     * here as well.)
137     */
138    public Variance(boolean isBiasCorrected, SecondMoment m2) {
139        incMoment = false;
140        this.moment = m2;
141        this.isBiasCorrected = isBiasCorrected;
142    }
143
144    /**
145     * Copy constructor, creates a new {@code Variance} identical
146     * to the {@code original}.
147     *
148     * @param original the {@code Variance} instance to copy
149     * @throws NullArgumentException if original is null
150     */
151    public Variance(Variance original) throws NullArgumentException {
152        copy(original, this);
153    }
154
155    /**
156     * {@inheritDoc}
157     * <p>If all values are available, it is more accurate to use
158     * {@link #evaluate(double[])} rather than adding values one at a time
159     * using this method and then executing {@link #getResult}, since
160     * <code>evaluate</code> leverages the fact that is has the full
161     * list of values together to execute a two-pass algorithm.
162     * See {@link Variance}.</p>
163     *
164     * <p>Note also that when {@link #Variance(SecondMoment)} is used to
165     * create a Variance, this method does nothing. In that case, the
166     * SecondMoment should be incremented directly.</p>
167     */
168    @Override
169    public void increment(final double d) {
170        if (incMoment) {
171            moment.increment(d);
172        }
173    }
174
175    /**
176     * {@inheritDoc}
177     */
178    @Override
179    public double getResult() {
180        if (moment.n == 0) {
181            return Double.NaN;
182        } else if (moment.n == 1) {
183            return 0d;
184        } else {
185            if (isBiasCorrected) {
186                return moment.m2 / (moment.n - 1d);
187            } else {
188                return moment.m2 / (moment.n);
189            }
190        }
191    }
192
193    /**
194     * {@inheritDoc}
195     */
196    @Override
197    public long getN() {
198        return moment.getN();
199    }
200
201    /**
202     * {@inheritDoc}
203     */
204    @Override
205    public void clear() {
206        if (incMoment) {
207            moment.clear();
208        }
209    }
210
211    /**
212     * Returns the variance of the entries in the input array, or
213     * <code>Double.NaN</code> if the array is empty.
214     * <p>
215     * See {@link Variance} for details on the computing algorithm.</p>
216     * <p>
217     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
218     * <p>
219     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
220     * <p>
221     * Does not change the internal state of the statistic.</p>
222     *
223     * @param values the input array
224     * @return the variance of the values or Double.NaN if length = 0
225     * @throws MathIllegalArgumentException if the array is null
226     */
227    @Override
228    public double evaluate(final double[] values) throws MathIllegalArgumentException {
229        if (values == null) {
230            throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
231        }
232        return evaluate(values, 0, values.length);
233    }
234
235    /**
236     * Returns the variance of the entries in the specified portion of
237     * the input array, or <code>Double.NaN</code> if the designated subarray
238     * is empty.  Note that Double.NaN may also be returned if the input
239     * includes NaN and / or infinite values.
240     * <p>
241     * See {@link Variance} for details on the computing algorithm.</p>
242     * <p>
243     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
244     * <p>
245     * Does not change the internal state of the statistic.</p>
246     * <p>
247     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
248     *
249     * @param values the input array
250     * @param begin index of the first array element to include
251     * @param length the number of elements to include
252     * @return the variance of the values or Double.NaN if length = 0
253     * @throws MathIllegalArgumentException if the array is null or the array index
254     *  parameters are not valid
255     */
256    @Override
257    public double evaluate(final double[] values, final int begin, final int length)
258        throws MathIllegalArgumentException {
259
260        double var = Double.NaN;
261
262        if (MathArrays.verifyValues(values, begin, length)) {
263            if (length == 1) {
264                var = 0.0;
265            } else if (length > 1) {
266                Mean mean = new Mean();
267                double m = mean.evaluate(values, begin, length);
268                var = evaluate(values, m, begin, length);
269            }
270        }
271        return var;
272    }
273
274    /**
275     * <p>Returns the weighted variance of the entries in the specified portion of
276     * the input array, or <code>Double.NaN</code> if the designated subarray
277     * is empty.</p>
278     * <p>
279     * Uses the formula <div style="white-space: pre"><code>
280     *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
281     * </code></div>
282     * where weightedMean is the weighted mean
283     * <p>
284     * This formula will not return the same result as the unweighted variance when all
285     * weights are equal, unless all weights are equal to 1. The formula assumes that
286     * weights are to be treated as "expansion values," as will be the case if for example
287     * the weights represent frequency counts. To normalize weights so that the denominator
288     * in the variance computation equals the length of the input vector minus one, use <pre>
289     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
290     * </pre>
291     * <p>
292     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
293     * <p>
294     * Throws <code>IllegalArgumentException</code> if any of the following are true:
295     * <ul><li>the values array is null</li>
296     *     <li>the weights array is null</li>
297     *     <li>the weights array does not have the same length as the values array</li>
298     *     <li>the weights array contains one or more infinite values</li>
299     *     <li>the weights array contains one or more NaN values</li>
300     *     <li>the weights array contains negative values</li>
301     *     <li>the start and length arguments do not determine a valid array</li>
302     * </ul>
303     * <p>
304     * Does not change the internal state of the statistic.</p>
305     * <p>
306     * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
307     *
308     * @param values the input array
309     * @param weights the weights array
310     * @param begin index of the first array element to include
311     * @param length the number of elements to include
312     * @return the weighted variance of the values or Double.NaN if length = 0
313     * @throws MathIllegalArgumentException if the parameters are not valid
314     * @since 2.1
315     */
316    @Override
317    public double evaluate(final double[] values, final double[] weights,
318                           final int begin, final int length) throws MathIllegalArgumentException {
319
320        double var = Double.NaN;
321
322        if (MathArrays.verifyValues(values, weights,begin, length)) {
323            if (length == 1) {
324                var = 0.0;
325            } else if (length > 1) {
326                Mean mean = new Mean();
327                double m = mean.evaluate(values, weights, begin, length);
328                var = evaluate(values, weights, m, begin, length);
329            }
330        }
331        return var;
332    }
333
334    /**
335     * <p>
336     * Returns the weighted variance of the entries in the input array.</p>
337     * <p>
338     * Uses the formula <div style="white-space:pre"><code>
339     *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
340     * </code></div>
341     * where weightedMean is the weighted mean
342     * <p>
343     * This formula will not return the same result as the unweighted variance when all
344     * weights are equal, unless all weights are equal to 1. The formula assumes that
345     * weights are to be treated as "expansion values," as will be the case if for example
346     * the weights represent frequency counts. To normalize weights so that the denominator
347     * in the variance computation equals the length of the input vector minus one, use <pre>
348     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
349     * </pre>
350     * <p>
351     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
352     * <p>
353     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
354     * <ul><li>the values array is null</li>
355     *     <li>the weights array is null</li>
356     *     <li>the weights array does not have the same length as the values array</li>
357     *     <li>the weights array contains one or more infinite values</li>
358     *     <li>the weights array contains one or more NaN values</li>
359     *     <li>the weights array contains negative values</li>
360     * </ul>
361     * <p>
362     * Does not change the internal state of the statistic.</p>
363     * <p>
364     * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
365     *
366     * @param values the input array
367     * @param weights the weights array
368     * @return the weighted variance of the values
369     * @throws MathIllegalArgumentException if the parameters are not valid
370     * @since 2.1
371     */
372    @Override
373    public double evaluate(final double[] values, final double[] weights)
374        throws MathIllegalArgumentException {
375        return evaluate(values, weights, 0, values.length);
376    }
377
378    /**
379     * Returns the variance of the entries in the specified portion of
380     * the input array, using the precomputed mean value.  Returns
381     * <code>Double.NaN</code> if the designated subarray is empty.
382     * <p>
383     * See {@link Variance} for details on the computing algorithm.</p>
384     * <p>
385     * The formula used assumes that the supplied mean value is the arithmetic
386     * mean of the sample data, not a known population parameter.  This method
387     * is supplied only to save computation when the mean has already been
388     * computed.</p>
389     * <p>
390     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
391     * <p>
392     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
393     * <p>
394     * Does not change the internal state of the statistic.</p>
395     *
396     * @param values the input array
397     * @param mean the precomputed mean value
398     * @param begin index of the first array element to include
399     * @param length the number of elements to include
400     * @return the variance of the values or Double.NaN if length = 0
401     * @throws MathIllegalArgumentException if the array is null or the array index
402     *  parameters are not valid
403     */
404    public double evaluate(final double[] values, final double mean,
405                           final int begin, final int length) throws MathIllegalArgumentException {
406
407        double var = Double.NaN;
408
409        if (MathArrays.verifyValues(values, begin, length)) {
410            if (length == 1) {
411                var = 0.0;
412            } else if (length > 1) {
413                double accum = 0.0;
414                double dev = 0.0;
415                double accum2 = 0.0;
416                for (int i = begin; i < begin + length; i++) {
417                    dev = values[i] - mean;
418                    accum += dev * dev;
419                    accum2 += dev;
420                }
421                double len = length;
422                if (isBiasCorrected) {
423                    var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
424                } else {
425                    var = (accum - (accum2 * accum2 / len)) / len;
426                }
427            }
428        }
429        return var;
430    }
431
432    /**
433     * Returns the variance of the entries in the input array, using the
434     * precomputed mean value.  Returns <code>Double.NaN</code> if the array
435     * is empty.
436     * <p>
437     * See {@link Variance} for details on the computing algorithm.</p>
438     * <p>
439     * If <code>isBiasCorrected</code> is <code>true</code> the formula used
440     * assumes that the supplied mean value is the arithmetic mean of the
441     * sample data, not a known population parameter.  If the mean is a known
442     * population parameter, or if the "population" version of the variance is
443     * desired, set <code>isBiasCorrected</code> to <code>false</code> before
444     * invoking this method.</p>
445     * <p>
446     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
447     * <p>
448     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
449     * <p>
450     * Does not change the internal state of the statistic.</p>
451     *
452     * @param values the input array
453     * @param mean the precomputed mean value
454     * @return the variance of the values or Double.NaN if the array is empty
455     * @throws MathIllegalArgumentException if the array is null
456     */
457    public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
458        return evaluate(values, mean, 0, values.length);
459    }
460
461    /**
462     * Returns the weighted variance of the entries in the specified portion of
463     * the input array, using the precomputed weighted mean value.  Returns
464     * <code>Double.NaN</code> if the designated subarray is empty.
465     * <p>
466     * Uses the formula <div style="white-space:pre"><code>
467     *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
468     * </code></div>
469     * <p>
470     * The formula used assumes that the supplied mean value is the weighted arithmetic
471     * mean of the sample data, not a known population parameter. This method
472     * is supplied only to save computation when the mean has already been
473     * computed.</p>
474     * <p>
475     * This formula will not return the same result as the unweighted variance when all
476     * weights are equal, unless all weights are equal to 1. The formula assumes that
477     * weights are to be treated as "expansion values," as will be the case if for example
478     * the weights represent frequency counts. To normalize weights so that the denominator
479     * in the variance computation equals the length of the input vector minus one, use <pre>
480     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
481     * </pre>
482     * <p>
483     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
484     * <p>
485     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
486     * <ul><li>the values array is null</li>
487     *     <li>the weights array is null</li>
488     *     <li>the weights array does not have the same length as the values array</li>
489     *     <li>the weights array contains one or more infinite values</li>
490     *     <li>the weights array contains one or more NaN values</li>
491     *     <li>the weights array contains negative values</li>
492     *     <li>the start and length arguments do not determine a valid array</li>
493     * </ul>
494     * <p>
495     * Does not change the internal state of the statistic.</p>
496     *
497     * @param values the input array
498     * @param weights the weights array
499     * @param mean the precomputed weighted mean value
500     * @param begin index of the first array element to include
501     * @param length the number of elements to include
502     * @return the variance of the values or Double.NaN if length = 0
503     * @throws MathIllegalArgumentException if the parameters are not valid
504     * @since 2.1
505     */
506    public double evaluate(final double[] values, final double[] weights,
507                           final double mean, final int begin, final int length)
508        throws MathIllegalArgumentException {
509
510        double var = Double.NaN;
511
512        if (MathArrays.verifyValues(values, weights, begin, length)) {
513            if (length == 1) {
514                var = 0.0;
515            } else if (length > 1) {
516                double accum = 0.0;
517                double dev = 0.0;
518                double accum2 = 0.0;
519                for (int i = begin; i < begin + length; i++) {
520                    dev = values[i] - mean;
521                    accum += weights[i] * (dev * dev);
522                    accum2 += weights[i] * dev;
523                }
524
525                double sumWts = 0;
526                for (int i = begin; i < begin + length; i++) {
527                    sumWts += weights[i];
528                }
529
530                if (isBiasCorrected) {
531                    var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
532                } else {
533                    var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
534                }
535            }
536        }
537        return var;
538    }
539
540    /**
541     * <p>Returns the weighted variance of the values in the input array, using
542     * the precomputed weighted mean value.</p>
543     * <p>
544     * Uses the formula <div style="white-space:pre"><code>
545     *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
546     * </code></div>
547     * <p>
548     * The formula used assumes that the supplied mean value is the weighted arithmetic
549     * mean of the sample data, not a known population parameter. This method
550     * is supplied only to save computation when the mean has already been
551     * computed.</p>
552     * <p>
553     * This formula will not return the same result as the unweighted variance when all
554     * weights are equal, unless all weights are equal to 1. The formula assumes that
555     * weights are to be treated as "expansion values," as will be the case if for example
556     * the weights represent frequency counts. To normalize weights so that the denominator
557     * in the variance computation equals the length of the input vector minus one, use <pre>
558     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
559     * </pre>
560     * <p>
561     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
562     * <p>
563     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
564     * <ul><li>the values array is null</li>
565     *     <li>the weights array is null</li>
566     *     <li>the weights array does not have the same length as the values array</li>
567     *     <li>the weights array contains one or more infinite values</li>
568     *     <li>the weights array contains one or more NaN values</li>
569     *     <li>the weights array contains negative values</li>
570     * </ul>
571     * <p>
572     * Does not change the internal state of the statistic.</p>
573     *
574     * @param values the input array
575     * @param weights the weights array
576     * @param mean the precomputed weighted mean value
577     * @return the variance of the values or Double.NaN if length = 0
578     * @throws MathIllegalArgumentException if the parameters are not valid
579     * @since 2.1
580     */
581    public double evaluate(final double[] values, final double[] weights, final double mean)
582        throws MathIllegalArgumentException {
583        return evaluate(values, weights, mean, 0, values.length);
584    }
585
586    /**
587     * @return Returns the isBiasCorrected.
588     */
589    public boolean isBiasCorrected() {
590        return isBiasCorrected;
591    }
592
593    /**
594     * @param biasCorrected The isBiasCorrected to set.
595     */
596    public void setBiasCorrected(boolean biasCorrected) {
597        this.isBiasCorrected = biasCorrected;
598    }
599
600    /**
601     * {@inheritDoc}
602     */
603    @Override
604    public Variance copy() {
605        Variance result = new Variance();
606        // No try-catch or advertised exception because parameters are guaranteed non-null
607        copy(this, result);
608        return result;
609    }
610
611    /**
612     * Copies source to dest.
613     * <p>Neither source nor dest can be null.</p>
614     *
615     * @param source Variance to copy
616     * @param dest Variance to copy to
617     * @throws NullArgumentException if either source or dest is null
618     */
619    public static void copy(Variance source, Variance dest)
620        throws NullArgumentException {
621        MathUtils.checkNotNull(source);
622        MathUtils.checkNotNull(dest);
623        dest.moment = source.moment.copy();
624        dest.isBiasCorrected = source.isBiasCorrected;
625        dest.incMoment = source.incMoment;
626    }
627}