001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.math4.legacy.stat.descriptive.moment;
018
019import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
020import org.apache.commons.math4.legacy.exception.NullArgumentException;
021import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
022import org.apache.commons.math4.legacy.stat.descriptive.AbstractStorelessUnivariateStatistic;
023import org.apache.commons.math4.legacy.stat.descriptive.WeightedEvaluation;
024import org.apache.commons.math4.legacy.core.MathArrays;
025
026/**
027 * Computes the variance of the available values.  By default, the unbiased
028 * "sample variance" definitional formula is used:
029 * <p>
030 * variance = sum((x_i - mean)^2) / (n - 1) </p>
031 * <p>
032 * where mean is the {@link Mean} and <code>n</code> is the number
033 * of sample observations.</p>
034 * <p>
035 * The definitional formula does not have good numerical properties, so
036 * this implementation does not compute the statistic using the definitional
037 * formula. <ul>
038 * <li> The <code>getResult</code> method computes the variance using
039 * updating formulas based on West's algorithm, as described in
040 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
041 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
042 * vol. 22 no. 9, pp. 526-531.</a></li>
043 * <li> The <code>evaluate</code> methods leverage the fact that they have the
044 * full array of values in memory to execute a two-pass algorithm.
045 * Specifically, these methods use the "corrected two-pass algorithm" from
046 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
047 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
048 * Note that adding values using <code>increment</code> or
049 * <code>incrementAll</code> and then executing <code>getResult</code> will
050 * sometimes give a different, less accurate, result than executing
051 * <code>evaluate</code> with the full array of values. The former approach
052 * should only be used when the full array of values is not available.
053 * <p>
054 * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
055 * be computed using this statistic.  The <code>isBiasCorrected</code>
056 * property determines whether the "population" or "sample" value is
057 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
058 * To compute population variances, set this property to <code>false.</code>
059 * </p>
060 * <p>
061 * <strong>Note that this implementation is not synchronized.</strong> If
062 * multiple threads access an instance of this class concurrently, and at least
063 * one of the threads invokes the <code>increment()</code> or
064 * <code>clear()</code> method, it must be synchronized externally.</p>
065 */
066public class Variance extends AbstractStorelessUnivariateStatistic implements WeightedEvaluation {
067    /** SecondMoment is used in incremental calculation of Variance. */
068    protected SecondMoment moment;
069
070    /**
071     * Whether or not {@link #increment(double)} should increment
072     * the internal second moment. When a Variance is constructed with an
073     * external SecondMoment as a constructor parameter, this property is
074     * set to false and increments must be applied to the second moment
075     * directly.
076     */
077    protected boolean incMoment = true;
078
079    /**
080     * Whether or not bias correction is applied when computing the
081     * value of the statistic. True means that bias is corrected.  See
082     * {@link Variance} for details on the formula.
083     */
084    private boolean isBiasCorrected = true;
085
086    /**
087     * Constructs a Variance with default (true) <code>isBiasCorrected</code>
088     * property.
089     */
090    public Variance() {
091        moment = new SecondMoment();
092    }
093
094    /**
095     * Constructs a Variance based on an external second moment.
096     * <p>
097     * When this constructor is used, the statistic may only be
098     * incremented via the moment, i.e., {@link #increment(double)}
099     * does nothing; whereas {@code m2.increment(value)} increments
100     * both {@code m2} and the Variance instance constructed from it.
101     *
102     * @param m2 the SecondMoment (Third or Fourth moments work here as well.)
103     */
104    public Variance(final SecondMoment m2) {
105        incMoment = false;
106        this.moment = m2;
107    }
108
109    /**
110     * Constructs a Variance with the specified <code>isBiasCorrected</code>
111     * property.
112     *
113     * @param isBiasCorrected  setting for bias correction - true means
114     * bias will be corrected and is equivalent to using the "no arg"
115     * constructor
116     */
117    public Variance(boolean isBiasCorrected) {
118        moment = new SecondMoment();
119        this.isBiasCorrected = isBiasCorrected;
120    }
121
122    /**
123     * Constructs a Variance with the specified <code>isBiasCorrected</code>
124     * property and the supplied external second moment.
125     *
126     * @param isBiasCorrected  setting for bias correction - true means
127     * bias will be corrected
128     * @param m2 the SecondMoment (Third or Fourth moments work
129     * here as well.)
130     */
131    public Variance(boolean isBiasCorrected, SecondMoment m2) {
132        incMoment = false;
133        this.moment = m2;
134        this.isBiasCorrected = isBiasCorrected;
135    }
136
137    /**
138     * Copy constructor, creates a new {@code Variance} identical
139     * to the {@code original}.
140     *
141     * @param original the {@code Variance} instance to copy
142     * @throws NullArgumentException if original is null
143     */
144    public Variance(Variance original) throws NullArgumentException {
145        copy(original, this);
146    }
147
148    /**
149     * {@inheritDoc}
150     * <p>If all values are available, it is more accurate to use
151     * {@link #evaluate(double[])} rather than adding values one at a time
152     * using this method and then executing {@link #getResult}, since
153     * <code>evaluate</code> leverages the fact that is has the full
154     * list of values together to execute a two-pass algorithm.
155     * See {@link Variance}.</p>
156     *
157     * <p>Note also that when {@link #Variance(SecondMoment)} is used to
158     * create a Variance, this method does nothing. In that case, the
159     * SecondMoment should be incremented directly.</p>
160     */
161    @Override
162    public void increment(final double d) {
163        if (incMoment) {
164            moment.increment(d);
165        }
166    }
167
168    /**
169     * {@inheritDoc}
170     */
171    @Override
172    public double getResult() {
173        if (moment.n == 0) {
174            return Double.NaN;
175        } else if (moment.n == 1) {
176            return 0d;
177        } else {
178            if (isBiasCorrected) {
179                return moment.m2 / (moment.n - 1d);
180            } else {
181                return moment.m2 / (moment.n);
182            }
183        }
184    }
185
186    /**
187     * {@inheritDoc}
188     */
189    @Override
190    public long getN() {
191        return moment.getN();
192    }
193
194    /**
195     * {@inheritDoc}
196     */
197    @Override
198    public void clear() {
199        if (incMoment) {
200            moment.clear();
201        }
202    }
203
204    /**
205     * Returns the variance of the entries in the input array, or
206     * <code>Double.NaN</code> if the array is empty.
207     * <p>
208     * See {@link Variance} for details on the computing algorithm.</p>
209     * <p>
210     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
211     * <p>
212     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
213     * <p>
214     * Does not change the internal state of the statistic.</p>
215     *
216     * @param values the input array
217     * @return the variance of the values or Double.NaN if length = 0
218     * @throws MathIllegalArgumentException if the array is null
219     */
220    @Override
221    public double evaluate(final double[] values) throws MathIllegalArgumentException {
222        if (values == null) {
223            throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
224        }
225        return evaluate(values, 0, values.length);
226    }
227
228    /**
229     * Returns the variance of the entries in the specified portion of
230     * the input array, or <code>Double.NaN</code> if the designated subarray
231     * is empty.  Note that Double.NaN may also be returned if the input
232     * includes NaN and / or infinite values.
233     * <p>
234     * See {@link Variance} for details on the computing algorithm.</p>
235     * <p>
236     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
237     * <p>
238     * Does not change the internal state of the statistic.</p>
239     * <p>
240     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
241     *
242     * @param values the input array
243     * @param begin index of the first array element to include
244     * @param length the number of elements to include
245     * @return the variance of the values or Double.NaN if length = 0
246     * @throws MathIllegalArgumentException if the array is null or the array index
247     *  parameters are not valid
248     */
249    @Override
250    public double evaluate(final double[] values, final int begin, final int length)
251        throws MathIllegalArgumentException {
252
253        double var = Double.NaN;
254
255        if (MathArrays.verifyValues(values, begin, length)) {
256            if (length == 1) {
257                var = 0.0;
258            } else if (length > 1) {
259                Mean mean = new Mean();
260                double m = mean.evaluate(values, begin, length);
261                var = evaluate(values, m, begin, length);
262            }
263        }
264        return var;
265    }
266
267    /**
268     * <p>Returns the weighted variance of the entries in the specified portion of
269     * the input array, or <code>Double.NaN</code> if the designated subarray
270     * is empty.</p>
271     * <p>
272     * Uses the formula <div style="white-space: pre"><code>
273     *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
274     * </code></div>
275     * where weightedMean is the weighted mean
276     * <p>
277     * This formula will not return the same result as the unweighted variance when all
278     * weights are equal, unless all weights are equal to 1. The formula assumes that
279     * weights are to be treated as "expansion values," as will be the case if for example
280     * the weights represent frequency counts. To normalize weights so that the denominator
281     * in the variance computation equals the length of the input vector minus one, use <pre>
282     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
283     * </pre>
284     * <p>
285     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
286     * <p>
287     * Throws <code>IllegalArgumentException</code> if any of the following are true:
288     * <ul><li>the values array is null</li>
289     *     <li>the weights array is null</li>
290     *     <li>the weights array does not have the same length as the values array</li>
291     *     <li>the weights array contains one or more infinite values</li>
292     *     <li>the weights array contains one or more NaN values</li>
293     *     <li>the weights array contains negative values</li>
294     *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
295     *     <li>the start and length arguments do not determine a valid array</li>
296     * </ul>
297     * <p>
298     * Does not change the internal state of the statistic.</p>
299     * <p>
300     * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
301     *
302     * @param values the input array
303     * @param weights the weights array
304     * @param begin index of the first array element to include
305     * @param length the number of elements to include
306     * @return the weighted variance of the values or Double.NaN if length = 0
307     * @throws MathIllegalArgumentException if the parameters are not valid
308     * @since 2.1
309     */
310    @Override
311    public double evaluate(final double[] values, final double[] weights,
312                           final int begin, final int length) throws MathIllegalArgumentException {
313
314        double var = Double.NaN;
315
316        if (MathArrays.verifyValues(values, weights, begin, length)) {
317            if (length == 1) {
318                var = 0.0;
319            } else if (length > 1) {
320                Mean mean = new Mean();
321                double m = mean.evaluate(values, weights, begin, length);
322                var = evaluate(values, weights, m, begin, length);
323            }
324        }
325        return var;
326    }
327
328    /**
329     * <p>
330     * Returns the weighted variance of the entries in the input array.</p>
331     * <p>
332     * Uses the formula <div style="white-space:pre"><code>
333     *   &Sigma;(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
334     * </code></div>
335     * where weightedMean is the weighted mean
336     * <p>
337     * This formula will not return the same result as the unweighted variance when all
338     * weights are equal, unless all weights are equal to 1. The formula assumes that
339     * weights are to be treated as "expansion values," as will be the case if for example
340     * the weights represent frequency counts. To normalize weights so that the denominator
341     * in the variance computation equals the length of the input vector minus one, use <pre>
342     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
343     * </pre>
344     * <p>
345     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
346     * <p>
347     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
348     * <ul><li>the values array is null</li>
349     *     <li>the weights array is null</li>
350     *     <li>the weights array does not have the same length as the values array</li>
351     *     <li>the weights array contains one or more infinite values</li>
352     *     <li>the weights array contains one or more NaN values</li>
353     *     <li>the weights array contains negative values</li>
354     *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
355     * </ul>
356     * <p>
357     * Does not change the internal state of the statistic.</p>
358     * <p>
359     * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
360     *
361     * @param values the input array
362     * @param weights the weights array
363     * @return the weighted variance of the values
364     * @throws MathIllegalArgumentException if the parameters are not valid
365     * @since 2.1
366     */
367    @Override
368    public double evaluate(final double[] values, final double[] weights)
369        throws MathIllegalArgumentException {
370        return evaluate(values, weights, 0, values.length);
371    }
372
373    /**
374     * Returns the variance of the entries in the specified portion of
375     * the input array, using the precomputed mean value.  Returns
376     * <code>Double.NaN</code> if the designated subarray is empty.
377     * <p>
378     * See {@link Variance} for details on the computing algorithm.</p>
379     * <p>
380     * The formula used assumes that the supplied mean value is the arithmetic
381     * mean of the sample data, not a known population parameter.  This method
382     * is supplied only to save computation when the mean has already been
383     * computed.</p>
384     * <p>
385     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
386     * <p>
387     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
388     * <p>
389     * Does not change the internal state of the statistic.</p>
390     *
391     * @param values the input array
392     * @param mean the precomputed mean value
393     * @param begin index of the first array element to include
394     * @param length the number of elements to include
395     * @return the variance of the values or Double.NaN if length = 0
396     * @throws MathIllegalArgumentException if the array is null or the array index
397     *  parameters are not valid
398     */
399    public double evaluate(final double[] values, final double mean,
400                           final int begin, final int length) throws MathIllegalArgumentException {
401
402        double var = Double.NaN;
403
404        if (MathArrays.verifyValues(values, begin, length)) {
405            if (length == 1) {
406                var = 0.0;
407            } else if (length > 1) {
408                double accum = 0.0;
409                double dev = 0.0;
410                double accum2 = 0.0;
411                for (int i = begin; i < begin + length; i++) {
412                    dev = values[i] - mean;
413                    accum += dev * dev;
414                    accum2 += dev;
415                }
416                double len = length;
417                if (isBiasCorrected) {
418                    var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
419                } else {
420                    var = (accum - (accum2 * accum2 / len)) / len;
421                }
422            }
423        }
424        return var;
425    }
426
427    /**
428     * Returns the variance of the entries in the input array, using the
429     * precomputed mean value.  Returns <code>Double.NaN</code> if the array
430     * is empty.
431     * <p>
432     * See {@link Variance} for details on the computing algorithm.</p>
433     * <p>
434     * If <code>isBiasCorrected</code> is <code>true</code> the formula used
435     * assumes that the supplied mean value is the arithmetic mean of the
436     * sample data, not a known population parameter.  If the mean is a known
437     * population parameter, or if the "population" version of the variance is
438     * desired, set <code>isBiasCorrected</code> to <code>false</code> before
439     * invoking this method.</p>
440     * <p>
441     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
442     * <p>
443     * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
444     * <p>
445     * Does not change the internal state of the statistic.</p>
446     *
447     * @param values the input array
448     * @param mean the precomputed mean value
449     * @return the variance of the values or Double.NaN if the array is empty
450     * @throws MathIllegalArgumentException if the array is null
451     */
452    public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
453        return evaluate(values, mean, 0, values.length);
454    }
455
456    /**
457     * Returns the weighted variance of the entries in the specified portion of
458     * the input array, using the precomputed weighted mean value.  Returns
459     * <code>Double.NaN</code> if the designated subarray is empty.
460     * <p>
461     * Uses the formula <div style="white-space:pre"><code>
462     *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
463     * </code></div>
464     * <p>
465     * The formula used assumes that the supplied mean value is the weighted arithmetic
466     * mean of the sample data, not a known population parameter. This method
467     * is supplied only to save computation when the mean has already been
468     * computed.</p>
469     * <p>
470     * This formula will not return the same result as the unweighted variance when all
471     * weights are equal, unless all weights are equal to 1. The formula assumes that
472     * weights are to be treated as "expansion values," as will be the case if for example
473     * the weights represent frequency counts. To normalize weights so that the denominator
474     * in the variance computation equals the length of the input vector minus one, use <pre>
475     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
476     * </pre>
477     * <p>
478     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
479     * <p>
480     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
481     * <ul><li>the values array is null</li>
482     *     <li>the weights array is null</li>
483     *     <li>the weights array does not have the same length as the values array</li>
484     *     <li>the weights array contains one or more infinite values</li>
485     *     <li>the weights array contains one or more NaN values</li>
486     *     <li>the weights array contains negative values</li>
487     *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
488     *     <li>the start and length arguments do not determine a valid array</li>
489     * </ul>
490     * <p>
491     * Does not change the internal state of the statistic.</p>
492     *
493     * @param values the input array
494     * @param weights the weights array
495     * @param mean the precomputed weighted mean value
496     * @param begin index of the first array element to include
497     * @param length the number of elements to include
498     * @return the variance of the values or Double.NaN if length = 0
499     * @throws MathIllegalArgumentException if the parameters are not valid
500     * @since 2.1
501     */
502    public double evaluate(final double[] values, final double[] weights,
503                           final double mean, final int begin, final int length)
504        throws MathIllegalArgumentException {
505
506        double var = Double.NaN;
507
508        if (MathArrays.verifyValues(values, weights, begin, length)) {
509            if (length == 1) {
510                var = 0.0;
511            } else if (length > 1) {
512                double accum = 0.0;
513                double dev = 0.0;
514                double accum2 = 0.0;
515                for (int i = begin; i < begin + length; i++) {
516                    dev = values[i] - mean;
517                    accum += weights[i] * (dev * dev);
518                    accum2 += weights[i] * dev;
519                }
520
521                double sumWts = 0;
522                for (int i = begin; i < begin + length; i++) {
523                    sumWts += weights[i];
524                }
525
526                if (isBiasCorrected) {
527                    // Note: For this to be valid the weights should correspond to counts
528                    // of each observation where the weights are positive integers; the
529                    // sum of the weights is the total number of observations and should
530                    // be at least 2.
531                    var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
532                } else {
533                    var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
534                }
535            }
536        }
537        return var;
538    }
539
540    /**
541     * <p>Returns the weighted variance of the values in the input array, using
542     * the precomputed weighted mean value.</p>
543     * <p>
544     * Uses the formula <div style="white-space:pre"><code>
545     *   &Sigma;(weights[i]*(values[i] - mean)<sup>2</sup>)/(&Sigma;(weights[i]) - 1)
546     * </code></div>
547     * <p>
548     * The formula used assumes that the supplied mean value is the weighted arithmetic
549     * mean of the sample data, not a known population parameter. This method
550     * is supplied only to save computation when the mean has already been
551     * computed.</p>
552     * <p>
553     * This formula will not return the same result as the unweighted variance when all
554     * weights are equal, unless all weights are equal to 1. The formula assumes that
555     * weights are to be treated as "expansion values," as will be the case if for example
556     * the weights represent frequency counts. To normalize weights so that the denominator
557     * in the variance computation equals the length of the input vector minus one, use <pre>
558     *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
559     * </pre>
560     * <p>
561     * Returns 0 for a single-value (i.e. length = 1) sample.</p>
562     * <p>
563     * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
564     * <ul><li>the values array is null</li>
565     *     <li>the weights array is null</li>
566     *     <li>the weights array does not have the same length as the values array</li>
567     *     <li>the weights array contains one or more infinite values</li>
568     *     <li>the weights array contains one or more NaN values</li>
569     *     <li>the weights array contains negative values</li>
570     *     <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
571     * </ul>
572     * <p>
573     * Does not change the internal state of the statistic.</p>
574     *
575     * @param values the input array
576     * @param weights the weights array
577     * @param mean the precomputed weighted mean value
578     * @return the variance of the values or Double.NaN if length = 0
579     * @throws MathIllegalArgumentException if the parameters are not valid
580     * @since 2.1
581     */
582    public double evaluate(final double[] values, final double[] weights, final double mean)
583        throws MathIllegalArgumentException {
584        return evaluate(values, weights, mean, 0, values.length);
585    }
586
587    /**
588     * @return Returns the isBiasCorrected.
589     */
590    public boolean isBiasCorrected() {
591        return isBiasCorrected;
592    }
593
594    /**
595     * @param biasCorrected The isBiasCorrected to set.
596     */
597    public void setBiasCorrected(boolean biasCorrected) {
598        this.isBiasCorrected = biasCorrected;
599    }
600
601    /**
602     * {@inheritDoc}
603     */
604    @Override
605    public Variance copy() {
606        Variance result = new Variance();
607        // No try-catch or advertised exception because parameters are guaranteed non-null
608        copy(this, result);
609        return result;
610    }
611
612    /**
613     * Copies source to dest.
614     * <p>Neither source nor dest can be null.</p>
615     *
616     * @param source Variance to copy
617     * @param dest Variance to copy to
618     * @throws NullArgumentException if either source or dest is null
619     */
620    public static void copy(Variance source, Variance dest)
621        throws NullArgumentException {
622        NullArgumentException.check(source);
623        NullArgumentException.check(dest);
624        dest.moment = source.moment.copy();
625        dest.isBiasCorrected = source.isBiasCorrected;
626        dest.incMoment = source.incMoment;
627    }
628}