1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.math4.legacy.stat.descriptive.moment;
18
19 import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
20 import org.apache.commons.math4.legacy.exception.NullArgumentException;
21 import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
22 import org.apache.commons.math4.legacy.stat.descriptive.AbstractStorelessUnivariateStatistic;
23 import org.apache.commons.math4.legacy.stat.descriptive.WeightedEvaluation;
24 import org.apache.commons.math4.legacy.core.MathArrays;
25
26 /**
27 * Computes the variance of the available values. By default, the unbiased
28 * "sample variance" definitional formula is used:
29 * <p>
30 * variance = sum((x_i - mean)^2) / (n - 1) </p>
31 * <p>
32 * where mean is the {@link Mean} and <code>n</code> is the number
33 * of sample observations.</p>
34 * <p>
35 * The definitional formula does not have good numerical properties, so
36 * this implementation does not compute the statistic using the definitional
37 * formula. <ul>
38 * <li> The <code>getResult</code> method computes the variance using
39 * updating formulas based on West's algorithm, as described in
40 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
41 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
42 * vol. 22 no. 9, pp. 526-531.</a></li>
43 * <li> The <code>evaluate</code> methods leverage the fact that they have the
44 * full array of values in memory to execute a two-pass algorithm.
45 * Specifically, these methods use the "corrected two-pass algorithm" from
46 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
47 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
48 * Note that adding values using <code>increment</code> or
49 * <code>incrementAll</code> and then executing <code>getResult</code> will
50 * sometimes give a different, less accurate, result than executing
51 * <code>evaluate</code> with the full array of values. The former approach
52 * should only be used when the full array of values is not available.
53 * <p>
54 * The "population variance" ( sum((x_i - mean)^2) / n ) can also
55 * be computed using this statistic. The <code>isBiasCorrected</code>
56 * property determines whether the "population" or "sample" value is
57 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
58 * To compute population variances, set this property to <code>false.</code>
59 * </p>
60 * <p>
61 * <strong>Note that this implementation is not synchronized.</strong> If
62 * multiple threads access an instance of this class concurrently, and at least
63 * one of the threads invokes the <code>increment()</code> or
64 * <code>clear()</code> method, it must be synchronized externally.</p>
65 */
66 public class Variance extends AbstractStorelessUnivariateStatistic implements WeightedEvaluation {
67 /** SecondMoment is used in incremental calculation of Variance. */
68 protected SecondMoment moment;
69
70 /**
71 * Whether or not {@link #increment(double)} should increment
72 * the internal second moment. When a Variance is constructed with an
73 * external SecondMoment as a constructor parameter, this property is
74 * set to false and increments must be applied to the second moment
75 * directly.
76 */
77 protected boolean incMoment = true;
78
79 /**
80 * Whether or not bias correction is applied when computing the
81 * value of the statistic. True means that bias is corrected. See
82 * {@link Variance} for details on the formula.
83 */
84 private boolean isBiasCorrected = true;
85
86 /**
87 * Constructs a Variance with default (true) <code>isBiasCorrected</code>
88 * property.
89 */
90 public Variance() {
91 moment = new SecondMoment();
92 }
93
94 /**
95 * Constructs a Variance based on an external second moment.
96 * <p>
97 * When this constructor is used, the statistic may only be
98 * incremented via the moment, i.e., {@link #increment(double)}
99 * does nothing; whereas {@code m2.increment(value)} increments
100 * both {@code m2} and the Variance instance constructed from it.
101 *
102 * @param m2 the SecondMoment (Third or Fourth moments work here as well.)
103 */
104 public Variance(final SecondMoment m2) {
105 incMoment = false;
106 this.moment = m2;
107 }
108
109 /**
110 * Constructs a Variance with the specified <code>isBiasCorrected</code>
111 * property.
112 *
113 * @param isBiasCorrected setting for bias correction - true means
114 * bias will be corrected and is equivalent to using the "no arg"
115 * constructor
116 */
117 public Variance(boolean isBiasCorrected) {
118 moment = new SecondMoment();
119 this.isBiasCorrected = isBiasCorrected;
120 }
121
122 /**
123 * Constructs a Variance with the specified <code>isBiasCorrected</code>
124 * property and the supplied external second moment.
125 *
126 * @param isBiasCorrected setting for bias correction - true means
127 * bias will be corrected
128 * @param m2 the SecondMoment (Third or Fourth moments work
129 * here as well.)
130 */
131 public Variance(boolean isBiasCorrected, SecondMoment m2) {
132 incMoment = false;
133 this.moment = m2;
134 this.isBiasCorrected = isBiasCorrected;
135 }
136
137 /**
138 * Copy constructor, creates a new {@code Variance} identical
139 * to the {@code original}.
140 *
141 * @param original the {@code Variance} instance to copy
142 * @throws NullArgumentException if original is null
143 */
144 public Variance(Variance original) throws NullArgumentException {
145 copy(original, this);
146 }
147
148 /**
149 * {@inheritDoc}
150 * <p>If all values are available, it is more accurate to use
151 * {@link #evaluate(double[])} rather than adding values one at a time
152 * using this method and then executing {@link #getResult}, since
153 * <code>evaluate</code> leverages the fact that is has the full
154 * list of values together to execute a two-pass algorithm.
155 * See {@link Variance}.</p>
156 *
157 * <p>Note also that when {@link #Variance(SecondMoment)} is used to
158 * create a Variance, this method does nothing. In that case, the
159 * SecondMoment should be incremented directly.</p>
160 */
161 @Override
162 public void increment(final double d) {
163 if (incMoment) {
164 moment.increment(d);
165 }
166 }
167
168 /**
169 * {@inheritDoc}
170 */
171 @Override
172 public double getResult() {
173 if (moment.n == 0) {
174 return Double.NaN;
175 } else if (moment.n == 1) {
176 return 0d;
177 } else {
178 if (isBiasCorrected) {
179 return moment.m2 / (moment.n - 1d);
180 } else {
181 return moment.m2 / (moment.n);
182 }
183 }
184 }
185
186 /**
187 * {@inheritDoc}
188 */
189 @Override
190 public long getN() {
191 return moment.getN();
192 }
193
194 /**
195 * {@inheritDoc}
196 */
197 @Override
198 public void clear() {
199 if (incMoment) {
200 moment.clear();
201 }
202 }
203
204 /**
205 * Returns the variance of the entries in the input array, or
206 * <code>Double.NaN</code> if the array is empty.
207 * <p>
208 * See {@link Variance} for details on the computing algorithm.</p>
209 * <p>
210 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
211 * <p>
212 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
213 * <p>
214 * Does not change the internal state of the statistic.</p>
215 *
216 * @param values the input array
217 * @return the variance of the values or Double.NaN if length = 0
218 * @throws MathIllegalArgumentException if the array is null
219 */
220 @Override
221 public double evaluate(final double[] values) throws MathIllegalArgumentException {
222 if (values == null) {
223 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
224 }
225 return evaluate(values, 0, values.length);
226 }
227
228 /**
229 * Returns the variance of the entries in the specified portion of
230 * the input array, or <code>Double.NaN</code> if the designated subarray
231 * is empty. Note that Double.NaN may also be returned if the input
232 * includes NaN and / or infinite values.
233 * <p>
234 * See {@link Variance} for details on the computing algorithm.</p>
235 * <p>
236 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
237 * <p>
238 * Does not change the internal state of the statistic.</p>
239 * <p>
240 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
241 *
242 * @param values the input array
243 * @param begin index of the first array element to include
244 * @param length the number of elements to include
245 * @return the variance of the values or Double.NaN if length = 0
246 * @throws MathIllegalArgumentException if the array is null or the array index
247 * parameters are not valid
248 */
249 @Override
250 public double evaluate(final double[] values, final int begin, final int length)
251 throws MathIllegalArgumentException {
252
253 double var = Double.NaN;
254
255 if (MathArrays.verifyValues(values, begin, length)) {
256 if (length == 1) {
257 var = 0.0;
258 } else if (length > 1) {
259 Mean mean = new Mean();
260 double m = mean.evaluate(values, begin, length);
261 var = evaluate(values, m, begin, length);
262 }
263 }
264 return var;
265 }
266
267 /**
268 * <p>Returns the weighted variance of the entries in the specified portion of
269 * the input array, or <code>Double.NaN</code> if the designated subarray
270 * is empty.</p>
271 * <p>
272 * Uses the formula <div style="white-space: pre"><code>
273 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
274 * </code></div>
275 * where weightedMean is the weighted mean
276 * <p>
277 * This formula will not return the same result as the unweighted variance when all
278 * weights are equal, unless all weights are equal to 1. The formula assumes that
279 * weights are to be treated as "expansion values," as will be the case if for example
280 * the weights represent frequency counts. To normalize weights so that the denominator
281 * in the variance computation equals the length of the input vector minus one, use <pre>
282 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
283 * </pre>
284 * <p>
285 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
286 * <p>
287 * Throws <code>IllegalArgumentException</code> if any of the following are true:
288 * <ul><li>the values array is null</li>
289 * <li>the weights array is null</li>
290 * <li>the weights array does not have the same length as the values array</li>
291 * <li>the weights array contains one or more infinite values</li>
292 * <li>the weights array contains one or more NaN values</li>
293 * <li>the weights array contains negative values</li>
294 * <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
295 * <li>the start and length arguments do not determine a valid array</li>
296 * </ul>
297 * <p>
298 * Does not change the internal state of the statistic.</p>
299 * <p>
300 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
301 *
302 * @param values the input array
303 * @param weights the weights array
304 * @param begin index of the first array element to include
305 * @param length the number of elements to include
306 * @return the weighted variance of the values or Double.NaN if length = 0
307 * @throws MathIllegalArgumentException if the parameters are not valid
308 * @since 2.1
309 */
310 @Override
311 public double evaluate(final double[] values, final double[] weights,
312 final int begin, final int length) throws MathIllegalArgumentException {
313
314 double var = Double.NaN;
315
316 if (MathArrays.verifyValues(values, weights, begin, length)) {
317 if (length == 1) {
318 var = 0.0;
319 } else if (length > 1) {
320 Mean mean = new Mean();
321 double m = mean.evaluate(values, weights, begin, length);
322 var = evaluate(values, weights, m, begin, length);
323 }
324 }
325 return var;
326 }
327
328 /**
329 * <p>
330 * Returns the weighted variance of the entries in the input array.</p>
331 * <p>
332 * Uses the formula <div style="white-space:pre"><code>
333 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
334 * </code></div>
335 * where weightedMean is the weighted mean
336 * <p>
337 * This formula will not return the same result as the unweighted variance when all
338 * weights are equal, unless all weights are equal to 1. The formula assumes that
339 * weights are to be treated as "expansion values," as will be the case if for example
340 * the weights represent frequency counts. To normalize weights so that the denominator
341 * in the variance computation equals the length of the input vector minus one, use <pre>
342 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
343 * </pre>
344 * <p>
345 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
346 * <p>
347 * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
348 * <ul><li>the values array is null</li>
349 * <li>the weights array is null</li>
350 * <li>the weights array does not have the same length as the values array</li>
351 * <li>the weights array contains one or more infinite values</li>
352 * <li>the weights array contains one or more NaN values</li>
353 * <li>the weights array contains negative values</li>
354 * <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
355 * </ul>
356 * <p>
357 * Does not change the internal state of the statistic.</p>
358 * <p>
359 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
360 *
361 * @param values the input array
362 * @param weights the weights array
363 * @return the weighted variance of the values
364 * @throws MathIllegalArgumentException if the parameters are not valid
365 * @since 2.1
366 */
367 @Override
368 public double evaluate(final double[] values, final double[] weights)
369 throws MathIllegalArgumentException {
370 return evaluate(values, weights, 0, values.length);
371 }
372
373 /**
374 * Returns the variance of the entries in the specified portion of
375 * the input array, using the precomputed mean value. Returns
376 * <code>Double.NaN</code> if the designated subarray is empty.
377 * <p>
378 * See {@link Variance} for details on the computing algorithm.</p>
379 * <p>
380 * The formula used assumes that the supplied mean value is the arithmetic
381 * mean of the sample data, not a known population parameter. This method
382 * is supplied only to save computation when the mean has already been
383 * computed.</p>
384 * <p>
385 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
386 * <p>
387 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
388 * <p>
389 * Does not change the internal state of the statistic.</p>
390 *
391 * @param values the input array
392 * @param mean the precomputed mean value
393 * @param begin index of the first array element to include
394 * @param length the number of elements to include
395 * @return the variance of the values or Double.NaN if length = 0
396 * @throws MathIllegalArgumentException if the array is null or the array index
397 * parameters are not valid
398 */
399 public double evaluate(final double[] values, final double mean,
400 final int begin, final int length) throws MathIllegalArgumentException {
401
402 double var = Double.NaN;
403
404 if (MathArrays.verifyValues(values, begin, length)) {
405 if (length == 1) {
406 var = 0.0;
407 } else if (length > 1) {
408 double accum = 0.0;
409 double dev = 0.0;
410 double accum2 = 0.0;
411 for (int i = begin; i < begin + length; i++) {
412 dev = values[i] - mean;
413 accum += dev * dev;
414 accum2 += dev;
415 }
416 double len = length;
417 if (isBiasCorrected) {
418 var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
419 } else {
420 var = (accum - (accum2 * accum2 / len)) / len;
421 }
422 }
423 }
424 return var;
425 }
426
427 /**
428 * Returns the variance of the entries in the input array, using the
429 * precomputed mean value. Returns <code>Double.NaN</code> if the array
430 * is empty.
431 * <p>
432 * See {@link Variance} for details on the computing algorithm.</p>
433 * <p>
434 * If <code>isBiasCorrected</code> is <code>true</code> the formula used
435 * assumes that the supplied mean value is the arithmetic mean of the
436 * sample data, not a known population parameter. If the mean is a known
437 * population parameter, or if the "population" version of the variance is
438 * desired, set <code>isBiasCorrected</code> to <code>false</code> before
439 * invoking this method.</p>
440 * <p>
441 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
442 * <p>
443 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
444 * <p>
445 * Does not change the internal state of the statistic.</p>
446 *
447 * @param values the input array
448 * @param mean the precomputed mean value
449 * @return the variance of the values or Double.NaN if the array is empty
450 * @throws MathIllegalArgumentException if the array is null
451 */
452 public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
453 return evaluate(values, mean, 0, values.length);
454 }
455
456 /**
457 * Returns the weighted variance of the entries in the specified portion of
458 * the input array, using the precomputed weighted mean value. Returns
459 * <code>Double.NaN</code> if the designated subarray is empty.
460 * <p>
461 * Uses the formula <div style="white-space:pre"><code>
462 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
463 * </code></div>
464 * <p>
465 * The formula used assumes that the supplied mean value is the weighted arithmetic
466 * mean of the sample data, not a known population parameter. This method
467 * is supplied only to save computation when the mean has already been
468 * computed.</p>
469 * <p>
470 * This formula will not return the same result as the unweighted variance when all
471 * weights are equal, unless all weights are equal to 1. The formula assumes that
472 * weights are to be treated as "expansion values," as will be the case if for example
473 * the weights represent frequency counts. To normalize weights so that the denominator
474 * in the variance computation equals the length of the input vector minus one, use <pre>
475 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
476 * </pre>
477 * <p>
478 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
479 * <p>
480 * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
481 * <ul><li>the values array is null</li>
482 * <li>the weights array is null</li>
483 * <li>the weights array does not have the same length as the values array</li>
484 * <li>the weights array contains one or more infinite values</li>
485 * <li>the weights array contains one or more NaN values</li>
486 * <li>the weights array contains negative values</li>
487 * <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
488 * <li>the start and length arguments do not determine a valid array</li>
489 * </ul>
490 * <p>
491 * Does not change the internal state of the statistic.</p>
492 *
493 * @param values the input array
494 * @param weights the weights array
495 * @param mean the precomputed weighted mean value
496 * @param begin index of the first array element to include
497 * @param length the number of elements to include
498 * @return the variance of the values or Double.NaN if length = 0
499 * @throws MathIllegalArgumentException if the parameters are not valid
500 * @since 2.1
501 */
502 public double evaluate(final double[] values, final double[] weights,
503 final double mean, final int begin, final int length)
504 throws MathIllegalArgumentException {
505
506 double var = Double.NaN;
507
508 if (MathArrays.verifyValues(values, weights, begin, length)) {
509 if (length == 1) {
510 var = 0.0;
511 } else if (length > 1) {
512 double accum = 0.0;
513 double dev = 0.0;
514 double accum2 = 0.0;
515 for (int i = begin; i < begin + length; i++) {
516 dev = values[i] - mean;
517 accum += weights[i] * (dev * dev);
518 accum2 += weights[i] * dev;
519 }
520
521 double sumWts = 0;
522 for (int i = begin; i < begin + length; i++) {
523 sumWts += weights[i];
524 }
525
526 if (isBiasCorrected) {
527 // Note: For this to be valid the weights should correspond to counts
528 // of each observation where the weights are positive integers; the
529 // sum of the weights is the total number of observations and should
530 // be at least 2.
531 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
532 } else {
533 var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
534 }
535 }
536 }
537 return var;
538 }
539
540 /**
541 * <p>Returns the weighted variance of the values in the input array, using
542 * the precomputed weighted mean value.</p>
543 * <p>
544 * Uses the formula <div style="white-space:pre"><code>
545 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
546 * </code></div>
547 * <p>
548 * The formula used assumes that the supplied mean value is the weighted arithmetic
549 * mean of the sample data, not a known population parameter. This method
550 * is supplied only to save computation when the mean has already been
551 * computed.</p>
552 * <p>
553 * This formula will not return the same result as the unweighted variance when all
554 * weights are equal, unless all weights are equal to 1. The formula assumes that
555 * weights are to be treated as "expansion values," as will be the case if for example
556 * the weights represent frequency counts. To normalize weights so that the denominator
557 * in the variance computation equals the length of the input vector minus one, use <pre>
558 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
559 * </pre>
560 * <p>
561 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
562 * <p>
563 * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
564 * <ul><li>the values array is null</li>
565 * <li>the weights array is null</li>
566 * <li>the weights array does not have the same length as the values array</li>
567 * <li>the weights array contains one or more infinite values</li>
568 * <li>the weights array contains one or more NaN values</li>
569 * <li>the weights array contains negative values</li>
570 * <li>the weights array does not contain at least one non-zero value (applies when length is non zero)</li>
571 * </ul>
572 * <p>
573 * Does not change the internal state of the statistic.</p>
574 *
575 * @param values the input array
576 * @param weights the weights array
577 * @param mean the precomputed weighted mean value
578 * @return the variance of the values or Double.NaN if length = 0
579 * @throws MathIllegalArgumentException if the parameters are not valid
580 * @since 2.1
581 */
582 public double evaluate(final double[] values, final double[] weights, final double mean)
583 throws MathIllegalArgumentException {
584 return evaluate(values, weights, mean, 0, values.length);
585 }
586
587 /**
588 * @return Returns the isBiasCorrected.
589 */
590 public boolean isBiasCorrected() {
591 return isBiasCorrected;
592 }
593
594 /**
595 * @param biasCorrected The isBiasCorrected to set.
596 */
597 public void setBiasCorrected(boolean biasCorrected) {
598 this.isBiasCorrected = biasCorrected;
599 }
600
601 /**
602 * {@inheritDoc}
603 */
604 @Override
605 public Variance copy() {
606 Variance result = new Variance();
607 // No try-catch or advertised exception because parameters are guaranteed non-null
608 copy(this, result);
609 return result;
610 }
611
612 /**
613 * Copies source to dest.
614 * <p>Neither source nor dest can be null.</p>
615 *
616 * @param source Variance to copy
617 * @param dest Variance to copy to
618 * @throws NullArgumentException if either source or dest is null
619 */
620 public static void copy(Variance source, Variance dest)
621 throws NullArgumentException {
622 NullArgumentException.check(source);
623 NullArgumentException.check(dest);
624 dest.moment = source.moment.copy();
625 dest.isBiasCorrected = source.isBiasCorrected;
626 dest.incMoment = source.incMoment;
627 }
628 }