001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math.stat.descriptive.moment;
018
019 import java.io.Serializable;
020
021 import org.apache.commons.math.exception.NullArgumentException;
022 import org.apache.commons.math.exception.util.LocalizedFormats;
023 import org.apache.commons.math.stat.descriptive.WeightedEvaluation;
024 import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic;
025 import org.apache.commons.math.util.MathUtils;
026
027 /**
028 * Computes the variance of the available values. By default, the unbiased
029 * "sample variance" definitional formula is used:
030 * <p>
031 * variance = sum((x_i - mean)^2) / (n - 1) </p>
032 * <p>
033 * where mean is the {@link Mean} and <code>n</code> is the number
034 * of sample observations.</p>
035 * <p>
036 * The definitional formula does not have good numerical properties, so
037 * this implementation does not compute the statistic using the definitional
038 * formula. <ul>
039 * <li> The <code>getResult</code> method computes the variance using
040 * updating formulas based on West's algorithm, as described in
041 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
042 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
043 * vol. 22 no. 9, pp. 526-531.</a></li>
044 * <li> The <code>evaluate</code> methods leverage the fact that they have the
045 * full array of values in memory to execute a two-pass algorithm.
046 * Specifically, these methods use the "corrected two-pass algorithm" from
047 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
048 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
049 * Note that adding values using <code>increment</code> or
050 * <code>incrementAll</code> and then executing <code>getResult</code> will
051 * sometimes give a different, less accurate, result than executing
052 * <code>evaluate</code> with the full array of values. The former approach
053 * should only be used when the full array of values is not available.</p>
054 * <p>
055 * The "population variance" ( sum((x_i - mean)^2) / n ) can also
056 * be computed using this statistic. The <code>isBiasCorrected</code>
057 * property determines whether the "population" or "sample" value is
058 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
059 * To compute population variances, set this property to <code>false.</code>
060 * </p>
061 * <p>
062 * <strong>Note that this implementation is not synchronized.</strong> If
063 * multiple threads access an instance of this class concurrently, and at least
064 * one of the threads invokes the <code>increment()</code> or
065 * <code>clear()</code> method, it must be synchronized externally.</p>
066 *
067 * @version $Id: Variance.java 1182147 2011-10-11 23:37:57Z erans $
068 */
069 public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
070
071 /** Serializable version identifier */
072 private static final long serialVersionUID = -9111962718267217978L;
073
074 /** SecondMoment is used in incremental calculation of Variance*/
075 protected SecondMoment moment = null;
076
077 /**
078 * Boolean test to determine if this Variance should also increment
079 * the second moment, this evaluates to false when this Variance is
080 * constructed with an external SecondMoment as a parameter.
081 */
082 protected boolean incMoment = true;
083
084 /**
085 * Determines whether or not bias correction is applied when computing the
086 * value of the statisic. True means that bias is corrected. See
087 * {@link Variance} for details on the formula.
088 */
089 private boolean isBiasCorrected = true;
090
091 /**
092 * Constructs a Variance with default (true) <code>isBiasCorrected</code>
093 * property.
094 */
095 public Variance() {
096 moment = new SecondMoment();
097 }
098
099 /**
100 * Constructs a Variance based on an external second moment.
101 *
102 * @param m2 the SecondMoment (Third or Fourth moments work
103 * here as well.)
104 */
105 public Variance(final SecondMoment m2) {
106 incMoment = false;
107 this.moment = m2;
108 }
109
110 /**
111 * Constructs a Variance with the specified <code>isBiasCorrected</code>
112 * property
113 *
114 * @param isBiasCorrected setting for bias correction - true means
115 * bias will be corrected and is equivalent to using the argumentless
116 * constructor
117 */
118 public Variance(boolean isBiasCorrected) {
119 moment = new SecondMoment();
120 this.isBiasCorrected = isBiasCorrected;
121 }
122
123 /**
124 * Constructs a Variance with the specified <code>isBiasCorrected</code>
125 * property and the supplied external second moment.
126 *
127 * @param isBiasCorrected setting for bias correction - true means
128 * bias will be corrected
129 * @param m2 the SecondMoment (Third or Fourth moments work
130 * here as well.)
131 */
132 public Variance(boolean isBiasCorrected, SecondMoment m2) {
133 incMoment = false;
134 this.moment = m2;
135 this.isBiasCorrected = isBiasCorrected;
136 }
137
138 /**
139 * Copy constructor, creates a new {@code Variance} identical
140 * to the {@code original}
141 *
142 * @param original the {@code Variance} instance to copy
143 */
144 public Variance(Variance original) {
145 copy(original, this);
146 }
147
148 /**
149 * {@inheritDoc}
150 * <p>If all values are available, it is more accurate to use
151 * {@link #evaluate(double[])} rather than adding values one at a time
152 * using this method and then executing {@link #getResult}, since
153 * <code>evaluate</code> leverages the fact that is has the full
154 * list of values together to execute a two-pass algorithm.
155 * See {@link Variance}.</p>
156 */
157 @Override
158 public void increment(final double d) {
159 if (incMoment) {
160 moment.increment(d);
161 }
162 }
163
164 /**
165 * {@inheritDoc}
166 */
167 @Override
168 public double getResult() {
169 if (moment.n == 0) {
170 return Double.NaN;
171 } else if (moment.n == 1) {
172 return 0d;
173 } else {
174 if (isBiasCorrected) {
175 return moment.m2 / (moment.n - 1d);
176 } else {
177 return moment.m2 / (moment.n);
178 }
179 }
180 }
181
182 /**
183 * {@inheritDoc}
184 */
185 public long getN() {
186 return moment.getN();
187 }
188
189 /**
190 * {@inheritDoc}
191 */
192 @Override
193 public void clear() {
194 if (incMoment) {
195 moment.clear();
196 }
197 }
198
199 /**
200 * Returns the variance of the entries in the input array, or
201 * <code>Double.NaN</code> if the array is empty.
202 * <p>
203 * See {@link Variance} for details on the computing algorithm.</p>
204 * <p>
205 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
206 * <p>
207 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
208 * <p>
209 * Does not change the internal state of the statistic.</p>
210 *
211 * @param values the input array
212 * @return the variance of the values or Double.NaN if length = 0
213 * @throws IllegalArgumentException if the array is null
214 */
215 @Override
216 public double evaluate(final double[] values) {
217 if (values == null) {
218 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
219 }
220 return evaluate(values, 0, values.length);
221 }
222
223 /**
224 * Returns the variance of the entries in the specified portion of
225 * the input array, or <code>Double.NaN</code> if the designated subarray
226 * is empty.
227 * <p>
228 * See {@link Variance} for details on the computing algorithm.</p>
229 * <p>
230 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
231 * <p>
232 * Does not change the internal state of the statistic.</p>
233 * <p>
234 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
235 *
236 * @param values the input array
237 * @param begin index of the first array element to include
238 * @param length the number of elements to include
239 * @return the variance of the values or Double.NaN if length = 0
240 * @throws IllegalArgumentException if the array is null or the array index
241 * parameters are not valid
242 */
243 @Override
244 public double evaluate(final double[] values, final int begin, final int length) {
245
246 double var = Double.NaN;
247
248 if (test(values, begin, length)) {
249 clear();
250 if (length == 1) {
251 var = 0.0;
252 } else if (length > 1) {
253 Mean mean = new Mean();
254 double m = mean.evaluate(values, begin, length);
255 var = evaluate(values, m, begin, length);
256 }
257 }
258 return var;
259 }
260
261 /**
262 * <p>Returns the weighted variance of the entries in the specified portion of
263 * the input array, or <code>Double.NaN</code> if the designated subarray
264 * is empty.</p>
265 * <p>
266 * Uses the formula <pre>
267 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
268 * </pre>
269 * where weightedMean is the weighted mean</p>
270 * <p>
271 * This formula will not return the same result as the unweighted variance when all
272 * weights are equal, unless all weights are equal to 1. The formula assumes that
273 * weights are to be treated as "expansion values," as will be the case if for example
274 * the weights represent frequency counts. To normalize weights so that the denominator
275 * in the variance computation equals the length of the input vector minus one, use <pre>
276 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
277 * </pre>
278 * <p>
279 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
280 * <p>
281 * Throws <code>IllegalArgumentException</code> if any of the following are true:
282 * <ul><li>the values array is null</li>
283 * <li>the weights array is null</li>
284 * <li>the weights array does not have the same length as the values array</li>
285 * <li>the weights array contains one or more infinite values</li>
286 * <li>the weights array contains one or more NaN values</li>
287 * <li>the weights array contains negative values</li>
288 * <li>the start and length arguments do not determine a valid array</li>
289 * </ul></p>
290 * <p>
291 * Does not change the internal state of the statistic.</p>
292 * <p>
293 * Throws <code>IllegalArgumentException</code> if either array is null.</p>
294 *
295 * @param values the input array
296 * @param weights the weights array
297 * @param begin index of the first array element to include
298 * @param length the number of elements to include
299 * @return the weighted variance of the values or Double.NaN if length = 0
300 * @throws IllegalArgumentException if the parameters are not valid
301 * @since 2.1
302 */
303 public double evaluate(final double[] values, final double[] weights,
304 final int begin, final int length) {
305
306 double var = Double.NaN;
307
308 if (test(values, weights,begin, length)) {
309 clear();
310 if (length == 1) {
311 var = 0.0;
312 } else if (length > 1) {
313 Mean mean = new Mean();
314 double m = mean.evaluate(values, weights, begin, length);
315 var = evaluate(values, weights, m, begin, length);
316 }
317 }
318 return var;
319 }
320
321 /**
322 * <p>
323 * Returns the weighted variance of the entries in the the input array.</p>
324 * <p>
325 * Uses the formula <pre>
326 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
327 * </pre>
328 * where weightedMean is the weighted mean</p>
329 * <p>
330 * This formula will not return the same result as the unweighted variance when all
331 * weights are equal, unless all weights are equal to 1. The formula assumes that
332 * weights are to be treated as "expansion values," as will be the case if for example
333 * the weights represent frequency counts. To normalize weights so that the denominator
334 * in the variance computation equals the length of the input vector minus one, use <pre>
335 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
336 * </pre>
337 * <p>
338 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
339 * <p>
340 * Throws <code>IllegalArgumentException</code> if any of the following are true:
341 * <ul><li>the values array is null</li>
342 * <li>the weights array is null</li>
343 * <li>the weights array does not have the same length as the values array</li>
344 * <li>the weights array contains one or more infinite values</li>
345 * <li>the weights array contains one or more NaN values</li>
346 * <li>the weights array contains negative values</li>
347 * </ul></p>
348 * <p>
349 * Does not change the internal state of the statistic.</p>
350 * <p>
351 * Throws <code>IllegalArgumentException</code> if either array is null.</p>
352 *
353 * @param values the input array
354 * @param weights the weights array
355 * @return the weighted variance of the values
356 * @throws IllegalArgumentException if the parameters are not valid
357 * @since 2.1
358 */
359 public double evaluate(final double[] values, final double[] weights) {
360 return evaluate(values, weights, 0, values.length);
361 }
362
363 /**
364 * Returns the variance of the entries in the specified portion of
365 * the input array, using the precomputed mean value. Returns
366 * <code>Double.NaN</code> if the designated subarray is empty.
367 * <p>
368 * See {@link Variance} for details on the computing algorithm.</p>
369 * <p>
370 * The formula used assumes that the supplied mean value is the arithmetic
371 * mean of the sample data, not a known population parameter. This method
372 * is supplied only to save computation when the mean has already been
373 * computed.</p>
374 * <p>
375 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
376 * <p>
377 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
378 * <p>
379 * Does not change the internal state of the statistic.</p>
380 *
381 * @param values the input array
382 * @param mean the precomputed mean value
383 * @param begin index of the first array element to include
384 * @param length the number of elements to include
385 * @return the variance of the values or Double.NaN if length = 0
386 * @throws IllegalArgumentException if the array is null or the array index
387 * parameters are not valid
388 */
389 public double evaluate(final double[] values, final double mean,
390 final int begin, final int length) {
391
392 double var = Double.NaN;
393
394 if (test(values, begin, length)) {
395 if (length == 1) {
396 var = 0.0;
397 } else if (length > 1) {
398 double accum = 0.0;
399 double dev = 0.0;
400 double accum2 = 0.0;
401 for (int i = begin; i < begin + length; i++) {
402 dev = values[i] - mean;
403 accum += dev * dev;
404 accum2 += dev;
405 }
406 double len = length;
407 if (isBiasCorrected) {
408 var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
409 } else {
410 var = (accum - (accum2 * accum2 / len)) / len;
411 }
412 }
413 }
414 return var;
415 }
416
417 /**
418 * Returns the variance of the entries in the input array, using the
419 * precomputed mean value. Returns <code>Double.NaN</code> if the array
420 * is empty.
421 * <p>
422 * See {@link Variance} for details on the computing algorithm.</p>
423 * <p>
424 * If <code>isBiasCorrected</code> is <code>true</code> the formula used
425 * assumes that the supplied mean value is the arithmetic mean of the
426 * sample data, not a known population parameter. If the mean is a known
427 * population parameter, or if the "population" version of the variance is
428 * desired, set <code>isBiasCorrected</code> to <code>false</code> before
429 * invoking this method.</p>
430 * <p>
431 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
432 * <p>
433 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
434 * <p>
435 * Does not change the internal state of the statistic.</p>
436 *
437 * @param values the input array
438 * @param mean the precomputed mean value
439 * @return the variance of the values or Double.NaN if the array is empty
440 * @throws IllegalArgumentException if the array is null
441 */
442 public double evaluate(final double[] values, final double mean) {
443 return evaluate(values, mean, 0, values.length);
444 }
445
446 /**
447 * Returns the weighted variance of the entries in the specified portion of
448 * the input array, using the precomputed weighted mean value. Returns
449 * <code>Double.NaN</code> if the designated subarray is empty.
450 * <p>
451 * Uses the formula <pre>
452 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
453 * </pre></p>
454 * <p>
455 * The formula used assumes that the supplied mean value is the weighted arithmetic
456 * mean of the sample data, not a known population parameter. This method
457 * is supplied only to save computation when the mean has already been
458 * computed.</p>
459 * <p>
460 * This formula will not return the same result as the unweighted variance when all
461 * weights are equal, unless all weights are equal to 1. The formula assumes that
462 * weights are to be treated as "expansion values," as will be the case if for example
463 * the weights represent frequency counts. To normalize weights so that the denominator
464 * in the variance computation equals the length of the input vector minus one, use <pre>
465 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
466 * </pre>
467 * <p>
468 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
469 * <p>
470 * Throws <code>IllegalArgumentException</code> if any of the following are true:
471 * <ul><li>the values array is null</li>
472 * <li>the weights array is null</li>
473 * <li>the weights array does not have the same length as the values array</li>
474 * <li>the weights array contains one or more infinite values</li>
475 * <li>the weights array contains one or more NaN values</li>
476 * <li>the weights array contains negative values</li>
477 * <li>the start and length arguments do not determine a valid array</li>
478 * </ul></p>
479 * <p>
480 * Does not change the internal state of the statistic.</p>
481 *
482 * @param values the input array
483 * @param weights the weights array
484 * @param mean the precomputed weighted mean value
485 * @param begin index of the first array element to include
486 * @param length the number of elements to include
487 * @return the variance of the values or Double.NaN if length = 0
488 * @throws IllegalArgumentException if the parameters are not valid
489 * @since 2.1
490 */
491 public double evaluate(final double[] values, final double[] weights,
492 final double mean, final int begin, final int length) {
493
494 double var = Double.NaN;
495
496 if (test(values, weights, begin, length)) {
497 if (length == 1) {
498 var = 0.0;
499 } else if (length > 1) {
500 double accum = 0.0;
501 double dev = 0.0;
502 double accum2 = 0.0;
503 for (int i = begin; i < begin + length; i++) {
504 dev = values[i] - mean;
505 accum += weights[i] * (dev * dev);
506 accum2 += weights[i] * dev;
507 }
508
509 double sumWts = 0;
510 for (int i = 0; i < weights.length; i++) {
511 sumWts += weights[i];
512 }
513
514 if (isBiasCorrected) {
515 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
516 } else {
517 var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
518 }
519 }
520 }
521 return var;
522 }
523
524 /**
525 * <p>Returns the weighted variance of the values in the input array, using
526 * the precomputed weighted mean value.</p>
527 * <p>
528 * Uses the formula <pre>
529 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
530 * </pre></p>
531 * <p>
532 * The formula used assumes that the supplied mean value is the weighted arithmetic
533 * mean of the sample data, not a known population parameter. This method
534 * is supplied only to save computation when the mean has already been
535 * computed.</p>
536 * <p>
537 * This formula will not return the same result as the unweighted variance when all
538 * weights are equal, unless all weights are equal to 1. The formula assumes that
539 * weights are to be treated as "expansion values," as will be the case if for example
540 * the weights represent frequency counts. To normalize weights so that the denominator
541 * in the variance computation equals the length of the input vector minus one, use <pre>
542 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
543 * </pre>
544 * <p>
545 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
546 * <p>
547 * Throws <code>IllegalArgumentException</code> if any of the following are true:
548 * <ul><li>the values array is null</li>
549 * <li>the weights array is null</li>
550 * <li>the weights array does not have the same length as the values array</li>
551 * <li>the weights array contains one or more infinite values</li>
552 * <li>the weights array contains one or more NaN values</li>
553 * <li>the weights array contains negative values</li>
554 * </ul></p>
555 * <p>
556 * Does not change the internal state of the statistic.</p>
557 *
558 * @param values the input array
559 * @param weights the weights array
560 * @param mean the precomputed weighted mean value
561 * @return the variance of the values or Double.NaN if length = 0
562 * @throws IllegalArgumentException if the parameters are not valid
563 * @since 2.1
564 */
565 public double evaluate(final double[] values, final double[] weights, final double mean) {
566 return evaluate(values, weights, mean, 0, values.length);
567 }
568
569 /**
570 * @return Returns the isBiasCorrected.
571 */
572 public boolean isBiasCorrected() {
573 return isBiasCorrected;
574 }
575
576 /**
577 * @param biasCorrected The isBiasCorrected to set.
578 */
579 public void setBiasCorrected(boolean biasCorrected) {
580 this.isBiasCorrected = biasCorrected;
581 }
582
583 /**
584 * {@inheritDoc}
585 */
586 @Override
587 public Variance copy() {
588 Variance result = new Variance();
589 copy(this, result);
590 return result;
591 }
592
593 /**
594 * Copies source to dest.
595 * <p>Neither source nor dest can be null.</p>
596 *
597 * @param source Variance to copy
598 * @param dest Variance to copy to
599 * @throws NullArgumentException if either source or dest is null
600 */
601 public static void copy(Variance source, Variance dest)
602 throws NullArgumentException {
603 MathUtils.checkNotNull(source);
604 MathUtils.checkNotNull(dest);
605 dest.setData(source.getDataRef());
606 dest.moment = source.moment.copy();
607 dest.isBiasCorrected = source.isBiasCorrected;
608 dest.incMoment = source.incMoment;
609 }
610 }