001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.math3.stat.descriptive.moment; 018 019import java.io.Serializable; 020 021import org.apache.commons.math3.exception.MathIllegalArgumentException; 022import org.apache.commons.math3.exception.NullArgumentException; 023import org.apache.commons.math3.exception.util.LocalizedFormats; 024import org.apache.commons.math3.stat.descriptive.WeightedEvaluation; 025import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic; 026import org.apache.commons.math3.util.MathUtils; 027 028/** 029 * Computes the variance of the available values. By default, the unbiased 030 * "sample variance" definitional formula is used: 031 * <p> 032 * variance = sum((x_i - mean)^2) / (n - 1) </p> 033 * <p> 034 * where mean is the {@link Mean} and <code>n</code> is the number 035 * of sample observations.</p> 036 * <p> 037 * The definitional formula does not have good numerical properties, so 038 * this implementation does not compute the statistic using the definitional 039 * formula. <ul> 040 * <li> The <code>getResult</code> method computes the variance using 041 * updating formulas based on West's algorithm, as described in 042 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and 043 * J. G. Lewis 1979, <i>Communications of the ACM</i>, 044 * vol. 22 no. 9, pp. 526-531.</a></li> 045 * <li> The <code>evaluate</code> methods leverage the fact that they have the 046 * full array of values in memory to execute a two-pass algorithm. 047 * Specifically, these methods use the "corrected two-pass algorithm" from 048 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>, 049 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul> 050 * Note that adding values using <code>increment</code> or 051 * <code>incrementAll</code> and then executing <code>getResult</code> will 052 * sometimes give a different, less accurate, result than executing 053 * <code>evaluate</code> with the full array of values. The former approach 054 * should only be used when the full array of values is not available.</p> 055 * <p> 056 * The "population variance" ( sum((x_i - mean)^2) / n ) can also 057 * be computed using this statistic. The <code>isBiasCorrected</code> 058 * property determines whether the "population" or "sample" value is 059 * returned by the <code>evaluate</code> and <code>getResult</code> methods. 060 * To compute population variances, set this property to <code>false.</code> 061 * </p> 062 * <p> 063 * <strong>Note that this implementation is not synchronized.</strong> If 064 * multiple threads access an instance of this class concurrently, and at least 065 * one of the threads invokes the <code>increment()</code> or 066 * <code>clear()</code> method, it must be synchronized externally.</p> 067 * 068 */ 069public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation { 070 071 /** Serializable version identifier */ 072 private static final long serialVersionUID = -9111962718267217978L; 073 074 /** SecondMoment is used in incremental calculation of Variance*/ 075 protected SecondMoment moment = null; 076 077 /** 078 * Whether or not {@link #increment(double)} should increment 079 * the internal second moment. When a Variance is constructed with an 080 * external SecondMoment as a constructor parameter, this property is 081 * set to false and increments must be applied to the second moment 082 * directly. 083 */ 084 protected boolean incMoment = true; 085 086 /** 087 * Whether or not bias correction is applied when computing the 088 * value of the statistic. True means that bias is corrected. See 089 * {@link Variance} for details on the formula. 090 */ 091 private boolean isBiasCorrected = true; 092 093 /** 094 * Constructs a Variance with default (true) <code>isBiasCorrected</code> 095 * property. 096 */ 097 public Variance() { 098 moment = new SecondMoment(); 099 } 100 101 /** 102 * Constructs a Variance based on an external second moment. 103 * When this constructor is used, the statistic may only be 104 * incremented via the moment, i.e., {@link #increment(double)} 105 * does nothing; whereas {@code m2.increment(value)} increments 106 * both {@code m2} and the Variance instance constructed from it. 107 * 108 * @param m2 the SecondMoment (Third or Fourth moments work 109 * here as well.) 110 */ 111 public Variance(final SecondMoment m2) { 112 incMoment = false; 113 this.moment = m2; 114 } 115 116 /** 117 * Constructs a Variance with the specified <code>isBiasCorrected</code> 118 * property 119 * 120 * @param isBiasCorrected setting for bias correction - true means 121 * bias will be corrected and is equivalent to using the argumentless 122 * constructor 123 */ 124 public Variance(boolean isBiasCorrected) { 125 moment = new SecondMoment(); 126 this.isBiasCorrected = isBiasCorrected; 127 } 128 129 /** 130 * Constructs a Variance with the specified <code>isBiasCorrected</code> 131 * property and the supplied external second moment. 132 * 133 * @param isBiasCorrected setting for bias correction - true means 134 * bias will be corrected 135 * @param m2 the SecondMoment (Third or Fourth moments work 136 * here as well.) 137 */ 138 public Variance(boolean isBiasCorrected, SecondMoment m2) { 139 incMoment = false; 140 this.moment = m2; 141 this.isBiasCorrected = isBiasCorrected; 142 } 143 144 /** 145 * Copy constructor, creates a new {@code Variance} identical 146 * to the {@code original} 147 * 148 * @param original the {@code Variance} instance to copy 149 * @throws NullArgumentException if original is null 150 */ 151 public Variance(Variance original) throws NullArgumentException { 152 copy(original, this); 153 } 154 155 /** 156 * {@inheritDoc} 157 * <p>If all values are available, it is more accurate to use 158 * {@link #evaluate(double[])} rather than adding values one at a time 159 * using this method and then executing {@link #getResult}, since 160 * <code>evaluate</code> leverages the fact that is has the full 161 * list of values together to execute a two-pass algorithm. 162 * See {@link Variance}.</p> 163 * 164 * <p>Note also that when {@link #Variance(SecondMoment)} is used to 165 * create a Variance, this method does nothing. In that case, the 166 * SecondMoment should be incremented directly.</p> 167 */ 168 @Override 169 public void increment(final double d) { 170 if (incMoment) { 171 moment.increment(d); 172 } 173 } 174 175 /** 176 * {@inheritDoc} 177 */ 178 @Override 179 public double getResult() { 180 if (moment.n == 0) { 181 return Double.NaN; 182 } else if (moment.n == 1) { 183 return 0d; 184 } else { 185 if (isBiasCorrected) { 186 return moment.m2 / (moment.n - 1d); 187 } else { 188 return moment.m2 / (moment.n); 189 } 190 } 191 } 192 193 /** 194 * {@inheritDoc} 195 */ 196 public long getN() { 197 return moment.getN(); 198 } 199 200 /** 201 * {@inheritDoc} 202 */ 203 @Override 204 public void clear() { 205 if (incMoment) { 206 moment.clear(); 207 } 208 } 209 210 /** 211 * Returns the variance of the entries in the input array, or 212 * <code>Double.NaN</code> if the array is empty. 213 * <p> 214 * See {@link Variance} for details on the computing algorithm.</p> 215 * <p> 216 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 217 * <p> 218 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 219 * <p> 220 * Does not change the internal state of the statistic.</p> 221 * 222 * @param values the input array 223 * @return the variance of the values or Double.NaN if length = 0 224 * @throws MathIllegalArgumentException if the array is null 225 */ 226 @Override 227 public double evaluate(final double[] values) throws MathIllegalArgumentException { 228 if (values == null) { 229 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY); 230 } 231 return evaluate(values, 0, values.length); 232 } 233 234 /** 235 * Returns the variance of the entries in the specified portion of 236 * the input array, or <code>Double.NaN</code> if the designated subarray 237 * is empty. Note that Double.NaN may also be returned if the input 238 * includes NaN and / or infinite values. 239 * <p> 240 * See {@link Variance} for details on the computing algorithm.</p> 241 * <p> 242 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 243 * <p> 244 * Does not change the internal state of the statistic.</p> 245 * <p> 246 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 247 * 248 * @param values the input array 249 * @param begin index of the first array element to include 250 * @param length the number of elements to include 251 * @return the variance of the values or Double.NaN if length = 0 252 * @throws MathIllegalArgumentException if the array is null or the array index 253 * parameters are not valid 254 */ 255 @Override 256 public double evaluate(final double[] values, final int begin, final int length) 257 throws MathIllegalArgumentException { 258 259 double var = Double.NaN; 260 261 if (test(values, begin, length)) { 262 clear(); 263 if (length == 1) { 264 var = 0.0; 265 } else if (length > 1) { 266 Mean mean = new Mean(); 267 double m = mean.evaluate(values, begin, length); 268 var = evaluate(values, m, begin, length); 269 } 270 } 271 return var; 272 } 273 274 /** 275 * <p>Returns the weighted variance of the entries in the specified portion of 276 * the input array, or <code>Double.NaN</code> if the designated subarray 277 * is empty.</p> 278 * <p> 279 * Uses the formula <pre> 280 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) 281 * </pre> 282 * where weightedMean is the weighted mean</p> 283 * <p> 284 * This formula will not return the same result as the unweighted variance when all 285 * weights are equal, unless all weights are equal to 1. The formula assumes that 286 * weights are to be treated as "expansion values," as will be the case if for example 287 * the weights represent frequency counts. To normalize weights so that the denominator 288 * in the variance computation equals the length of the input vector minus one, use <pre> 289 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code> 290 * </pre> 291 * <p> 292 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 293 * <p> 294 * Throws <code>IllegalArgumentException</code> if any of the following are true: 295 * <ul><li>the values array is null</li> 296 * <li>the weights array is null</li> 297 * <li>the weights array does not have the same length as the values array</li> 298 * <li>the weights array contains one or more infinite values</li> 299 * <li>the weights array contains one or more NaN values</li> 300 * <li>the weights array contains negative values</li> 301 * <li>the start and length arguments do not determine a valid array</li> 302 * </ul></p> 303 * <p> 304 * Does not change the internal state of the statistic.</p> 305 * <p> 306 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p> 307 * 308 * @param values the input array 309 * @param weights the weights array 310 * @param begin index of the first array element to include 311 * @param length the number of elements to include 312 * @return the weighted variance of the values or Double.NaN if length = 0 313 * @throws MathIllegalArgumentException if the parameters are not valid 314 * @since 2.1 315 */ 316 public double evaluate(final double[] values, final double[] weights, 317 final int begin, final int length) throws MathIllegalArgumentException { 318 319 double var = Double.NaN; 320 321 if (test(values, weights,begin, length)) { 322 clear(); 323 if (length == 1) { 324 var = 0.0; 325 } else if (length > 1) { 326 Mean mean = new Mean(); 327 double m = mean.evaluate(values, weights, begin, length); 328 var = evaluate(values, weights, m, begin, length); 329 } 330 } 331 return var; 332 } 333 334 /** 335 * <p> 336 * Returns the weighted variance of the entries in the the input array.</p> 337 * <p> 338 * Uses the formula <pre> 339 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1) 340 * </pre> 341 * where weightedMean is the weighted mean</p> 342 * <p> 343 * This formula will not return the same result as the unweighted variance when all 344 * weights are equal, unless all weights are equal to 1. The formula assumes that 345 * weights are to be treated as "expansion values," as will be the case if for example 346 * the weights represent frequency counts. To normalize weights so that the denominator 347 * in the variance computation equals the length of the input vector minus one, use <pre> 348 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code> 349 * </pre> 350 * <p> 351 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 352 * <p> 353 * Throws <code>MathIllegalArgumentException</code> if any of the following are true: 354 * <ul><li>the values array is null</li> 355 * <li>the weights array is null</li> 356 * <li>the weights array does not have the same length as the values array</li> 357 * <li>the weights array contains one or more infinite values</li> 358 * <li>the weights array contains one or more NaN values</li> 359 * <li>the weights array contains negative values</li> 360 * </ul></p> 361 * <p> 362 * Does not change the internal state of the statistic.</p> 363 * <p> 364 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p> 365 * 366 * @param values the input array 367 * @param weights the weights array 368 * @return the weighted variance of the values 369 * @throws MathIllegalArgumentException if the parameters are not valid 370 * @since 2.1 371 */ 372 public double evaluate(final double[] values, final double[] weights) 373 throws MathIllegalArgumentException { 374 return evaluate(values, weights, 0, values.length); 375 } 376 377 /** 378 * Returns the variance of the entries in the specified portion of 379 * the input array, using the precomputed mean value. Returns 380 * <code>Double.NaN</code> if the designated subarray is empty. 381 * <p> 382 * See {@link Variance} for details on the computing algorithm.</p> 383 * <p> 384 * The formula used assumes that the supplied mean value is the arithmetic 385 * mean of the sample data, not a known population parameter. This method 386 * is supplied only to save computation when the mean has already been 387 * computed.</p> 388 * <p> 389 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 390 * <p> 391 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 392 * <p> 393 * Does not change the internal state of the statistic.</p> 394 * 395 * @param values the input array 396 * @param mean the precomputed mean value 397 * @param begin index of the first array element to include 398 * @param length the number of elements to include 399 * @return the variance of the values or Double.NaN if length = 0 400 * @throws MathIllegalArgumentException if the array is null or the array index 401 * parameters are not valid 402 */ 403 public double evaluate(final double[] values, final double mean, 404 final int begin, final int length) throws MathIllegalArgumentException { 405 406 double var = Double.NaN; 407 408 if (test(values, begin, length)) { 409 if (length == 1) { 410 var = 0.0; 411 } else if (length > 1) { 412 double accum = 0.0; 413 double dev = 0.0; 414 double accum2 = 0.0; 415 for (int i = begin; i < begin + length; i++) { 416 dev = values[i] - mean; 417 accum += dev * dev; 418 accum2 += dev; 419 } 420 double len = length; 421 if (isBiasCorrected) { 422 var = (accum - (accum2 * accum2 / len)) / (len - 1.0); 423 } else { 424 var = (accum - (accum2 * accum2 / len)) / len; 425 } 426 } 427 } 428 return var; 429 } 430 431 /** 432 * Returns the variance of the entries in the input array, using the 433 * precomputed mean value. Returns <code>Double.NaN</code> if the array 434 * is empty. 435 * <p> 436 * See {@link Variance} for details on the computing algorithm.</p> 437 * <p> 438 * If <code>isBiasCorrected</code> is <code>true</code> the formula used 439 * assumes that the supplied mean value is the arithmetic mean of the 440 * sample data, not a known population parameter. If the mean is a known 441 * population parameter, or if the "population" version of the variance is 442 * desired, set <code>isBiasCorrected</code> to <code>false</code> before 443 * invoking this method.</p> 444 * <p> 445 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 446 * <p> 447 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p> 448 * <p> 449 * Does not change the internal state of the statistic.</p> 450 * 451 * @param values the input array 452 * @param mean the precomputed mean value 453 * @return the variance of the values or Double.NaN if the array is empty 454 * @throws MathIllegalArgumentException if the array is null 455 */ 456 public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException { 457 return evaluate(values, mean, 0, values.length); 458 } 459 460 /** 461 * Returns the weighted variance of the entries in the specified portion of 462 * the input array, using the precomputed weighted mean value. Returns 463 * <code>Double.NaN</code> if the designated subarray is empty. 464 * <p> 465 * Uses the formula <pre> 466 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) 467 * </pre></p> 468 * <p> 469 * The formula used assumes that the supplied mean value is the weighted arithmetic 470 * mean of the sample data, not a known population parameter. This method 471 * is supplied only to save computation when the mean has already been 472 * computed.</p> 473 * <p> 474 * This formula will not return the same result as the unweighted variance when all 475 * weights are equal, unless all weights are equal to 1. The formula assumes that 476 * weights are to be treated as "expansion values," as will be the case if for example 477 * the weights represent frequency counts. To normalize weights so that the denominator 478 * in the variance computation equals the length of the input vector minus one, use <pre> 479 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code> 480 * </pre> 481 * <p> 482 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 483 * <p> 484 * Throws <code>MathIllegalArgumentException</code> if any of the following are true: 485 * <ul><li>the values array is null</li> 486 * <li>the weights array is null</li> 487 * <li>the weights array does not have the same length as the values array</li> 488 * <li>the weights array contains one or more infinite values</li> 489 * <li>the weights array contains one or more NaN values</li> 490 * <li>the weights array contains negative values</li> 491 * <li>the start and length arguments do not determine a valid array</li> 492 * </ul></p> 493 * <p> 494 * Does not change the internal state of the statistic.</p> 495 * 496 * @param values the input array 497 * @param weights the weights array 498 * @param mean the precomputed weighted mean value 499 * @param begin index of the first array element to include 500 * @param length the number of elements to include 501 * @return the variance of the values or Double.NaN if length = 0 502 * @throws MathIllegalArgumentException if the parameters are not valid 503 * @since 2.1 504 */ 505 public double evaluate(final double[] values, final double[] weights, 506 final double mean, final int begin, final int length) 507 throws MathIllegalArgumentException { 508 509 double var = Double.NaN; 510 511 if (test(values, weights, begin, length)) { 512 if (length == 1) { 513 var = 0.0; 514 } else if (length > 1) { 515 double accum = 0.0; 516 double dev = 0.0; 517 double accum2 = 0.0; 518 for (int i = begin; i < begin + length; i++) { 519 dev = values[i] - mean; 520 accum += weights[i] * (dev * dev); 521 accum2 += weights[i] * dev; 522 } 523 524 double sumWts = 0; 525 for (int i = begin; i < begin + length; i++) { 526 sumWts += weights[i]; 527 } 528 529 if (isBiasCorrected) { 530 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0); 531 } else { 532 var = (accum - (accum2 * accum2 / sumWts)) / sumWts; 533 } 534 } 535 } 536 return var; 537 } 538 539 /** 540 * <p>Returns the weighted variance of the values in the input array, using 541 * the precomputed weighted mean value.</p> 542 * <p> 543 * Uses the formula <pre> 544 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1) 545 * </pre></p> 546 * <p> 547 * The formula used assumes that the supplied mean value is the weighted arithmetic 548 * mean of the sample data, not a known population parameter. This method 549 * is supplied only to save computation when the mean has already been 550 * computed.</p> 551 * <p> 552 * This formula will not return the same result as the unweighted variance when all 553 * weights are equal, unless all weights are equal to 1. The formula assumes that 554 * weights are to be treated as "expansion values," as will be the case if for example 555 * the weights represent frequency counts. To normalize weights so that the denominator 556 * in the variance computation equals the length of the input vector minus one, use <pre> 557 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code> 558 * </pre> 559 * <p> 560 * Returns 0 for a single-value (i.e. length = 1) sample.</p> 561 * <p> 562 * Throws <code>MathIllegalArgumentException</code> if any of the following are true: 563 * <ul><li>the values array is null</li> 564 * <li>the weights array is null</li> 565 * <li>the weights array does not have the same length as the values array</li> 566 * <li>the weights array contains one or more infinite values</li> 567 * <li>the weights array contains one or more NaN values</li> 568 * <li>the weights array contains negative values</li> 569 * </ul></p> 570 * <p> 571 * Does not change the internal state of the statistic.</p> 572 * 573 * @param values the input array 574 * @param weights the weights array 575 * @param mean the precomputed weighted mean value 576 * @return the variance of the values or Double.NaN if length = 0 577 * @throws MathIllegalArgumentException if the parameters are not valid 578 * @since 2.1 579 */ 580 public double evaluate(final double[] values, final double[] weights, final double mean) 581 throws MathIllegalArgumentException { 582 return evaluate(values, weights, mean, 0, values.length); 583 } 584 585 /** 586 * @return Returns the isBiasCorrected. 587 */ 588 public boolean isBiasCorrected() { 589 return isBiasCorrected; 590 } 591 592 /** 593 * @param biasCorrected The isBiasCorrected to set. 594 */ 595 public void setBiasCorrected(boolean biasCorrected) { 596 this.isBiasCorrected = biasCorrected; 597 } 598 599 /** 600 * {@inheritDoc} 601 */ 602 @Override 603 public Variance copy() { 604 Variance result = new Variance(); 605 // No try-catch or advertised exception because parameters are guaranteed non-null 606 copy(this, result); 607 return result; 608 } 609 610 /** 611 * Copies source to dest. 612 * <p>Neither source nor dest can be null.</p> 613 * 614 * @param source Variance to copy 615 * @param dest Variance to copy to 616 * @throws NullArgumentException if either source or dest is null 617 */ 618 public static void copy(Variance source, Variance dest) 619 throws NullArgumentException { 620 MathUtils.checkNotNull(source); 621 MathUtils.checkNotNull(dest); 622 dest.setData(source.getDataRef()); 623 dest.moment = source.moment.copy(); 624 dest.isBiasCorrected = source.isBiasCorrected; 625 dest.incMoment = source.incMoment; 626 } 627}