001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.descriptive; 018 019/** 020 * Computes the variance of the available values. The default implementation uses the 021 * following definition of the <em>sample variance</em>: 022 * 023 * <p>\[ \tfrac{1}{n-1} \sum_{i=1}^n (x_i-\overline{x})^2 \] 024 * 025 * <p>where \( \overline{x} \) is the sample mean, and \( n \) is the number of samples. 026 * 027 * <ul> 028 * <li>The result is {@code NaN} if no values are added. 029 * <li>The result is {@code NaN} if any of the values is {@code NaN} or infinite. 030 * <li>The result is {@code NaN} if the sum of the squared deviations from the mean is infinite. 031 * <li>The result is zero if there is one finite value in the data set. 032 * </ul> 033 * 034 * <p>The use of the term \( n − 1 \) is called Bessel's correction. This is an unbiased 035 * estimator of the variance of a hypothetical infinite population. If the 036 * {@link #setBiased(boolean) biased} option is enabled the normalisation factor is 037 * changed to \( \frac{1}{n} \) for a biased estimator of the <em>sample variance</em>. 038 * 039 * <p>The {@link #accept(double)} method uses a recursive updating algorithm based on West's 040 * algorithm (see Chan and Lewis (1979)). 041 * 042 * <p>The {@link #of(double...)} method uses the corrected two-pass algorithm from 043 * Chan <i>et al</i>, (1983). 044 * 045 * <p>Note that adding values using {@link #accept(double) accept} and then executing 046 * {@link #getAsDouble() getAsDouble} will 047 * sometimes give a different, less accurate, result than executing 048 * {@link #of(double...) of} with the full array of values. The former approach 049 * should only be used when the full array of values is not available. 050 * 051 * <p>Supports up to 2<sup>63</sup> (exclusive) observations. 052 * This implementation does not check for overflow of the count. 053 * 054 * <p>This class is designed to work with (though does not require) 055 * {@linkplain java.util.stream streams}. 056 * 057 * <p><strong>Note that this instance is not synchronized.</strong> If 058 * multiple threads access an instance of this class concurrently, and at least 059 * one of the threads invokes the {@link java.util.function.DoubleConsumer#accept(double) accept} or 060 * {@link StatisticAccumulator#combine(StatisticResult) combine} method, it must be synchronized externally. 061 * 062 * <p>However, it is safe to use {@link java.util.function.DoubleConsumer#accept(double) accept} 063 * and {@link StatisticAccumulator#combine(StatisticResult) combine} 064 * as {@code accumulator} and {@code combiner} functions of 065 * {@link java.util.stream.Collector Collector} on a parallel stream, 066 * because the parallel instance of {@link java.util.stream.Stream#collect Stream.collect()} 067 * provides the necessary partitioning, isolation, and merging of results for 068 * safe and efficient parallel execution. 069 * 070 * <p>References: 071 * <ul> 072 * <li>Chan and Lewis (1979) 073 * Computing standard deviations: accuracy. 074 * Communications of the ACM, 22, 526-531. 075 * <a href="http://doi.acm.org/10.1145/359146.359152">doi: 10.1145/359146.359152</a> 076 * <li>Chan, Golub and Levesque (1983) 077 * Algorithms for Computing the Sample Variance: Analysis and Recommendations. 078 * American Statistician, 37, 242-247. 079 * <a href="https://doi.org/10.2307/2683386">doi: 10.2307/2683386</a> 080 * </ul> 081 * 082 * @see <a href="https://en.wikipedia.org/wiki/Variance">Variance (Wikipedia)</a> 083 * @see <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a> 084 * @see StandardDeviation 085 * @since 1.1 086 */ 087public final class Variance implements DoubleStatistic, StatisticAccumulator<Variance> { 088 089 /** 090 * An instance of {@link SumOfSquaredDeviations}, which is used to 091 * compute the variance. 092 */ 093 private final SumOfSquaredDeviations ss; 094 095 /** Flag to control if the statistic is biased, or should use a bias correction. */ 096 private boolean biased; 097 098 /** 099 * Create an instance. 100 */ 101 private Variance() { 102 this(new SumOfSquaredDeviations()); 103 } 104 105 /** 106 * Creates an instance with the sum of squared deviations from the mean. 107 * 108 * @param ss Sum of squared deviations. 109 */ 110 Variance(SumOfSquaredDeviations ss) { 111 this.ss = ss; 112 } 113 114 /** 115 * Creates an instance. 116 * 117 * <p>The initial result is {@code NaN}. 118 * 119 * @return {@code Variance} instance. 120 */ 121 public static Variance create() { 122 return new Variance(); 123 } 124 125 /** 126 * Returns an instance populated using the input {@code values}. 127 * 128 * <p>Note: {@code Variance} computed using {@link #accept(double) accept} may be 129 * different from this variance. 130 * 131 * <p>See {@link Variance} for details on the computing algorithm. 132 * 133 * @param values Values. 134 * @return {@code Variance} instance. 135 */ 136 public static Variance of(double... values) { 137 return new Variance(SumOfSquaredDeviations.of(values)); 138 } 139 140 /** 141 * Returns an instance populated using the specified range of {@code values}. 142 * 143 * <p>Note: {@code Variance} computed using {@link #accept(double) accept} may be 144 * different from this variance. 145 * 146 * <p>See {@link Variance} for details on the computing algorithm. 147 * 148 * @param values Values. 149 * @param from Inclusive start of the range. 150 * @param to Exclusive end of the range. 151 * @return {@code Variance} instance. 152 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 153 * @since 1.2 154 */ 155 public static Variance ofRange(double[] values, int from, int to) { 156 Statistics.checkFromToIndex(from, to, values.length); 157 return new Variance(SumOfSquaredDeviations.ofRange(values, from, to)); 158 } 159 160 /** 161 * Updates the state of the statistic to reflect the addition of {@code value}. 162 * 163 * @param value Value. 164 */ 165 @Override 166 public void accept(double value) { 167 ss.accept(value); 168 } 169 170 /** 171 * Gets the variance of all input values. 172 * 173 * <p>When no values have been added, the result is {@code NaN}. 174 * 175 * @return variance of all values. 176 */ 177 @Override 178 public double getAsDouble() { 179 // This method checks the sum of squared is finite 180 // to provide a consistent NaN when the computation is not possible. 181 // Note: The SS checks for n=0 and returns NaN. 182 final double m2 = ss.getSumOfSquaredDeviations(); 183 if (!Double.isFinite(m2)) { 184 return Double.NaN; 185 } 186 final long n = ss.n; 187 // Avoid a divide by zero 188 if (n == 1) { 189 return 0; 190 } 191 return biased ? m2 / n : m2 / (n - 1); 192 } 193 194 @Override 195 public Variance combine(Variance other) { 196 ss.combine(other.ss); 197 return this; 198 } 199 200 /** 201 * Sets the value of the biased flag. The default value is {@code false}. 202 * 203 * <p>If {@code false} the sum of squared deviations from the sample mean is normalised by 204 * {@code n - 1} where {@code n} is the number of samples. This is Bessel's correction 205 * for an unbiased estimator of the variance of a hypothetical infinite population. 206 * 207 * <p>If {@code true} the sum of squared deviations is normalised by the number of samples 208 * {@code n}. 209 * 210 * <p>Note: This option only applies when {@code n > 1}. The variance of {@code n = 1} is 211 * always 0. 212 * 213 * <p>This flag only controls the final computation of the statistic. The value of this flag 214 * will not affect compatibility between instances during a {@link #combine(Variance) combine} 215 * operation. 216 * 217 * @param v Value. 218 * @return {@code this} instance 219 */ 220 public Variance setBiased(boolean v) { 221 biased = v; 222 return this; 223 } 224}