001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.descriptive; 018 019import java.math.BigInteger; 020 021/** 022 * Computes the variance of the available values. The default implementation uses the 023 * following definition of the <em>sample variance</em>: 024 * 025 * <p>\[ \tfrac{1}{n-1} \sum_{i=1}^n (x_i-\overline{x})^2 \] 026 * 027 * <p>where \( \overline{x} \) is the sample mean, and \( n \) is the number of samples. 028 * 029 * <ul> 030 * <li>The result is {@code NaN} if no values are added. 031 * <li>The result is zero if there is one value in the data set. 032 * </ul> 033 * 034 * <p>The use of the term \( n − 1 \) is called Bessel's correction. This is an unbiased 035 * estimator of the variance of a hypothetical infinite population. If the 036 * {@link #setBiased(boolean) biased} option is enabled the normalisation factor is 037 * changed to \( \frac{1}{n} \) for a biased estimator of the <em>sample variance</em>. 038 * 039 * <p>The implementation uses an exact integer sum to compute the scaled (by \( n \)) 040 * sum of squared deviations from the mean; this is normalised by the scaled correction factor. 041 * 042 * <p>\[ \frac {n \times \sum_{i=1}^n x_i^2 - (\sum_{i=1}^n x_i)^2}{n \times (n - 1)} \] 043 * 044 * <p>Supports up to 2<sup>63</sup> (exclusive) observations. 045 * This implementation does not check for overflow of the count. 046 * 047 * <p>This class is designed to work with (though does not require) 048 * {@linkplain java.util.stream streams}. 049 * 050 * <p><strong>This implementation is not thread safe.</strong> 051 * If multiple threads access an instance of this class concurrently, 052 * and at least one of the threads invokes the {@link java.util.function.IntConsumer#accept(int) accept} or 053 * {@link StatisticAccumulator#combine(StatisticResult) combine} method, it must be synchronized externally. 054 * 055 * <p>However, it is safe to use {@link java.util.function.IntConsumer#accept(int) accept} 056 * and {@link StatisticAccumulator#combine(StatisticResult) combine} 057 * as {@code accumulator} and {@code combiner} functions of 058 * {@link java.util.stream.Collector Collector} on a parallel stream, 059 * because the parallel implementation of {@link java.util.stream.Stream#collect Stream.collect()} 060 * provides the necessary partitioning, isolation, and merging of results for 061 * safe and efficient parallel execution. 062 * 063 * @see <a href="https://en.wikipedia.org/wiki/variance">variance (Wikipedia)</a> 064 * @see <a href="https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance"> 065 * Algorithms for computing the variance (Wikipedia)</a> 066 * @see <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a> 067 * @since 1.1 068 */ 069public final class IntVariance implements IntStatistic, StatisticAccumulator<IntVariance> { 070 /** Small array sample size. 071 * Used to avoid computing with UInt96 then converting to UInt128. */ 072 static final int SMALL_SAMPLE = 10; 073 074 /** Sum of the squared values. */ 075 private final UInt128 sumSq; 076 /** Sum of the values. */ 077 private final Int128 sum; 078 /** Count of values that have been added. */ 079 private long n; 080 081 /** Flag to control if the statistic is biased, or should use a bias correction. */ 082 private boolean biased; 083 084 /** 085 * Create an instance. 086 */ 087 private IntVariance() { 088 this(UInt128.create(), Int128.create(), 0); 089 } 090 091 /** 092 * Create an instance. 093 * 094 * @param sumSq Sum of the squared values. 095 * @param sum Sum of the values. 096 * @param n Count of values that have been added. 097 */ 098 private IntVariance(UInt128 sumSq, Int128 sum, int n) { 099 this.sumSq = sumSq; 100 this.sum = sum; 101 this.n = n; 102 } 103 104 /** 105 * Creates an instance. 106 * 107 * <p>The initial result is {@code NaN}. 108 * 109 * @return {@code IntVariance} instance. 110 */ 111 public static IntVariance create() { 112 return new IntVariance(); 113 } 114 115 /** 116 * Returns an instance populated using the input {@code values}. 117 * 118 * @param values Values. 119 * @return {@code IntVariance} instance. 120 */ 121 public static IntVariance of(int... values) { 122 return createFromRange(values, 0, values.length); 123 } 124 125 /** 126 * Returns an instance populated using the specified range of {@code values}. 127 * 128 * @param values Values. 129 * @param from Inclusive start of the range. 130 * @param to Exclusive end of the range. 131 * @return {@code IntVariance} instance. 132 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 133 * @since 1.2 134 */ 135 public static IntVariance ofRange(int[] values, int from, int to) { 136 Statistics.checkFromToIndex(from, to, values.length); 137 return createFromRange(values, from, to); 138 } 139 140 /** 141 * Create an instance using the specified range of {@code values}. 142 * 143 * <p>Warning: No range checks are performed. 144 * 145 * @param values Values. 146 * @param from Inclusive start of the range. 147 * @param to Exclusive end of the range. 148 * @return {@code IntVariance} instance. 149 */ 150 static IntVariance createFromRange(int[] values, int from, int to) { 151 // Small arrays can be processed using the object 152 final int length = to - from; 153 if (length < SMALL_SAMPLE) { 154 final IntVariance stat = new IntVariance(); 155 for (int i = from; i < to; i++) { 156 stat.accept(values[i]); 157 } 158 return stat; 159 } 160 161 // Arrays can be processed using specialised counts knowing the maximum limit 162 // for an array is 2^31 values. 163 long s = 0; 164 final UInt96 ss = UInt96.create(); 165 // Process pairs as we know two maximum value int^2 will not overflow 166 // an unsigned long. 167 final int end = from + (length & ~0x1); 168 for (int i = from; i < end; i += 2) { 169 final long x = values[i]; 170 final long y = values[i + 1]; 171 s += x + y; 172 ss.addPositive(x * x + y * y); 173 } 174 if (end < to) { 175 final long x = values[end]; 176 s += x; 177 ss.addPositive(x * x); 178 } 179 180 // Convert 181 return new IntVariance(UInt128.of(ss), Int128.of(s), length); 182 } 183 184 /** 185 * Updates the state of the statistic to reflect the addition of {@code value}. 186 * 187 * @param value Value. 188 */ 189 @Override 190 public void accept(int value) { 191 sumSq.addPositive((long) value * value); 192 sum.add(value); 193 n++; 194 } 195 196 /** 197 * Gets the variance of all input values. 198 * 199 * <p>When no values have been added, the result is {@code NaN}. 200 * 201 * @return variance of all values. 202 */ 203 @Override 204 public double getAsDouble() { 205 return computeVarianceOrStd(sumSq, sum, n, biased, false); 206 } 207 208 /** 209 * Compute the variance (or standard deviation). 210 * 211 * <p>The {@code std} flag controls if the result is returned as the standard deviation 212 * using the {@link Math#sqrt(double) square root} function. 213 * 214 * @param sumSq Sum of the squared values. 215 * @param sum Sum of the values. 216 * @param n Count of values that have been added. 217 * @param biased Flag to control if the statistic is biased, or should use a bias correction. 218 * @param std Flag to control if the statistic is the standard deviation. 219 * @return the variance (or standard deviation) 220 */ 221 static double computeVarianceOrStd(UInt128 sumSq, Int128 sum, long n, boolean biased, boolean std) { 222 if (n == 0) { 223 return Double.NaN; 224 } 225 // Avoid a divide by zero 226 if (n == 1) { 227 return 0; 228 } 229 // Sum-of-squared deviations: sum(x^2) - sum(x)^2 / n 230 // Sum-of-squared deviations precursor: n * sum(x^2) - sum(x)^2 231 // The precursor is computed in integer precision. 232 // The divide uses double precision. 233 // This ensures we avoid cancellation in the difference and use a fast divide. 234 // The result is limited to by the rounding in the double computation. 235 final double diff = computeSSDevN(sumSq, sum, n); 236 final long n0 = biased ? n : n - 1; 237 final double v = diff / IntMath.unsignedMultiplyToDouble(n, n0); 238 if (std) { 239 return Math.sqrt(v); 240 } 241 return v; 242 } 243 244 /** 245 * Compute the sum-of-squared deviations multiplied by the count of values: 246 * {@code n * sum(x^2) - sum(x)^2}. 247 * 248 * @param sumSq Sum of the squared values. 249 * @param sum Sum of the values. 250 * @param n Count of values that have been added. 251 * @return the sum-of-squared deviations precursor 252 */ 253 private static double computeSSDevN(UInt128 sumSq, Int128 sum, long n) { 254 // Compute the term if possible using fast integer arithmetic. 255 // 128-bit sum(x^2) * n will be OK when the upper 32-bits are zero. 256 // 128-bit sum(x)^2 will be OK when the upper 64-bits are zero. 257 // Both are safe when n < 2^32. 258 if ((n >>> Integer.SIZE) == 0) { 259 return sumSq.unsignedMultiply((int) n).subtract(sum.squareLow()).toDouble(); 260 } else { 261 return sumSq.toBigInteger().multiply(BigInteger.valueOf(n)) 262 .subtract(square(sum.toBigInteger())).doubleValue(); 263 } 264 } 265 266 /** 267 * Compute the sum of the squared deviations from the mean. 268 * 269 * <p>This is a helper method used in higher order moments. 270 * 271 * @return the sum of the squared deviations 272 */ 273 double computeSumOfSquaredDeviations() { 274 return computeSSDevN(sumSq, sum, n) / n; 275 } 276 277 /** 278 * Compute the mean. 279 * 280 * <p>This is a helper method used in higher order moments. 281 * 282 * @return the mean 283 */ 284 double computeMean() { 285 return IntMean.computeMean(sum, n); 286 } 287 288 /** 289 * Convenience method to square a BigInteger. 290 * 291 * @param x Value 292 * @return x^2 293 */ 294 private static BigInteger square(BigInteger x) { 295 return x.multiply(x); 296 } 297 298 @Override 299 public IntVariance combine(IntVariance other) { 300 sumSq.add(other.sumSq); 301 sum.add(other.sum); 302 n += other.n; 303 return this; 304 } 305 306 /** 307 * Sets the value of the biased flag. The default value is {@code false}. 308 * 309 * <p>If {@code false} the sum of squared deviations from the sample mean is normalised by 310 * {@code n - 1} where {@code n} is the number of samples. This is Bessel's correction 311 * for an unbiased estimator of the variance of a hypothetical infinite population. 312 * 313 * <p>If {@code true} the sum of squared deviations is normalised by the number of samples 314 * {@code n}. 315 * 316 * <p>Note: This option only applies when {@code n > 1}. The variance of {@code n = 1} is 317 * always 0. 318 * 319 * <p>This flag only controls the final computation of the statistic. The value of this flag 320 * will not affect compatibility between instances during a {@link #combine(IntVariance) combine} 321 * operation. 322 * 323 * @param v Value. 324 * @return {@code this} instance 325 */ 326 public IntVariance setBiased(boolean v) { 327 biased = v; 328 return this; 329 } 330}