001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.math3.stat.descriptive.rank; 018 019 import java.io.Serializable; 020 import java.util.Arrays; 021 022 import org.apache.commons.math3.exception.MathIllegalArgumentException; 023 import org.apache.commons.math3.exception.NullArgumentException; 024 import org.apache.commons.math3.exception.OutOfRangeException; 025 import org.apache.commons.math3.exception.util.LocalizedFormats; 026 import org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic; 027 import org.apache.commons.math3.util.FastMath; 028 import org.apache.commons.math3.util.MathUtils; 029 030 /** 031 * Provides percentile computation. 032 * <p> 033 * There are several commonly used methods for estimating percentiles (a.k.a. 034 * quantiles) based on sample data. For large samples, the different methods 035 * agree closely, but when sample sizes are small, different methods will give 036 * significantly different results. The algorithm implemented here works as follows: 037 * <ol> 038 * <li>Let <code>n</code> be the length of the (sorted) array and 039 * <code>0 < p <= 100</code> be the desired percentile.</li> 040 * <li>If <code> n = 1 </code> return the unique array element (regardless of 041 * the value of <code>p</code>); otherwise </li> 042 * <li>Compute the estimated percentile position 043 * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code> 044 * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional 045 * part of <code>pos</code>).</li> 046 * <li> If <code>pos < 1</code> return the smallest element in the array.</li> 047 * <li> Else if <code>pos >= n</code> return the largest element in the array.</li> 048 * <li> Else let <code>lower</code> be the element in position 049 * <code>floor(pos)</code> in the array and let <code>upper</code> be the 050 * next element in the array. Return <code>lower + d * (upper - lower)</code> 051 * </li> 052 * </ol></p> 053 * <p> 054 * To compute percentiles, the data must be at least partially ordered. Input 055 * arrays are copied and recursively partitioned using an ordering definition. 056 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined 057 * by {@link java.lang.Double#compareTo(Double)}. This ordering makes 058 * <code>Double.NaN</code> larger than any other value (including 059 * <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median 060 * (50th percentile) of 061 * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p> 062 * <p> 063 * Since percentile estimation usually involves interpolation between array 064 * elements, arrays containing <code>NaN</code> or infinite values will often 065 * result in <code>NaN</code> or infinite values returned.</p> 066 * <p> 067 * Since 2.2, Percentile uses only selection instead of complete sorting 068 * and caches selection algorithm state between calls to the various 069 * {@code evaluate} methods. This greatly improves efficiency, both for a single 070 * percentile and multiple percentile computations. To maximize performance when 071 * multiple percentiles are computed based on the same data, users should set the 072 * data array once using either one of the {@link #evaluate(double[], double)} or 073 * {@link #setData(double[])} methods and thereafter {@link #evaluate(double)} 074 * with just the percentile provided. 075 * </p> 076 * <p> 077 * <strong>Note that this implementation is not synchronized.</strong> If 078 * multiple threads access an instance of this class concurrently, and at least 079 * one of the threads invokes the <code>increment()</code> or 080 * <code>clear()</code> method, it must be synchronized externally.</p> 081 * 082 * @version $Id: Percentile.java 1416643 2012-12-03 19:37:14Z tn $ 083 */ 084 public class Percentile extends AbstractUnivariateStatistic implements Serializable { 085 086 /** Serializable version identifier */ 087 private static final long serialVersionUID = -8091216485095130416L; 088 089 /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */ 090 private static final int MIN_SELECT_SIZE = 15; 091 092 /** Maximum number of partitioning pivots cached (each level double the number of pivots). */ 093 private static final int MAX_CACHED_LEVELS = 10; 094 095 /** Determines what percentile is computed when evaluate() is activated 096 * with no quantile argument */ 097 private double quantile = 0.0; 098 099 /** Cached pivots. */ 100 private int[] cachedPivots; 101 102 /** 103 * Constructs a Percentile with a default quantile 104 * value of 50.0. 105 */ 106 public Percentile() { 107 // No try-catch or advertised exception here - arg is valid 108 this(50.0); 109 } 110 111 /** 112 * Constructs a Percentile with the specific quantile value. 113 * @param p the quantile 114 * @throws MathIllegalArgumentException if p is not greater than 0 and less 115 * than or equal to 100 116 */ 117 public Percentile(final double p) throws MathIllegalArgumentException { 118 setQuantile(p); 119 cachedPivots = null; 120 } 121 122 /** 123 * Copy constructor, creates a new {@code Percentile} identical 124 * to the {@code original} 125 * 126 * @param original the {@code Percentile} instance to copy 127 * @throws NullArgumentException if original is null 128 */ 129 public Percentile(Percentile original) throws NullArgumentException { 130 copy(original, this); 131 } 132 133 /** {@inheritDoc} */ 134 @Override 135 public void setData(final double[] values) { 136 if (values == null) { 137 cachedPivots = null; 138 } else { 139 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; 140 Arrays.fill(cachedPivots, -1); 141 } 142 super.setData(values); 143 } 144 145 /** {@inheritDoc} */ 146 @Override 147 public void setData(final double[] values, final int begin, final int length) 148 throws MathIllegalArgumentException { 149 if (values == null) { 150 cachedPivots = null; 151 } else { 152 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; 153 Arrays.fill(cachedPivots, -1); 154 } 155 super.setData(values, begin, length); 156 } 157 158 /** 159 * Returns the result of evaluating the statistic over the stored data. 160 * <p> 161 * The stored array is the one which was set by previous calls to 162 * {@link #setData(double[])} 163 * </p> 164 * @param p the percentile value to compute 165 * @return the value of the statistic applied to the stored data 166 * @throws MathIllegalArgumentException if p is not a valid quantile value 167 * (p must be greater than 0 and less than or equal to 100) 168 */ 169 public double evaluate(final double p) throws MathIllegalArgumentException { 170 return evaluate(getDataRef(), p); 171 } 172 173 /** 174 * Returns an estimate of the <code>p</code>th percentile of the values 175 * in the <code>values</code> array. 176 * <p> 177 * Calls to this method do not modify the internal <code>quantile</code> 178 * state of this statistic.</p> 179 * <p> 180 * <ul> 181 * <li>Returns <code>Double.NaN</code> if <code>values</code> has length 182 * <code>0</code></li> 183 * <li>Returns (for any value of <code>p</code>) <code>values[0]</code> 184 * if <code>values</code> has length <code>1</code></li> 185 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code> 186 * is null or p is not a valid quantile value (p must be greater than 0 187 * and less than or equal to 100) </li> 188 * </ul></p> 189 * <p> 190 * See {@link Percentile} for a description of the percentile estimation 191 * algorithm used.</p> 192 * 193 * @param values input array of values 194 * @param p the percentile value to compute 195 * @return the percentile value or Double.NaN if the array is empty 196 * @throws MathIllegalArgumentException if <code>values</code> is null 197 * or p is invalid 198 */ 199 public double evaluate(final double[] values, final double p) 200 throws MathIllegalArgumentException { 201 test(values, 0, 0); 202 return evaluate(values, 0, values.length, p); 203 } 204 205 /** 206 * Returns an estimate of the <code>quantile</code>th percentile of the 207 * designated values in the <code>values</code> array. The quantile 208 * estimated is determined by the <code>quantile</code> property. 209 * <p> 210 * <ul> 211 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> 212 * <li>Returns (for any value of <code>quantile</code>) 213 * <code>values[begin]</code> if <code>length = 1 </code></li> 214 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code> 215 * is null, or <code>start</code> or <code>length</code> is invalid</li> 216 * </ul></p> 217 * <p> 218 * See {@link Percentile} for a description of the percentile estimation 219 * algorithm used.</p> 220 * 221 * @param values the input array 222 * @param start index of the first array element to include 223 * @param length the number of elements to include 224 * @return the percentile value 225 * @throws MathIllegalArgumentException if the parameters are not valid 226 * 227 */ 228 @Override 229 public double evaluate(final double[] values, final int start, final int length) 230 throws MathIllegalArgumentException { 231 return evaluate(values, start, length, quantile); 232 } 233 234 /** 235 * Returns an estimate of the <code>p</code>th percentile of the values 236 * in the <code>values</code> array, starting with the element in (0-based) 237 * position <code>begin</code> in the array and including <code>length</code> 238 * values. 239 * <p> 240 * Calls to this method do not modify the internal <code>quantile</code> 241 * state of this statistic.</p> 242 * <p> 243 * <ul> 244 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> 245 * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code> 246 * if <code>length = 1 </code></li> 247 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code> 248 * is null , <code>begin</code> or <code>length</code> is invalid, or 249 * <code>p</code> is not a valid quantile value (p must be greater than 0 250 * and less than or equal to 100)</li> 251 * </ul></p> 252 * <p> 253 * See {@link Percentile} for a description of the percentile estimation 254 * algorithm used.</p> 255 * 256 * @param values array of input values 257 * @param p the percentile to compute 258 * @param begin the first (0-based) element to include in the computation 259 * @param length the number of array elements to include 260 * @return the percentile value 261 * @throws MathIllegalArgumentException if the parameters are not valid or the 262 * input array is null 263 */ 264 public double evaluate(final double[] values, final int begin, 265 final int length, final double p) throws MathIllegalArgumentException { 266 267 test(values, begin, length); 268 269 if ((p > 100) || (p <= 0)) { 270 throw new OutOfRangeException( 271 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100); 272 } 273 if (length == 0) { 274 return Double.NaN; 275 } 276 if (length == 1) { 277 return values[begin]; // always return single value for n = 1 278 } 279 double n = length; 280 double pos = p * (n + 1) / 100; 281 double fpos = FastMath.floor(pos); 282 int intPos = (int) fpos; 283 double dif = pos - fpos; 284 double[] work; 285 int[] pivotsHeap; 286 if (values == getDataRef()) { 287 work = getDataRef(); 288 pivotsHeap = cachedPivots; 289 } else { 290 work = new double[length]; 291 System.arraycopy(values, begin, work, 0, length); 292 pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1]; 293 Arrays.fill(pivotsHeap, -1); 294 } 295 296 if (pos < 1) { 297 return select(work, pivotsHeap, 0); 298 } 299 if (pos >= n) { 300 return select(work, pivotsHeap, length - 1); 301 } 302 double lower = select(work, pivotsHeap, intPos - 1); 303 double upper = select(work, pivotsHeap, intPos); 304 return lower + dif * (upper - lower); 305 } 306 307 /** 308 * Select the k<sup>th</sup> smallest element from work array 309 * @param work work array (will be reorganized during the call) 310 * @param pivotsHeap set of pivot index corresponding to elements that 311 * are already at their sorted location, stored as an implicit heap 312 * (i.e. a sorted binary tree stored in a flat array, where the 313 * children of a node at index n are at indices 2n+1 for the left 314 * child and 2n+2 for the right child, with 0-based indices) 315 * @param k index of the desired element 316 * @return k<sup>th</sup> smallest element 317 */ 318 private double select(final double[] work, final int[] pivotsHeap, final int k) { 319 320 int begin = 0; 321 int end = work.length; 322 int node = 0; 323 324 while (end - begin > MIN_SELECT_SIZE) { 325 326 final int pivot; 327 if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) { 328 // the pivot has already been found in a previous call 329 // and the array has already been partitioned around it 330 pivot = pivotsHeap[node]; 331 } else { 332 // select a pivot and partition work array around it 333 pivot = partition(work, begin, end, medianOf3(work, begin, end)); 334 if (node < pivotsHeap.length) { 335 pivotsHeap[node] = pivot; 336 } 337 } 338 339 if (k == pivot) { 340 // the pivot was exactly the element we wanted 341 return work[k]; 342 } else if (k < pivot) { 343 // the element is in the left partition 344 end = pivot; 345 node = FastMath.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow 346 } else { 347 // the element is in the right partition 348 begin = pivot + 1; 349 node = FastMath.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow 350 } 351 352 } 353 354 // the element is somewhere in the small sub-array 355 // sort the sub-array using insertion sort 356 insertionSort(work, begin, end); 357 return work[k]; 358 359 } 360 361 /** Select a pivot index as the median of three 362 * @param work data array 363 * @param begin index of the first element of the slice 364 * @param end index after the last element of the slice 365 * @return the index of the median element chosen between the 366 * first, the middle and the last element of the array slice 367 */ 368 int medianOf3(final double[] work, final int begin, final int end) { 369 370 final int inclusiveEnd = end - 1; 371 final int middle = begin + (inclusiveEnd - begin) / 2; 372 final double wBegin = work[begin]; 373 final double wMiddle = work[middle]; 374 final double wEnd = work[inclusiveEnd]; 375 376 if (wBegin < wMiddle) { 377 if (wMiddle < wEnd) { 378 return middle; 379 } else { 380 return (wBegin < wEnd) ? inclusiveEnd : begin; 381 } 382 } else { 383 if (wBegin < wEnd) { 384 return begin; 385 } else { 386 return (wMiddle < wEnd) ? inclusiveEnd : middle; 387 } 388 } 389 390 } 391 392 /** 393 * Partition an array slice around a pivot 394 * <p> 395 * Partitioning exchanges array elements such that all elements 396 * smaller than pivot are before it and all elements larger than 397 * pivot are after it 398 * </p> 399 * @param work data array 400 * @param begin index of the first element of the slice 401 * @param end index after the last element of the slice 402 * @param pivot initial index of the pivot 403 * @return index of the pivot after partition 404 */ 405 private int partition(final double[] work, final int begin, final int end, final int pivot) { 406 407 final double value = work[pivot]; 408 work[pivot] = work[begin]; 409 410 int i = begin + 1; 411 int j = end - 1; 412 while (i < j) { 413 while ((i < j) && (work[j] > value)) { 414 --j; 415 } 416 while ((i < j) && (work[i] < value)) { 417 ++i; 418 } 419 420 if (i < j) { 421 final double tmp = work[i]; 422 work[i++] = work[j]; 423 work[j--] = tmp; 424 } 425 } 426 427 if ((i >= end) || (work[i] > value)) { 428 --i; 429 } 430 work[begin] = work[i]; 431 work[i] = value; 432 return i; 433 434 } 435 436 /** 437 * Sort in place a (small) array slice using insertion sort 438 * @param work array to sort 439 * @param begin index of the first element of the slice to sort 440 * @param end index after the last element of the slice to sort 441 */ 442 private void insertionSort(final double[] work, final int begin, final int end) { 443 for (int j = begin + 1; j < end; j++) { 444 final double saved = work[j]; 445 int i = j - 1; 446 while ((i >= begin) && (saved < work[i])) { 447 work[i + 1] = work[i]; 448 i--; 449 } 450 work[i + 1] = saved; 451 } 452 } 453 454 /** 455 * Returns the value of the quantile field (determines what percentile is 456 * computed when evaluate() is called with no quantile argument). 457 * 458 * @return quantile 459 */ 460 public double getQuantile() { 461 return quantile; 462 } 463 464 /** 465 * Sets the value of the quantile field (determines what percentile is 466 * computed when evaluate() is called with no quantile argument). 467 * 468 * @param p a value between 0 < p <= 100 469 * @throws MathIllegalArgumentException if p is not greater than 0 and less 470 * than or equal to 100 471 */ 472 public void setQuantile(final double p) throws MathIllegalArgumentException { 473 if (p <= 0 || p > 100) { 474 throw new OutOfRangeException( 475 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100); 476 } 477 quantile = p; 478 } 479 480 /** 481 * {@inheritDoc} 482 */ 483 @Override 484 public Percentile copy() { 485 Percentile result = new Percentile(); 486 //No try-catch or advertised exception because args are guaranteed non-null 487 copy(this, result); 488 return result; 489 } 490 491 /** 492 * Copies source to dest. 493 * <p>Neither source nor dest can be null.</p> 494 * 495 * @param source Percentile to copy 496 * @param dest Percentile to copy to 497 * @throws NullArgumentException if either source or dest is null 498 */ 499 public static void copy(Percentile source, Percentile dest) 500 throws NullArgumentException { 501 MathUtils.checkNotNull(source); 502 MathUtils.checkNotNull(dest); 503 dest.setData(source.getDataRef()); 504 if (source.cachedPivots != null) { 505 System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length); 506 } 507 dest.quantile = source.quantile; 508 } 509 510 }