001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.math3.stat.descriptive.rank; 018 019import java.io.Serializable; 020import java.util.Arrays; 021import java.util.BitSet; 022 023import org.apache.commons.math3.exception.MathIllegalArgumentException; 024import org.apache.commons.math3.exception.MathUnsupportedOperationException; 025import org.apache.commons.math3.exception.NullArgumentException; 026import org.apache.commons.math3.exception.OutOfRangeException; 027import org.apache.commons.math3.exception.util.LocalizedFormats; 028import org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic; 029import org.apache.commons.math3.stat.ranking.NaNStrategy; 030import org.apache.commons.math3.util.FastMath; 031import org.apache.commons.math3.util.KthSelector; 032import org.apache.commons.math3.util.MathArrays; 033import org.apache.commons.math3.util.MathUtils; 034import org.apache.commons.math3.util.MedianOf3PivotingStrategy; 035import org.apache.commons.math3.util.PivotingStrategyInterface; 036import org.apache.commons.math3.util.Precision; 037 038/** 039 * Provides percentile computation. 040 * <p> 041 * There are several commonly used methods for estimating percentiles (a.k.a. 042 * quantiles) based on sample data. For large samples, the different methods 043 * agree closely, but when sample sizes are small, different methods will give 044 * significantly different results. The algorithm implemented here works as follows: 045 * <ol> 046 * <li>Let <code>n</code> be the length of the (sorted) array and 047 * <code>0 < p <= 100</code> be the desired percentile.</li> 048 * <li>If <code> n = 1 </code> return the unique array element (regardless of 049 * the value of <code>p</code>); otherwise </li> 050 * <li>Compute the estimated percentile position 051 * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code> 052 * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional 053 * part of <code>pos</code>).</li> 054 * <li> If <code>pos < 1</code> return the smallest element in the array.</li> 055 * <li> Else if <code>pos >= n</code> return the largest element in the array.</li> 056 * <li> Else let <code>lower</code> be the element in position 057 * <code>floor(pos)</code> in the array and let <code>upper</code> be the 058 * next element in the array. Return <code>lower + d * (upper - lower)</code> 059 * </li> 060 * </ol></p> 061 * <p> 062 * To compute percentiles, the data must be at least partially ordered. Input 063 * arrays are copied and recursively partitioned using an ordering definition. 064 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined 065 * by {@link java.lang.Double#compareTo(Double)}. This ordering makes 066 * <code>Double.NaN</code> larger than any other value (including 067 * <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median 068 * (50th percentile) of 069 * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p> 070 * <p> 071 * Since percentile estimation usually involves interpolation between array 072 * elements, arrays containing <code>NaN</code> or infinite values will often 073 * result in <code>NaN</code> or infinite values returned.</p> 074 * <p> 075 * Further, to include different estimation types such as R1, R2 as mentioned in 076 * <a href="http://en.wikipedia.org/wiki/Quantile">Quantile page(wikipedia)</a>, 077 * a type specific NaN handling strategy is used to closely match with the 078 * typically observed results from popular tools like R(R1-R9), Excel(R7).</p> 079 * <p> 080 * Since 2.2, Percentile uses only selection instead of complete sorting 081 * and caches selection algorithm state between calls to the various 082 * {@code evaluate} methods. This greatly improves efficiency, both for a single 083 * percentile and multiple percentile computations. To maximize performance when 084 * multiple percentiles are computed based on the same data, users should set the 085 * data array once using either one of the {@link #evaluate(double[], double)} or 086 * {@link #setData(double[])} methods and thereafter {@link #evaluate(double)} 087 * with just the percentile provided. 088 * </p> 089 * <p> 090 * <strong>Note that this implementation is not synchronized.</strong> If 091 * multiple threads access an instance of this class concurrently, and at least 092 * one of the threads invokes the <code>increment()</code> or 093 * <code>clear()</code> method, it must be synchronized externally.</p> 094 * 095 */ 096public class Percentile extends AbstractUnivariateStatistic implements Serializable { 097 098 /** Serializable version identifier */ 099 private static final long serialVersionUID = -8091216485095130416L; 100 101 /** Maximum number of partitioning pivots cached (each level double the number of pivots). */ 102 private static final int MAX_CACHED_LEVELS = 10; 103 104 /** Maximum number of cached pivots in the pivots cached array */ 105 private static final int PIVOTS_HEAP_LENGTH = 0x1 << MAX_CACHED_LEVELS - 1; 106 107 /** Default KthSelector used with default pivoting strategy */ 108 private final KthSelector kthSelector; 109 110 /** Any of the {@link EstimationType}s such as {@link EstimationType#LEGACY CM} can be used. */ 111 private final EstimationType estimationType; 112 113 /** NaN Handling of the input as defined by {@link NaNStrategy} */ 114 private final NaNStrategy nanStrategy; 115 116 /** Determines what percentile is computed when evaluate() is activated 117 * with no quantile argument */ 118 private double quantile; 119 120 /** Cached pivots. */ 121 private int[] cachedPivots; 122 123 /** 124 * Constructs a Percentile with the following defaults. 125 * <ul> 126 * <li>default quantile: 50.0, can be reset with {@link #setQuantile(double)}</li> 127 * <li>default estimation type: {@link EstimationType#LEGACY}, 128 * can be reset with {@link #withEstimationType(EstimationType)}</li> 129 * <li>default NaN strategy: {@link NaNStrategy#REMOVED}, 130 * can be reset with {@link #withNaNStrategy(NaNStrategy)}</li> 131 * <li>a KthSelector that makes use of {@link MedianOf3PivotingStrategy}, 132 * can be reset with {@link #withKthSelector(KthSelector)}</li> 133 * </ul> 134 */ 135 public Percentile() { 136 // No try-catch or advertised exception here - arg is valid 137 this(50.0); 138 } 139 140 /** 141 * Constructs a Percentile with the specific quantile value and the following 142 * <ul> 143 * <li>default method type: {@link EstimationType#LEGACY}</li> 144 * <li>default NaN strategy: {@link NaNStrategy#REMOVED}</li> 145 * <li>a Kth Selector : {@link KthSelector}</li> 146 * </ul> 147 * @param quantile the quantile 148 * @throws MathIllegalArgumentException if p is not greater than 0 and less 149 * than or equal to 100 150 */ 151 public Percentile(final double quantile) throws MathIllegalArgumentException { 152 this(quantile, EstimationType.LEGACY, NaNStrategy.REMOVED, 153 new KthSelector(new MedianOf3PivotingStrategy())); 154 } 155 156 /** 157 * Copy constructor, creates a new {@code Percentile} identical 158 * to the {@code original} 159 * 160 * @param original the {@code Percentile} instance to copy 161 * @throws NullArgumentException if original is null 162 */ 163 public Percentile(final Percentile original) throws NullArgumentException { 164 165 MathUtils.checkNotNull(original); 166 estimationType = original.getEstimationType(); 167 nanStrategy = original.getNaNStrategy(); 168 kthSelector = original.getKthSelector(); 169 170 setData(original.getDataRef()); 171 if (original.cachedPivots != null) { 172 System.arraycopy(original.cachedPivots, 0, cachedPivots, 0, original.cachedPivots.length); 173 } 174 setQuantile(original.quantile); 175 176 } 177 178 /** 179 * Constructs a Percentile with the specific quantile value, 180 * {@link EstimationType}, {@link NaNStrategy} and {@link KthSelector}. 181 * 182 * @param quantile the quantile to be computed 183 * @param estimationType one of the percentile {@link EstimationType estimation types} 184 * @param nanStrategy one of {@link NaNStrategy} to handle with NaNs 185 * @param kthSelector a {@link KthSelector} to use for pivoting during search 186 * @throws MathIllegalArgumentException if p is not within (0,100] 187 * @throws NullArgumentException if type or NaNStrategy passed is null 188 */ 189 protected Percentile(final double quantile, 190 final EstimationType estimationType, 191 final NaNStrategy nanStrategy, 192 final KthSelector kthSelector) 193 throws MathIllegalArgumentException { 194 setQuantile(quantile); 195 cachedPivots = null; 196 MathUtils.checkNotNull(estimationType); 197 MathUtils.checkNotNull(nanStrategy); 198 MathUtils.checkNotNull(kthSelector); 199 this.estimationType = estimationType; 200 this.nanStrategy = nanStrategy; 201 this.kthSelector = kthSelector; 202 } 203 204 /** {@inheritDoc} */ 205 @Override 206 public void setData(final double[] values) { 207 if (values == null) { 208 cachedPivots = null; 209 } else { 210 cachedPivots = new int[PIVOTS_HEAP_LENGTH]; 211 Arrays.fill(cachedPivots, -1); 212 } 213 super.setData(values); 214 } 215 216 /** {@inheritDoc} */ 217 @Override 218 public void setData(final double[] values, final int begin, final int length) 219 throws MathIllegalArgumentException { 220 if (values == null) { 221 cachedPivots = null; 222 } else { 223 cachedPivots = new int[PIVOTS_HEAP_LENGTH]; 224 Arrays.fill(cachedPivots, -1); 225 } 226 super.setData(values, begin, length); 227 } 228 229 /** 230 * Returns the result of evaluating the statistic over the stored data. 231 * <p> 232 * The stored array is the one which was set by previous calls to 233 * {@link #setData(double[])} 234 * </p> 235 * @param p the percentile value to compute 236 * @return the value of the statistic applied to the stored data 237 * @throws MathIllegalArgumentException if p is not a valid quantile value 238 * (p must be greater than 0 and less than or equal to 100) 239 */ 240 public double evaluate(final double p) throws MathIllegalArgumentException { 241 return evaluate(getDataRef(), p); 242 } 243 244 /** 245 * Returns an estimate of the <code>p</code>th percentile of the values 246 * in the <code>values</code> array. 247 * <p> 248 * Calls to this method do not modify the internal <code>quantile</code> 249 * state of this statistic.</p> 250 * <p> 251 * <ul> 252 * <li>Returns <code>Double.NaN</code> if <code>values</code> has length 253 * <code>0</code></li> 254 * <li>Returns (for any value of <code>p</code>) <code>values[0]</code> 255 * if <code>values</code> has length <code>1</code></li> 256 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code> 257 * is null or p is not a valid quantile value (p must be greater than 0 258 * and less than or equal to 100) </li> 259 * </ul></p> 260 * <p> 261 * See {@link Percentile} for a description of the percentile estimation 262 * algorithm used.</p> 263 * 264 * @param values input array of values 265 * @param p the percentile value to compute 266 * @return the percentile value or Double.NaN if the array is empty 267 * @throws MathIllegalArgumentException if <code>values</code> is null 268 * or p is invalid 269 */ 270 public double evaluate(final double[] values, final double p) 271 throws MathIllegalArgumentException { 272 test(values, 0, 0); 273 return evaluate(values, 0, values.length, p); 274 } 275 276 /** 277 * Returns an estimate of the <code>quantile</code>th percentile of the 278 * designated values in the <code>values</code> array. The quantile 279 * estimated is determined by the <code>quantile</code> property. 280 * <p> 281 * <ul> 282 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> 283 * <li>Returns (for any value of <code>quantile</code>) 284 * <code>values[begin]</code> if <code>length = 1 </code></li> 285 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code> 286 * is null, or <code>start</code> or <code>length</code> is invalid</li> 287 * </ul></p> 288 * <p> 289 * See {@link Percentile} for a description of the percentile estimation 290 * algorithm used.</p> 291 * 292 * @param values the input array 293 * @param start index of the first array element to include 294 * @param length the number of elements to include 295 * @return the percentile value 296 * @throws MathIllegalArgumentException if the parameters are not valid 297 * 298 */ 299 @Override 300 public double evaluate(final double[] values, final int start, final int length) 301 throws MathIllegalArgumentException { 302 return evaluate(values, start, length, quantile); 303 } 304 305 /** 306 * Returns an estimate of the <code>p</code>th percentile of the values 307 * in the <code>values</code> array, starting with the element in (0-based) 308 * position <code>begin</code> in the array and including <code>length</code> 309 * values. 310 * <p> 311 * Calls to this method do not modify the internal <code>quantile</code> 312 * state of this statistic.</p> 313 * <p> 314 * <ul> 315 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li> 316 * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code> 317 * if <code>length = 1 </code></li> 318 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code> 319 * is null , <code>begin</code> or <code>length</code> is invalid, or 320 * <code>p</code> is not a valid quantile value (p must be greater than 0 321 * and less than or equal to 100)</li> 322 * </ul></p> 323 * <p> 324 * See {@link Percentile} for a description of the percentile estimation 325 * algorithm used.</p> 326 * 327 * @param values array of input values 328 * @param p the percentile to compute 329 * @param begin the first (0-based) element to include in the computation 330 * @param length the number of array elements to include 331 * @return the percentile value 332 * @throws MathIllegalArgumentException if the parameters are not valid or the 333 * input array is null 334 */ 335 public double evaluate(final double[] values, final int begin, 336 final int length, final double p) 337 throws MathIllegalArgumentException { 338 339 test(values, begin, length); 340 if (p > 100 || p <= 0) { 341 throw new OutOfRangeException( 342 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100); 343 } 344 if (length == 0) { 345 return Double.NaN; 346 } 347 if (length == 1) { 348 return values[begin]; // always return single value for n = 1 349 } 350 351 final double[] work = getWorkArray(values, begin, length); 352 final int[] pivotsHeap = getPivots(values); 353 return work.length == 0 ? Double.NaN : 354 estimationType.evaluate(work, pivotsHeap, p, kthSelector); 355 } 356 357 /** Select a pivot index as the median of three 358 * <p> 359 * <b>Note:</b> With the effect of allowing {@link KthSelector} to be set on 360 * {@link Percentile} instances(thus indirectly {@link PivotingStrategy}) 361 * this method wont take effect any more and hence is unsupported. 362 * @param work data array 363 * @param begin index of the first element of the slice 364 * @param end index after the last element of the slice 365 * @return the index of the median element chosen between the 366 * first, the middle and the last element of the array slice 367 * @deprecated Please refrain from using this method (as it wont take effect) 368 * and instead use {@link Percentile#withKthSelector(newKthSelector)} if 369 * required. 370 * 371 */ 372 @Deprecated 373 int medianOf3(final double[] work, final int begin, final int end) { 374 return new MedianOf3PivotingStrategy().pivotIndex(work, begin, end); 375 //throw new MathUnsupportedOperationException(); 376 } 377 378 /** 379 * Returns the value of the quantile field (determines what percentile is 380 * computed when evaluate() is called with no quantile argument). 381 * 382 * @return quantile set while construction or {@link #setQuantile(double)} 383 */ 384 public double getQuantile() { 385 return quantile; 386 } 387 388 /** 389 * Sets the value of the quantile field (determines what percentile is 390 * computed when evaluate() is called with no quantile argument). 391 * 392 * @param p a value between 0 < p <= 100 393 * @throws MathIllegalArgumentException if p is not greater than 0 and less 394 * than or equal to 100 395 */ 396 public void setQuantile(final double p) throws MathIllegalArgumentException { 397 if (p <= 0 || p > 100) { 398 throw new OutOfRangeException( 399 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100); 400 } 401 quantile = p; 402 } 403 404 /** 405 * {@inheritDoc} 406 */ 407 @Override 408 public Percentile copy() { 409 return new Percentile(this); 410 } 411 412 /** 413 * Copies source to dest. 414 * @param source Percentile to copy 415 * @param dest Percentile to copy to 416 * @exception MathUnsupportedOperationException always thrown since 3.4 417 * @deprecated as of 3.4 this method does not work anymore, as it fails to 418 * copy internal states between instances configured with different 419 * {@link EstimationType estimation type}, {@link NaNStrategy NaN handling strategies} 420 * and {@link KthSelector kthSelector}, it therefore always 421 * throw {@link MathUnsupportedOperationException} 422 */ 423 @Deprecated 424 public static void copy(final Percentile source, final Percentile dest) 425 throws MathUnsupportedOperationException { 426 throw new MathUnsupportedOperationException(); 427 } 428 429 /** 430 * Get the work array to operate. Makes use of prior {@code storedData} if 431 * it exists or else do a check on NaNs and copy a subset of the array 432 * defined by begin and length parameters. The set {@link #nanStrategy} will 433 * be used to either retain/remove/replace any NaNs present before returning 434 * the resultant array. 435 * 436 * @param values the array of numbers 437 * @param begin index to start reading the array 438 * @param length the length of array to be read from the begin index 439 * @return work array sliced from values in the range [begin,begin+length) 440 * @throws MathIllegalArgumentException if values or indices are invalid 441 */ 442 protected double[] getWorkArray(final double[] values, final int begin, final int length) { 443 final double[] work; 444 if (values == getDataRef()) { 445 work = getDataRef(); 446 } else { 447 switch (nanStrategy) { 448 case MAXIMAL:// Replace NaNs with +INFs 449 work = replaceAndSlice(values, begin, length, Double.NaN, Double.POSITIVE_INFINITY); 450 break; 451 case MINIMAL:// Replace NaNs with -INFs 452 work = replaceAndSlice(values, begin, length, Double.NaN, Double.NEGATIVE_INFINITY); 453 break; 454 case REMOVED:// Drop NaNs from data 455 work = removeAndSlice(values, begin, length, Double.NaN); 456 break; 457 case FAILED:// just throw exception as NaN is un-acceptable 458 work = copyOf(values, begin, length); 459 MathArrays.checkNotNaN(work); 460 break; 461 default: //FIXED 462 work = copyOf(values,begin,length); 463 break; 464 } 465 } 466 return work; 467 } 468 469 /** 470 * Make a copy of the array for the slice defined by array part from 471 * [begin, begin+length) 472 * @param values the input array 473 * @param begin start index of the array to include 474 * @param length number of elements to include from begin 475 * @return copy of a slice of the original array 476 */ 477 private static double[] copyOf(final double[] values, final int begin, final int length) { 478 MathArrays.verifyValues(values, begin, length); 479 return MathArrays.copyOfRange(values, begin, begin + length); 480 } 481 482 /** 483 * Replace every occurrence of a given value with a replacement value in a 484 * copied slice of array defined by array part from [begin, begin+length). 485 * @param values the input array 486 * @param begin start index of the array to include 487 * @param length number of elements to include from begin 488 * @param original the value to be replaced with 489 * @param replacement the value to be used for replacement 490 * @return the copy of sliced array with replaced values 491 */ 492 private static double[] replaceAndSlice(final double[] values, 493 final int begin, final int length, 494 final double original, 495 final double replacement) { 496 final double[] temp = copyOf(values, begin, length); 497 for(int i = 0; i < length; i++) { 498 temp[i] = Precision.equalsIncludingNaN(original, temp[i]) ? 499 replacement : temp[i]; 500 } 501 return temp; 502 } 503 504 /** 505 * Remove the occurrence of a given value in a copied slice of array 506 * defined by the array part from [begin, begin+length). 507 * @param values the input array 508 * @param begin start index of the array to include 509 * @param length number of elements to include from begin 510 * @param removedValue the value to be removed from the sliced array 511 * @return the copy of the sliced array after removing the removedValue 512 */ 513 private static double[] removeAndSlice(final double[] values, 514 final int begin, final int length, 515 final double removedValue) { 516 MathArrays.verifyValues(values, begin, length); 517 final double[] temp; 518 //BitSet(length) to indicate where the removedValue is located 519 final BitSet bits = new BitSet(length); 520 for (int i = begin; i < begin+length; i++) { 521 if (Precision.equalsIncludingNaN(removedValue, values[i])) { 522 bits.set(i - begin); 523 } 524 } 525 //Check if empty then create a new copy 526 if (bits.isEmpty()) { 527 temp = copyOf(values, begin, length); // Nothing removed, just copy 528 } else if(bits.cardinality() == length){ 529 temp = new double[0]; // All removed, just empty 530 }else { // Some removable, so new 531 temp = new double[length - bits.cardinality()]; 532 int start = begin; //start index from source array (i.e values) 533 int dest = 0; //dest index in destination array(i.e temp) 534 int nextOne = -1; //nextOne is the index of bit set of next one 535 int bitSetPtr = 0; //bitSetPtr is start index pointer of bitset 536 while ((nextOne = bits.nextSetBit(bitSetPtr)) != -1) { 537 final int lengthToCopy = nextOne - bitSetPtr; 538 System.arraycopy(values, start, temp, dest, lengthToCopy); 539 dest += lengthToCopy; 540 start = begin + (bitSetPtr = bits.nextClearBit(nextOne)); 541 } 542 //Copy any residue past start index till begin+length 543 if (start < begin + length) { 544 System.arraycopy(values,start,temp,dest,begin + length - start); 545 } 546 } 547 return temp; 548 } 549 550 /** 551 * Get pivots which is either cached or a newly created one 552 * 553 * @param values array containing the input numbers 554 * @return cached pivots or a newly created one 555 */ 556 private int[] getPivots(final double[] values) { 557 final int[] pivotsHeap; 558 if (values == getDataRef()) { 559 pivotsHeap = cachedPivots; 560 } else { 561 pivotsHeap = new int[PIVOTS_HEAP_LENGTH]; 562 Arrays.fill(pivotsHeap, -1); 563 } 564 return pivotsHeap; 565 } 566 567 /** 568 * Get the estimation {@link EstimationType type} used for computation. 569 * 570 * @return the {@code estimationType} set 571 */ 572 public EstimationType getEstimationType() { 573 return estimationType; 574 } 575 576 /** 577 * Build a new instance similar to the current one except for the 578 * {@link EstimationType estimation type}. 579 * <p> 580 * This method is intended to be used as part of a fluent-type builder 581 * pattern. Building finely tune instances should be done as follows: 582 * </p> 583 * <pre> 584 * Percentile customized = new Percentile(quantile). 585 * withEstimationType(estimationType). 586 * withNaNStrategy(nanStrategy). 587 * withKthSelector(kthSelector); 588 * </pre> 589 * <p> 590 * If any of the {@code withXxx} method is omitted, the default value for 591 * the corresponding customization parameter will be used. 592 * </p> 593 * @param newEstimationType estimation type for the new instance 594 * @return a new instance, with changed estimation type 595 * @throws NullArgumentException when newEstimationType is null 596 */ 597 public Percentile withEstimationType(final EstimationType newEstimationType) { 598 return new Percentile(quantile, newEstimationType, nanStrategy, kthSelector); 599 } 600 601 /** 602 * Get the {@link NaNStrategy NaN Handling} strategy used for computation. 603 * @return {@code NaN Handling} strategy set during construction 604 */ 605 public NaNStrategy getNaNStrategy() { 606 return nanStrategy; 607 } 608 609 /** 610 * Build a new instance similar to the current one except for the 611 * {@link NaNStrategy NaN handling} strategy. 612 * <p> 613 * This method is intended to be used as part of a fluent-type builder 614 * pattern. Building finely tune instances should be done as follows: 615 * </p> 616 * <pre> 617 * Percentile customized = new Percentile(quantile). 618 * withEstimationType(estimationType). 619 * withNaNStrategy(nanStrategy). 620 * withKthSelector(kthSelector); 621 * </pre> 622 * <p> 623 * If any of the {@code withXxx} method is omitted, the default value for 624 * the corresponding customization parameter will be used. 625 * </p> 626 * @param newNaNStrategy NaN strategy for the new instance 627 * @return a new instance, with changed NaN handling strategy 628 * @throws NullArgumentException when newNaNStrategy is null 629 */ 630 public Percentile withNaNStrategy(final NaNStrategy newNaNStrategy) { 631 return new Percentile(quantile, estimationType, newNaNStrategy, kthSelector); 632 } 633 634 /** 635 * Get the {@link KthSelector kthSelector} used for computation. 636 * @return the {@code kthSelector} set 637 */ 638 public KthSelector getKthSelector() { 639 return kthSelector; 640 } 641 642 /** 643 * Get the {@link PivotingStrategyInterface} used in KthSelector for computation. 644 * @return the pivoting strategy set 645 */ 646 public PivotingStrategyInterface getPivotingStrategy() { 647 return kthSelector.getPivotingStrategy(); 648 } 649 650 /** 651 * Build a new instance similar to the current one except for the 652 * {@link KthSelector kthSelector} instance specifically set. 653 * <p> 654 * This method is intended to be used as part of a fluent-type builder 655 * pattern. Building finely tune instances should be done as follows: 656 * </p> 657 * <pre> 658 * Percentile customized = new Percentile(quantile). 659 * withEstimationType(estimationType). 660 * withNaNStrategy(nanStrategy). 661 * withKthSelector(newKthSelector); 662 * </pre> 663 * <p> 664 * If any of the {@code withXxx} method is omitted, the default value for 665 * the corresponding customization parameter will be used. 666 * </p> 667 * @param newKthSelector KthSelector for the new instance 668 * @return a new instance, with changed KthSelector 669 * @throws NullArgumentException when newKthSelector is null 670 */ 671 public Percentile withKthSelector(final KthSelector newKthSelector) { 672 return new Percentile(quantile, estimationType, nanStrategy, 673 newKthSelector); 674 } 675 676 /** 677 * An enum for various estimation strategies of a percentile referred in 678 * <a href="http://en.wikipedia.org/wiki/Quantile">wikipedia on quantile</a> 679 * with the names of enum matching those of types mentioned in 680 * wikipedia. 681 * <p> 682 * Each enum corresponding to the specific type of estimation in wikipedia 683 * implements the respective formulae that specializes in the below aspects 684 * <ul> 685 * <li>An <b>index method</b> to calculate approximate index of the 686 * estimate</li> 687 * <li>An <b>estimate method</b> to estimate a value found at the earlier 688 * computed index</li> 689 * <li>A <b> minLimit</b> on the quantile for which first element of sorted 690 * input is returned as an estimate </li> 691 * <li>A <b> maxLimit</b> on the quantile for which last element of sorted 692 * input is returned as an estimate </li> 693 * </ul> 694 * <p> 695 * Users can now create {@link Percentile} by explicitly passing this enum; 696 * such as by invoking {@link Percentile#withEstimationType(EstimationType)} 697 * <p> 698 * References: 699 * <ol> 700 * <li> 701 * <a href="http://en.wikipedia.org/wiki/Quantile">Wikipedia on quantile</a> 702 * </li> 703 * <li> 704 * <a href="https://www.amherst.edu/media/view/129116/.../Sample+Quantiles.pdf"> 705 * Hyndman, R. J. and Fan, Y. (1996) Sample quantiles in statistical 706 * packages, American Statistician 50, 361–365</a> </li> 707 * <li> 708 * <a href="http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html"> 709 * R-Manual </a></li> 710 * </ol> 711 * 712 */ 713 public enum EstimationType { 714 /** 715 * This is the default type used in the {@link Percentile}.This method 716 * has the following formulae for index and estimates<br> 717 * \( \begin{align} 718 * &index = (N+1)p\ \\ 719 * &estimate = x_{\lceil h\,-\,1/2 \rceil} \\ 720 * &minLimit = 0 \\ 721 * &maxLimit = 1 \\ 722 * \end{align}\) 723 */ 724 LEGACY("Legacy Apache Commons Math") { 725 /** 726 * {@inheritDoc}.This method in particular makes use of existing 727 * Apache Commons Math style of picking up the index. 728 */ 729 @Override 730 protected double index(final double p, final int length) { 731 final double minLimit = 0d; 732 final double maxLimit = 1d; 733 return Double.compare(p, minLimit) == 0 ? 0 : 734 Double.compare(p, maxLimit) == 0 ? 735 length : p * (length + 1); 736 } 737 }, 738 /** 739 * The method R_1 has the following formulae for index and estimates<br> 740 * \( \begin{align} 741 * &index= Np + 1/2\, \\ 742 * &estimate= x_{\lceil h\,-\,1/2 \rceil} \\ 743 * &minLimit = 0 \\ 744 * \end{align}\) 745 */ 746 R_1("R-1") { 747 748 @Override 749 protected double index(final double p, final int length) { 750 final double minLimit = 0d; 751 return Double.compare(p, minLimit) == 0 ? 0 : length * p + 0.5; 752 } 753 754 /** 755 * {@inheritDoc}This method in particular for R_1 uses ceil(pos-0.5) 756 */ 757 @Override 758 protected double estimate(final double[] values, 759 final int[] pivotsHeap, final double pos, 760 final int length, final KthSelector selector) { 761 return super.estimate(values, pivotsHeap, FastMath.ceil(pos - 0.5), length, selector); 762 } 763 764 }, 765 /** 766 * The method R_2 has the following formulae for index and estimates<br> 767 * \( \begin{align} 768 * &index= Np + 1/2\, \\ 769 * &estimate=\frac{x_{\lceil h\,-\,1/2 \rceil} + 770 * x_{\lfloor h\,+\,1/2 \rfloor}}{2} \\ 771 * &minLimit = 0 \\ 772 * &maxLimit = 1 \\ 773 * \end{align}\) 774 */ 775 R_2("R-2") { 776 777 @Override 778 protected double index(final double p, final int length) { 779 final double minLimit = 0d; 780 final double maxLimit = 1d; 781 return Double.compare(p, maxLimit) == 0 ? length : 782 Double.compare(p, minLimit) == 0 ? 0 : length * p + 0.5; 783 } 784 785 /** 786 * {@inheritDoc}This method in particular for R_2 averages the 787 * values at ceil(p+0.5) and floor(p-0.5). 788 */ 789 @Override 790 protected double estimate(final double[] values, 791 final int[] pivotsHeap, final double pos, 792 final int length, final KthSelector selector) { 793 final double low = 794 super.estimate(values, pivotsHeap, FastMath.ceil(pos - 0.5), length, selector); 795 final double high = 796 super.estimate(values, pivotsHeap,FastMath.floor(pos + 0.5), length, selector); 797 return (low + high) / 2; 798 } 799 800 }, 801 /** 802 * The method R_3 has the following formulae for index and estimates<br> 803 * \( \begin{align} 804 * &index= Np \\ 805 * &estimate= x_{\lfloor h \rceil}\, \\ 806 * &minLimit = 0.5/N \\ 807 * \end{align}\) 808 */ 809 R_3("R-3") { 810 @Override 811 protected double index(final double p, final int length) { 812 final double minLimit = 1d/2 / length; 813 return Double.compare(p, minLimit) <= 0 ? 814 0 : FastMath.rint(length * p); 815 } 816 817 }, 818 /** 819 * The method R_4 has the following formulae for index and estimates<br> 820 * \( \begin{align} 821 * &index= Np\, \\ 822 * &estimate= x_{\lfloor h \rfloor} + (h - 823 * \lfloor h \rfloor) (x_{\lfloor h \rfloor + 1} - x_{\lfloor h 824 * \rfloor}) \\ 825 * &minLimit = 1/N \\ 826 * &maxLimit = 1 \\ 827 * \end{align}\) 828 */ 829 R_4("R-4") { 830 @Override 831 protected double index(final double p, final int length) { 832 final double minLimit = 1d / length; 833 final double maxLimit = 1d; 834 return Double.compare(p, minLimit) < 0 ? 0 : 835 Double.compare(p, maxLimit) == 0 ? length : length * p; 836 } 837 838 }, 839 /** 840 * The method R_5 has the following formulae for index and estimates<br> 841 * \( \begin{align} 842 * &index= Np + 1/2\\ 843 * &estimate= x_{\lfloor h \rfloor} + (h - 844 * \lfloor h \rfloor) (x_{\lfloor h \rfloor + 1} - x_{\lfloor h 845 * \rfloor}) \\ 846 * &minLimit = 0.5/N \\ 847 * &maxLimit = (N-0.5)/N 848 * \end{align}\) 849 */ 850 R_5("R-5"){ 851 852 @Override 853 protected double index(final double p, final int length) { 854 final double minLimit = 1d/2 / length; 855 final double maxLimit = (length - 0.5) / length; 856 return Double.compare(p, minLimit) < 0 ? 0 : 857 Double.compare(p, maxLimit) >= 0 ? 858 length : length * p + 0.5; 859 } 860 }, 861 /** 862 * The method R_6 has the following formulae for index and estimates<br> 863 * \( \begin{align} 864 * &index= (N + 1)p \\ 865 * &estimate= x_{\lfloor h \rfloor} + (h - 866 * \lfloor h \rfloor) (x_{\lfloor h \rfloor + 1} - x_{\lfloor h 867 * \rfloor}) \\ 868 * &minLimit = 1/(N+1) \\ 869 * &maxLimit = N/(N+1) \\ 870 * \end{align}\) 871 * <p> 872 * <b>Note:</b> This method computes the index in a manner very close to 873 * the default Commons Math Percentile existing implementation. However 874 * the difference to be noted is in picking up the limits with which 875 * first element (p<1(N+1)) and last elements (p>N/(N+1))are done. 876 * While in default case; these are done with p=0 and p=1 respectively. 877 */ 878 R_6("R-6"){ 879 880 @Override 881 protected double index(final double p, final int length) { 882 final double minLimit = 1d / (length + 1); 883 final double maxLimit = 1d * length / (length + 1); 884 return Double.compare(p, minLimit) < 0 ? 0 : 885 Double.compare(p, maxLimit) >= 0 ? 886 length : (length + 1) * p; 887 } 888 }, 889 890 /** 891 * The method R_7 implements Microsoft Excel style computation has the 892 * following formulae for index and estimates.<br> 893 * \( \begin{align} 894 * &index = (N-1)p + 1 \\ 895 * &estimate = x_{\lfloor h \rfloor} + (h - 896 * \lfloor h \rfloor) (x_{\lfloor h \rfloor + 1} - x_{\lfloor h 897 * \rfloor}) \\ 898 * &minLimit = 0 \\ 899 * &maxLimit = 1 \\ 900 * \end{align}\) 901 */ 902 R_7("R-7") { 903 @Override 904 protected double index(final double p, final int length) { 905 final double minLimit = 0d; 906 final double maxLimit = 1d; 907 return Double.compare(p, minLimit) == 0 ? 0 : 908 Double.compare(p, maxLimit) == 0 ? 909 length : 1 + (length - 1) * p; 910 } 911 912 }, 913 914 /** 915 * The method R_8 has the following formulae for index and estimates<br> 916 * \( \begin{align} 917 * &index = (N + 1/3)p + 1/3 \\ 918 * &estimate = x_{\lfloor h \rfloor} + (h - 919 \lfloor h \rfloor) (x_{\lfloor h \rfloor + 1} - x_{\lfloor h 920 * \rfloor}) \\ 921 * &minLimit = (2/3)/(N+1/3) \\ 922 * &maxLimit = (N-1/3)/(N+1/3) \\ 923 * \end{align}\) 924 * <p> 925 * As per Ref [2,3] this approach is most recommended as it provides 926 * an approximate median-unbiased estimate regardless of distribution. 927 */ 928 R_8("R-8") { 929 @Override 930 protected double index(final double p, final int length) { 931 final double minLimit = 2 * (1d / 3) / (length + 1d / 3); 932 final double maxLimit = 933 (length - 1d / 3) / (length + 1d / 3); 934 return Double.compare(p, minLimit) < 0 ? 0 : 935 Double.compare(p, maxLimit) >= 0 ? length : 936 (length + 1d / 3) * p + 1d / 3; 937 } 938 }, 939 940 /** 941 * The method R_9 has the following formulae for index and estimates<br> 942 * \( \begin{align} 943 * &index = (N + 1/4)p + 3/8\\ 944 * &estimate = x_{\lfloor h \rfloor} + (h - 945 \lfloor h \rfloor) (x_{\lfloor h \rfloor + 1} - x_{\lfloor h 946 * \rfloor}) \\ 947 * &minLimit = (5/8)/(N+1/4) \\ 948 * &maxLimit = (N-3/8)/(N+1/4) \\ 949 * \end{align}\) 950 */ 951 R_9("R-9") { 952 @Override 953 protected double index(final double p, final int length) { 954 final double minLimit = 5d/8 / (length + 0.25); 955 final double maxLimit = (length - 3d/8) / (length + 0.25); 956 return Double.compare(p, minLimit) < 0 ? 0 : 957 Double.compare(p, maxLimit) >= 0 ? length : 958 (length + 0.25) * p + 3d/8; 959 } 960 961 }, 962 ; 963 964 /** Simple name such as R-1, R-2 corresponding to those in wikipedia. */ 965 private final String name; 966 967 /** 968 * Constructor 969 * 970 * @param type name of estimation type as per wikipedia 971 */ 972 EstimationType(final String type) { 973 this.name = type; 974 } 975 976 /** 977 * Finds the index of array that can be used as starting index to 978 * {@link #estimate(double[], int[], double, int, KthSelector) estimate} 979 * percentile. The calculation of index calculation is specific to each 980 * {@link EstimationType}. 981 * 982 * @param p the p<sup>th</sup> quantile 983 * @param length the total number of array elements in the work array 984 * @return a computed real valued index as explained in the wikipedia 985 */ 986 protected abstract double index(final double p, final int length); 987 988 /** 989 * Estimation based on K<sup>th</sup> selection. This may be overridden 990 * in specific enums to compute slightly different estimations. 991 * 992 * @param work array of numbers to be used for finding the percentile 993 * @param pos indicated positional index prior computed from calling 994 * {@link #index(double, int)} 995 * @param pivotsHeap an earlier populated cache if exists; will be used 996 * @param length size of array considered 997 * @param selector a {@link KthSelector} used for pivoting during search 998 * @return estimated percentile 999 */ 1000 protected double estimate(final double[] work, final int[] pivotsHeap, 1001 final double pos, final int length, 1002 final KthSelector selector) { 1003 1004 final double fpos = FastMath.floor(pos); 1005 final int intPos = (int) fpos; 1006 final double dif = pos - fpos; 1007 1008 if (pos < 1) { 1009 return selector.select(work, pivotsHeap, 0); 1010 } 1011 if (pos >= length) { 1012 return selector.select(work, pivotsHeap, length - 1); 1013 } 1014 1015 final double lower = selector.select(work, pivotsHeap, intPos - 1); 1016 final double upper = selector.select(work, pivotsHeap, intPos); 1017 return lower + dif * (upper - lower); 1018 } 1019 1020 /** 1021 * Evaluate method to compute the percentile for a given bounded array 1022 * using earlier computed pivots heap.<br> 1023 * This basically calls the {@link #index(double, int) index} and then 1024 * {@link #estimate(double[], int[], double, int, KthSelector) estimate} 1025 * functions to return the estimated percentile value. 1026 * 1027 * @param work array of numbers to be used for finding the percentile 1028 * @param pivotsHeap a prior cached heap which can speed up estimation 1029 * @param p the p<sup>th</sup> quantile to be computed 1030 * @param selector a {@link KthSelector} used for pivoting during search 1031 * @return estimated percentile 1032 * @throws OutOfRangeException if p is out of range 1033 * @throws NullArgumentException if work array is null 1034 */ 1035 protected double evaluate(final double[] work, final int[] pivotsHeap, final double p, 1036 final KthSelector selector) { 1037 MathUtils.checkNotNull(work); 1038 if (p > 100 || p <= 0) { 1039 throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, 1040 p, 0, 100); 1041 } 1042 return estimate(work, pivotsHeap, index(p/100d, work.length), work.length, selector); 1043 } 1044 1045 /** 1046 * Evaluate method to compute the percentile for a given bounded array. 1047 * This basically calls the {@link #index(double, int) index} and then 1048 * {@link #estimate(double[], int[], double, int, KthSelector) estimate} 1049 * functions to return the estimated percentile value. Please 1050 * note that this method does not make use of cached pivots. 1051 * 1052 * @param work array of numbers to be used for finding the percentile 1053 * @param p the p<sup>th</sup> quantile to be computed 1054 * @return estimated percentile 1055 * @param selector a {@link KthSelector} used for pivoting during search 1056 * @throws OutOfRangeException if length or p is out of range 1057 * @throws NullArgumentException if work array is null 1058 */ 1059 public double evaluate(final double[] work, final double p, final KthSelector selector) { 1060 return this.evaluate(work, null, p, selector); 1061 } 1062 1063 /** 1064 * Gets the name of the enum 1065 * 1066 * @return the name 1067 */ 1068 String getName() { 1069 return name; 1070 } 1071 } 1072}