001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.descriptive; 018 019import java.util.Arrays; 020import java.util.Objects; 021import java.util.function.IntToDoubleFunction; 022import org.apache.commons.numbers.arrays.Selection; 023 024/** 025 * Provides quantile computation. 026 * 027 * <p>For values of length {@code n}: 028 * <ul> 029 * <li>The result is {@code NaN} if {@code n = 0}. 030 * <li>The result is {@code values[0]} if {@code n = 1}. 031 * <li>Otherwise the result is computed using the {@link EstimationMethod}. 032 * </ul> 033 * 034 * <p>Computation of multiple quantiles and will handle duplicate and unordered 035 * probabilities. Passing ordered probabilities is recommended if the order is already 036 * known as this can improve efficiency; for example using uniform spacing through the 037 * array data, or to identify extreme values from the data such as {@code [0.001, 0.999]}. 038 * 039 * <p>This implementation respects the ordering imposed by 040 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs 041 * in the selected positions in the fully sorted values then the result is {@code NaN}. 042 * 043 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values. 044 * 045 * <p>Instances of this class are immutable and thread-safe. 046 * 047 * @see #with(NaNPolicy) 048 * @see <a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a> 049 * @since 1.1 050 */ 051public final class Quantile { 052 /** Message when the probability is not in the range {@code [0, 1]}. */ 053 private static final String INVALID_PROBABILITY = "Invalid probability: "; 054 /** Message when no probabilities are provided for the varargs method. */ 055 private static final String NO_PROBABILITIES_SPECIFIED = "No probabilities specified"; 056 /** Message when the size is not valid. */ 057 private static final String INVALID_SIZE = "Invalid size: "; 058 /** Message when the number of probabilities in a range is not valid. */ 059 private static final String INVALID_NUMBER_OF_PROBABILITIES = "Invalid number of probabilities: "; 060 061 /** Default instance. Method 8 is recommended by Hyndman and Fan. */ 062 private static final Quantile DEFAULT = new Quantile(false, NaNPolicy.INCLUDE, EstimationMethod.HF8); 063 064 /** Flag to indicate if the data should be copied. */ 065 private final boolean copy; 066 /** NaN policy for floating point data. */ 067 private final NaNPolicy nanPolicy; 068 /** Transformer for NaN data. */ 069 private final NaNTransformer nanTransformer; 070 /** Estimation type used to determine the value from the quantile. */ 071 private final EstimationMethod estimationType; 072 073 /** 074 * @param copy Flag to indicate if the data should be copied. 075 * @param nanPolicy NaN policy. 076 * @param estimationType Estimation type used to determine the value from the quantile. 077 */ 078 private Quantile(boolean copy, NaNPolicy nanPolicy, EstimationMethod estimationType) { 079 this.copy = copy; 080 this.nanPolicy = nanPolicy; 081 this.estimationType = estimationType; 082 nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy); 083 } 084 085 /** 086 * Return a new instance with the default options. 087 * 088 * <ul> 089 * <li>{@linkplain #withCopy(boolean) Copy = false} 090 * <li>{@linkplain #with(NaNPolicy) NaN policy = include} 091 * <li>{@linkplain #with(EstimationMethod) Estimation method = HF8} 092 * </ul> 093 * 094 * <p>Note: The default options configure for processing in-place and including 095 * {@code NaN} values in the data. This is the most efficient mode and has the 096 * smallest memory consumption. 097 * 098 * @return the quantile implementation 099 * @see #withCopy(boolean) 100 * @see #with(NaNPolicy) 101 * @see #with(EstimationMethod) 102 */ 103 public static Quantile withDefaults() { 104 return DEFAULT; 105 } 106 107 /** 108 * Return an instance with the configured copy behaviour. If {@code false} then 109 * the input array will be modified by the call to evaluate the quantiles; otherwise 110 * the computation uses a copy of the data. 111 * 112 * @param v Value. 113 * @return an instance 114 */ 115 public Quantile withCopy(boolean v) { 116 return new Quantile(v, nanPolicy, estimationType); 117 } 118 119 /** 120 * Return an instance with the configured {@link NaNPolicy}. 121 * 122 * <p>Note: This implementation respects the ordering imposed by 123 * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is 124 * considered greater than all other values, and all {@code NaN} values are equal. The 125 * {@link NaNPolicy} changes the computation of the statistic in the presence of 126 * {@code NaN} values. 127 * 128 * <ul> 129 * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data; 130 * the size of the data <em>includes</em> the {@code NaN} values and the quantile will be 131 * {@code NaN} if any value used for quantile interpolation is {@code NaN}. 132 * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data; 133 * the size of the data <em>excludes</em> the {@code NaN} values and the quantile will 134 * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero 135 * and the result is {@code NaN}. 136 * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN} 137 * values. 138 * </ul> 139 * 140 * <p>Note that the result is identical for all policies if no {@code NaN} values are present. 141 * 142 * @param v Value. 143 * @return an instance 144 */ 145 public Quantile with(NaNPolicy v) { 146 return new Quantile(copy, Objects.requireNonNull(v), estimationType); 147 } 148 149 /** 150 * Return an instance with the configured {@link EstimationMethod}. 151 * 152 * @param v Value. 153 * @return an instance 154 */ 155 public Quantile with(EstimationMethod v) { 156 return new Quantile(copy, nanPolicy, Objects.requireNonNull(v)); 157 } 158 159 /** 160 * Generate {@code n} evenly spaced probabilities in the range {@code [0, 1]}. 161 * 162 * <pre> 163 * 1/(n + 1), 2/(n + 1), ..., n/(n + 1) 164 * </pre> 165 * 166 * @param n Number of probabilities. 167 * @return the probabilities 168 * @throws IllegalArgumentException if {@code n < 1} 169 */ 170 public static double[] probabilities(int n) { 171 checkNumberOfProbabilities(n); 172 final double c1 = n + 1.0; 173 final double[] p = new double[n]; 174 for (int i = 0; i < n; i++) { 175 p[i] = (i + 1.0) / c1; 176 } 177 return p; 178 } 179 180 /** 181 * Generate {@code n} evenly spaced probabilities in the range {@code [p1, p2]}. 182 * 183 * <pre> 184 * w = p2 - p1 185 * p1 + w/(n + 1), p1 + 2w/(n + 1), ..., p1 + nw/(n + 1) 186 * </pre> 187 * 188 * @param n Number of probabilities. 189 * @param p1 Lower probability. 190 * @param p2 Upper probability. 191 * @return the probabilities 192 * @throws IllegalArgumentException if {@code n < 1}; if the probabilities are not in the 193 * range {@code [0, 1]}; or {@code p2 <= p1}. 194 */ 195 public static double[] probabilities(int n, double p1, double p2) { 196 checkProbability(p1); 197 checkProbability(p2); 198 if (p2 <= p1) { 199 throw new IllegalArgumentException("Invalid range: [" + p1 + ", " + p2 + "]"); 200 } 201 final double[] p = probabilities(n); 202 for (int i = 0; i < n; i++) { 203 p[i] = (1 - p[i]) * p1 + p[i] * p2; 204 } 205 return p; 206 } 207 208 /** 209 * Evaluate the {@code p}-th quantile of the values. 210 * 211 * <p>Note: This method may partially sort the input values if not configured to 212 * {@link #withCopy(boolean) copy} the input data. 213 * 214 * <p><strong>Performance</strong> 215 * 216 * <p>It is not recommended to use this method for repeat calls for different quantiles 217 * within the same values. The {@link #evaluate(double[], double...)} method should be used 218 * which provides better performance. 219 * 220 * @param values Values. 221 * @param p Probability for the quantile to compute. 222 * @return the quantile 223 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}; 224 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 225 * @see #evaluate(double[], double...) 226 * @see #with(NaNPolicy) 227 */ 228 public double evaluate(double[] values, double p) { 229 return compute(values, 0, values.length, p); 230 } 231 232 /** 233 * Evaluate the {@code p}-th quantile of the specified range of values. 234 * 235 * <p>Note: This method may partially sort the input values if not configured to 236 * {@link #withCopy(boolean) copy} the input data. 237 * 238 * <p><strong>Performance</strong> 239 * 240 * <p>It is not recommended to use this method for repeat calls for different quantiles 241 * within the same values. The {@link #evaluateRange(double[], int, int, double...)} method should be used 242 * which provides better performance. 243 * 244 * @param values Values. 245 * @param from Inclusive start of the range. 246 * @param to Exclusive end of the range. 247 * @param p Probability for the quantile to compute. 248 * @return the quantile 249 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}; 250 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 251 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 252 * @see #evaluateRange(double[], int, int, double...) 253 * @see #with(NaNPolicy) 254 * @since 1.2 255 */ 256 public double evaluateRange(double[] values, int from, int to, double p) { 257 Statistics.checkFromToIndex(from, to, values.length); 258 return compute(values, from, to, p); 259 } 260 261 /** 262 * Compute the {@code p}-th quantile of the specified range of values. 263 * 264 * @param values Values. 265 * @param from Inclusive start of the range. 266 * @param to Exclusive end of the range. 267 * @param p Probability for the quantile to compute. 268 * @return the quantile 269 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 270 */ 271 private double compute(double[] values, int from, int to, double p) { 272 checkProbability(p); 273 // Floating-point data handling 274 final int[] bounds = new int[2]; 275 final double[] x = nanTransformer.apply(values, from, to, bounds); 276 final int start = bounds[0]; 277 final int end = bounds[1]; 278 final int n = end - start; 279 // Special cases 280 if (n <= 1) { 281 return n == 0 ? Double.NaN : x[start]; 282 } 283 284 final double pos = estimationType.index(p, n); 285 final int ip = (int) pos; 286 final int i = start + ip; 287 288 // Partition and compute 289 if (pos > ip) { 290 Selection.select(x, start, end, new int[] {i, i + 1}); 291 return Interpolation.interpolate(x[i], x[i + 1], pos - ip); 292 } 293 Selection.select(x, start, end, i); 294 return x[i]; 295 } 296 297 /** 298 * Evaluate the {@code p}-th quantiles of the values. 299 * 300 * <p>Note: This method may partially sort the input values if not configured to 301 * {@link #withCopy(boolean) copy} the input data. 302 * 303 * @param values Values. 304 * @param p Probabilities for the quantiles to compute. 305 * @return the quantiles 306 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 307 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 308 * @see #with(NaNPolicy) 309 */ 310 public double[] evaluate(double[] values, double... p) { 311 return compute(values, 0, values.length, p); 312 } 313 314 /** 315 * Evaluate the {@code p}-th quantiles of the specified range of values. 316 * 317 * <p>Note: This method may partially sort the input values if not configured to 318 * {@link #withCopy(boolean) copy} the input data. 319 * 320 * @param values Values. 321 * @param from Inclusive start of the range. 322 * @param to Exclusive end of the range. 323 * @param p Probabilities for the quantiles to compute. 324 * @return the quantiles 325 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 326 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 327 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 328 * @see #with(NaNPolicy) 329 * @since 1.2 330 */ 331 public double[] evaluateRange(double[] values, int from, int to, double... p) { 332 Statistics.checkFromToIndex(from, to, values.length); 333 return compute(values, from, to, p); 334 } 335 336 /** 337 * Compute the {@code p}-th quantiles of the specified range of values. 338 * 339 * @param values Values. 340 * @param from Inclusive start of the range. 341 * @param to Exclusive end of the range. 342 * @param p Probabilities for the quantiles to compute. 343 * @return the quantiles 344 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 345 * or no probabilities are specified. 346 */ 347 private double[] compute(double[] values, int from, int to, double... p) { 348 checkProbabilities(p); 349 // Floating-point data handling 350 final int[] bounds = new int[2]; 351 final double[] x = nanTransformer.apply(values, from, to, bounds); 352 final int start = bounds[0]; 353 final int end = bounds[1]; 354 final int n = end - start; 355 // Special cases 356 final double[] q = new double[p.length]; 357 if (n <= 1) { 358 Arrays.fill(q, n == 0 ? Double.NaN : x[start]); 359 return q; 360 } 361 362 // Collect interpolation positions. We use the output q as storage. 363 final int[] indices = computeIndices(n, p, q, start); 364 365 // Partition 366 Selection.select(x, start, end, indices); 367 368 // Compute 369 for (int k = 0; k < p.length; k++) { 370 // ip in [0, n); i in [start, end) 371 final int ip = (int) q[k]; 372 final int i = start + ip; 373 if (q[k] > ip) { 374 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip); 375 } else { 376 q[k] = x[i]; 377 } 378 } 379 return q; 380 } 381 382 /** 383 * Evaluate the {@code p}-th quantile of the values. 384 * 385 * <p>Note: This method may partially sort the input values if not configured to 386 * {@link #withCopy(boolean) copy} the input data. 387 * 388 * <p><strong>Performance</strong> 389 * 390 * <p>It is not recommended to use this method for repeat calls for different quantiles 391 * within the same values. The {@link #evaluate(int[], double...)} method should be used 392 * which provides better performance. 393 * 394 * @param values Values. 395 * @param p Probability for the quantile to compute. 396 * @return the quantile 397 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 398 * @see #evaluate(int[], double...) 399 */ 400 public double evaluate(int[] values, double p) { 401 return compute(values, 0, values.length, p); 402 } 403 404 /** 405 * Evaluate the {@code p}-th quantile of the specified range of values. 406 * 407 * <p>Note: This method may partially sort the input values if not configured to 408 * {@link #withCopy(boolean) copy} the input data. 409 * 410 * <p><strong>Performance</strong> 411 * 412 * <p>It is not recommended to use this method for repeat calls for different quantiles 413 * within the same values. The {@link #evaluateRange(int[], int, int, double...)} method should be used 414 * which provides better performance. 415 * 416 * @param values Values. 417 * @param from Inclusive start of the range. 418 * @param to Exclusive end of the range. 419 * @param p Probability for the quantile to compute. 420 * @return the quantile 421 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 422 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 423 * @see #evaluateRange(int[], int, int, double...) 424 * @since 1.2 425 */ 426 public double evaluateRange(int[] values, int from, int to, double p) { 427 Statistics.checkFromToIndex(from, to, values.length); 428 return compute(values, from, to, p); 429 } 430 431 /** 432 * Compute the {@code p}-th quantile of the specified range of values. 433 * 434 * @param values Values. 435 * @param from Inclusive start of the range. 436 * @param to Exclusive end of the range. 437 * @param p Probability for the quantile to compute. 438 * @return the quantile 439 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 440 */ 441 private double compute(int[] values, int from, int to, double p) { 442 checkProbability(p); 443 final int n = to - from; 444 // Special cases 445 if (n <= 1) { 446 return n == 0 ? Double.NaN : values[from]; 447 } 448 449 // Create the range 450 final int[] x; 451 final int start; 452 final int end; 453 if (copy) { 454 x = Statistics.copy(values, from, to); 455 start = 0; 456 end = n; 457 } else { 458 x = values; 459 start = from; 460 end = to; 461 } 462 463 final double pos = estimationType.index(p, n); 464 final int ip = (int) pos; 465 final int i = start + ip; 466 467 // Partition and compute 468 if (pos > ip) { 469 Selection.select(x, start, end, new int[] {i, i + 1}); 470 return Interpolation.interpolate(x[i], x[i + 1], pos - ip); 471 } 472 Selection.select(x, start, end, i); 473 return x[i]; 474 } 475 476 /** 477 * Evaluate the {@code p}-th quantiles of the values. 478 * 479 * <p>Note: This method may partially sort the input values if not configured to 480 * {@link #withCopy(boolean) copy} the input data. 481 * 482 * @param values Values. 483 * @param p Probabilities for the quantiles to compute. 484 * @return the quantiles 485 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 486 * or no probabilities are specified. 487 */ 488 public double[] evaluate(int[] values, double... p) { 489 return compute(values, 0, values.length, p); 490 } 491 492 /** 493 * Evaluate the {@code p}-th quantiles of the specified range of values.. 494 * 495 * <p>Note: This method may partially sort the input values if not configured to 496 * {@link #withCopy(boolean) copy} the input data. 497 * 498 * @param values Values. 499 * @param from Inclusive start of the range. 500 * @param to Exclusive end of the range. 501 * @param p Probabilities for the quantiles to compute. 502 * @return the quantiles 503 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 504 * or no probabilities are specified. 505 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 506 * @since 1.2 507 */ 508 public double[] evaluateRange(int[] values, int from, int to, double... p) { 509 Statistics.checkFromToIndex(from, to, values.length); 510 return compute(values, from, to, p); 511 } 512 513 /** 514 * Evaluate the {@code p}-th quantiles of the specified range of values.. 515 * 516 * <p>Note: This method may partially sort the input values if not configured to 517 * {@link #withCopy(boolean) copy} the input data. 518 * 519 * @param values Values. 520 * @param from Inclusive start of the range. 521 * @param to Exclusive end of the range. 522 * @param p Probabilities for the quantiles to compute. 523 * @return the quantiles 524 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 525 * or no probabilities are specified. 526 */ 527 private double[] compute(int[] values, int from, int to, double... p) { 528 checkProbabilities(p); 529 final int n = to - from; 530 // Special cases 531 final double[] q = new double[p.length]; 532 if (n <= 1) { 533 Arrays.fill(q, n == 0 ? Double.NaN : values[from]); 534 return q; 535 } 536 537 // Create the range 538 final int[] x; 539 final int start; 540 final int end; 541 if (copy) { 542 x = Statistics.copy(values, from, to); 543 start = 0; 544 end = n; 545 } else { 546 x = values; 547 start = from; 548 end = to; 549 } 550 551 // Collect interpolation positions. We use the output q as storage. 552 final int[] indices = computeIndices(n, p, q, start); 553 554 // Partition 555 Selection.select(x, start, end, indices); 556 557 // Compute 558 for (int k = 0; k < p.length; k++) { 559 // ip in [0, n); i in [start, end) 560 final int ip = (int) q[k]; 561 final int i = start + ip; 562 if (q[k] > ip) { 563 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip); 564 } else { 565 q[k] = x[i]; 566 } 567 } 568 return q; 569 } 570 571 /** 572 * Evaluate the {@code p}-th quantile of the values. 573 * 574 * <p>This method can be used when the values of known size are already sorted. 575 * 576 * <pre>{@code 577 * short[] x = ... 578 * Arrays.sort(x); 579 * double q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.05); 580 * }</pre> 581 * 582 * @param n Size of the values. 583 * @param values Values function. 584 * @param p Probability for the quantile to compute. 585 * @return the quantile 586 * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is 587 * not in the range {@code [0, 1]}. 588 */ 589 public double evaluate(int n, IntToDoubleFunction values, double p) { 590 checkSize(n); 591 checkProbability(p); 592 // Special case 593 if (n <= 1) { 594 return n == 0 ? Double.NaN : values.applyAsDouble(0); 595 } 596 final double pos = estimationType.index(p, n); 597 final int i = (int) pos; 598 final double v1 = values.applyAsDouble(i); 599 if (pos > i) { 600 final double v2 = values.applyAsDouble(i + 1); 601 return Interpolation.interpolate(v1, v2, pos - i); 602 } 603 return v1; 604 } 605 606 /** 607 * Evaluate the {@code p}-th quantiles of the values. 608 * 609 * <p>This method can be used when the values of known size are already sorted. 610 * 611 * <pre>{@code 612 * short[] x = ... 613 * Arrays.sort(x); 614 * double[] q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.25, 0.5, 0.75); 615 * }</pre> 616 * 617 * @param n Size of the values. 618 * @param values Values function. 619 * @param p Probabilities for the quantiles to compute. 620 * @return the quantiles 621 * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is 622 * not in the range {@code [0, 1]}; or no probabilities are specified. 623 */ 624 public double[] evaluate(int n, IntToDoubleFunction values, double... p) { 625 checkSize(n); 626 checkProbabilities(p); 627 // Special case 628 final double[] q = new double[p.length]; 629 if (n <= 1) { 630 Arrays.fill(q, n == 0 ? Double.NaN : values.applyAsDouble(0)); 631 return q; 632 } 633 for (int k = 0; k < p.length; k++) { 634 final double pos = estimationType.index(p[k], n); 635 final int i = (int) pos; 636 final double v1 = values.applyAsDouble(i); 637 if (pos > i) { 638 final double v2 = values.applyAsDouble(i + 1); 639 q[k] = Interpolation.interpolate(v1, v2, pos - i); 640 } else { 641 q[k] = v1; 642 } 643 } 644 return q; 645 } 646 647 /** 648 * Check the probability {@code p} is in the range {@code [0, 1]}. 649 * 650 * @param p Probability for the quantile to compute. 651 * @throws IllegalArgumentException if the probability is not in the range {@code [0, 1]} 652 */ 653 private static void checkProbability(double p) { 654 // Logic negation will detect NaN 655 if (!(p >= 0 && p <= 1)) { 656 throw new IllegalArgumentException(INVALID_PROBABILITY + p); 657 } 658 } 659 660 /** 661 * Check the probabilities {@code p} are in the range {@code [0, 1]}. 662 * 663 * @param p Probabilities for the quantiles to compute. 664 * @throws IllegalArgumentException if any probabilities {@code p} is not in the range {@code [0, 1]}; 665 * or no probabilities are specified. 666 */ 667 private static void checkProbabilities(double... p) { 668 if (p.length == 0) { 669 throw new IllegalArgumentException(NO_PROBABILITIES_SPECIFIED); 670 } 671 for (final double pp : p) { 672 checkProbability(pp); 673 } 674 } 675 676 /** 677 * Check the {@code size} is positive. 678 * 679 * @param n Size of the values. 680 * @throws IllegalArgumentException if {@code size < 0} 681 */ 682 private static void checkSize(int n) { 683 if (n < 0) { 684 throw new IllegalArgumentException(INVALID_SIZE + n); 685 } 686 } 687 688 /** 689 * Check the number of probabilities {@code n} is strictly positive. 690 * 691 * @param n Number of probabilities. 692 * @throws IllegalArgumentException if {@code c < 1} 693 */ 694 private static void checkNumberOfProbabilities(int n) { 695 if (n < 1) { 696 throw new IllegalArgumentException(INVALID_NUMBER_OF_PROBABILITIES + n); 697 } 698 } 699 700 /** 701 * Compute the indices required for quantile interpolation. 702 * 703 * <p>The zero-based interpolation index in {@code [0, n)} is 704 * saved into the working array {@code q} for each {@code p}. 705 * 706 * <p>The indices are incremented by the provided {@code offset} to allow 707 * addressing sub-ranges of a larger array. 708 * 709 * @param n Size of the data. 710 * @param p Probabilities for the quantiles to compute. 711 * @param q Working array for quantiles in {@code [0, n)}. 712 * @param offset Array offset. 713 * @return the indices in {@code [offset, offset + n)} 714 */ 715 private int[] computeIndices(int n, double[] p, double[] q, int offset) { 716 final int[] indices = new int[p.length << 1]; 717 int count = 0; 718 for (int k = 0; k < p.length; k++) { 719 final double pos = estimationType.index(p[k], n); 720 q[k] = pos; 721 final int i = (int) pos; 722 indices[count++] = offset + i; 723 if (pos > i) { 724 // Require the next index for interpolation 725 indices[count++] = offset + i + 1; 726 } 727 } 728 if (count < indices.length) { 729 return Arrays.copyOf(indices, count); 730 } 731 return indices; 732 } 733 734 /** 735 * Estimation methods for a quantile. Provides the nine quantile algorithms 736 * defined in Hyndman and Fan (1996)[1] as {@code HF1 - HF9}. 737 * 738 * <p>Samples quantiles are defined by: 739 * 740 * <p>\[ Q(p) = (1 - \gamma) x_j + \gamma x_{j+1} \] 741 * 742 * <p>where \( \frac{j-m}{n} \leq p \le \frac{j-m+1}{n} \), \( x_j \) is the \( j \)th 743 * order statistic, \( n \) is the sample size, the value of \( \gamma \) is a function 744 * of \( j = \lfloor np+m \rfloor \) and \( g = np + m - j \), and \( m \) is a constant 745 * determined by the sample quantile type. 746 * 747 * <p>Note that the real-valued position \( np + m \) is a 1-based index and 748 * \( j \in [1, n] \). If the real valued position is computed as beyond the lowest or 749 * highest values in the sample, this implementation will return the minimum or maximum 750 * observation respectively. 751 * 752 * <p>Types 1, 2, and 3 are discontinuous functions of \( p \); types 4 to 9 are continuous 753 * functions of \( p \). 754 * 755 * <p>For the continuous functions, the probability \( p_k \) is provided for the \( k \)-th order 756 * statistic in size \( n \). Samples quantiles are equivalently obtained to \( Q(p) \) by 757 * linear interpolation between points \( (p_k, x_k) \) and \( (p_{k+1}, x_{k+1}) \) for 758 * any \( p_k \leq p \leq p_{k+1} \). 759 * 760 * <ol> 761 * <li>Hyndman and Fan (1996) 762 * <i>Sample Quantiles in Statistical Packages.</i> 763 * The American Statistician, 50, 361-365. 764 * <a href="https://www.jstor.org/stable/2684934">doi.org/10.2307/2684934</a> 765 * <li><a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a> 766 * </ol> 767 */ 768 public enum EstimationMethod { 769 /** 770 * Inverse of the empirical distribution function. 771 * 772 * <p>\( m = 0 \). \( \gamma = 0 \) if \( g = 0 \), and 1 otherwise. 773 */ 774 HF1 { 775 @Override 776 double position0(double p, int n) { 777 // position = np + 0. This is 1-based so adjust to 0-based. 778 return Math.ceil(n * p) - 1; 779 } 780 }, 781 /** 782 * Similar to {@link #HF1} with averaging at discontinuities. 783 * 784 * <p>\( m = 0 \). \( \gamma = 0.5 \) if \( g = 0 \), and 1 otherwise. 785 */ 786 HF2 { 787 @Override 788 double position0(double p, int n) { 789 final double pos = n * p; 790 // Average at discontinuities 791 final int j = (int) pos; 792 final double g = pos - j; 793 if (g == 0) { 794 return j - 0.5; 795 } 796 // As HF1 : ceil(j + g) - 1 797 return j; 798 } 799 }, 800 /** 801 * The observation closest to \( np \). Ties are resolved to the nearest even order statistic. 802 * 803 * <p>\( m = -1/2 \). \( \gamma = 0 \) if \( g = 0 \) and \( j \) is even, and 1 otherwise. 804 */ 805 HF3 { 806 @Override 807 double position0(double p, int n) { 808 // Let rint do the work for ties to even 809 return Math.rint(n * p) - 1; 810 } 811 }, 812 /** 813 * Linear interpolation of the inverse of the empirical CDF. 814 * 815 * <p>\( m = 0 \). \( p_k = \frac{k}{n} \). 816 */ 817 HF4 { 818 @Override 819 double position0(double p, int n) { 820 // np + 0 - 1 821 return n * p - 1; 822 } 823 }, 824 /** 825 * A piecewise linear function where the knots are the values midway through the steps of 826 * the empirical CDF. Proposed by Hazen (1914) and popular amongst hydrologists. 827 * 828 * <p>\( m = 1/2 \). \( p_k = \frac{k - 1/2}{n} \). 829 */ 830 HF5 { 831 @Override 832 double position0(double p, int n) { 833 // np + 0.5 - 1 834 return n * p - 0.5; 835 } 836 }, 837 /** 838 * Linear interpolation of the expectations for the order statistics for the uniform 839 * distribution on [0,1]. Proposed by Weibull (1939). 840 * 841 * <p>\( m = p \). \( p_k = \frac{k}{n + 1} \). 842 * 843 * <p>This method computes the quantile as per the Apache Commons Math Percentile 844 * legacy implementation. 845 */ 846 HF6 { 847 @Override 848 double position0(double p, int n) { 849 // np + p - 1 850 return (n + 1) * p - 1; 851 } 852 }, 853 /** 854 * Linear interpolation of the modes for the order statistics for the uniform 855 * distribution on [0,1]. Proposed by Gumbull (1939). 856 * 857 * <p>\( m = 1 - p \). \( p_k = \frac{k - 1}{n - 1} \). 858 */ 859 HF7 { 860 @Override 861 double position0(double p, int n) { 862 // np + 1-p - 1 863 return (n - 1) * p; 864 } 865 }, 866 /** 867 * Linear interpolation of the approximate medians for order statistics. 868 * 869 * <p>\( m = (p + 1)/3 \). \( p_k = \frac{k - 1/3}{n + 1/3} \). 870 * 871 * <p>As per Hyndman and Fan (1996) this approach is most recommended as it provides 872 * an approximate median-unbiased estimate regardless of distribution. 873 */ 874 HF8 { 875 @Override 876 double position0(double p, int n) { 877 return n * p + (p + 1) / 3 - 1; 878 } 879 }, 880 /** 881 * Quantile estimates are approximately unbiased for the expected order statistics if 882 * \( x \) is normally distributed. 883 * 884 * <p>\( m = p/4 + 3/8 \). \( p_k = \frac{k - 3/8}{n + 1/4} \). 885 */ 886 HF9 { 887 @Override 888 double position0(double p, int n) { 889 // np + p/4 + 3/8 - 1 890 return (n + 0.25) * p - 0.625; 891 } 892 }; 893 894 /** 895 * Finds the real-valued position for calculation of the quantile. 896 * 897 * <p>Return {@code i + g} such that the quantile value from sorted data is: 898 * 899 * <p>value = data[i] + g * (data[i+1] - data[i]) 900 * 901 * <p>Warning: Interpolation should not use {@code data[i+1]} unless {@code g != 0}. 902 * 903 * <p>Note: In contrast to the definition of Hyndman and Fan in the class header 904 * which uses a 1-based position, this is a zero based index. This change is for 905 * convenience when addressing array positions. 906 * 907 * @param p p<sup>th</sup> quantile. 908 * @param n Size. 909 * @return a real-valued position (0-based) into the range {@code [0, n)} 910 */ 911 abstract double position0(double p, int n); 912 913 /** 914 * Finds the index {@code i} and fractional part {@code g} of a real-valued position 915 * to interpolate the quantile. 916 * 917 * <p>Return {@code i + g} such that the quantile value from sorted data is: 918 * 919 * <p>value = data[i] + g * (data[i+1] - data[i]) 920 * 921 * <p>Note: Interpolation should not use {@code data[i+1]} unless {@code g != 0}. 922 * 923 * @param p p<sup>th</sup> quantile. 924 * @param n Size. 925 * @return index (in [0, n-1]) 926 */ 927 final double index(double p, int n) { 928 final double pos = position0(p, n); 929 // Bounds check in [0, n-1] 930 if (pos < 0) { 931 return 0; 932 } 933 if (pos > n - 1.0) { 934 return n - 1.0; 935 } 936 return pos; 937 } 938 } 939}