001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.descriptive; 018 019import java.util.Arrays; 020import java.util.Objects; 021import java.util.function.IntToDoubleFunction; 022import java.util.function.IntToLongFunction; 023import org.apache.commons.numbers.arrays.Selection; 024 025/** 026 * Provides quantile computation. 027 * 028 * <p>For values of length {@code n}: 029 * <ul> 030 * <li>The result is {@code NaN} if {@code n = 0}.</li> 031 * <li>The result is {@code values[0]} if {@code n = 1}.</li> 032 * <li>Otherwise the result is computed using the {@link EstimationMethod}.</li> 033 * </ul> 034 * 035 * <p>Computation of multiple quantiles will handle duplicate and unordered 036 * probabilities. Passing ordered probabilities is recommended if the order is already 037 * known as this can improve efficiency; for example using uniform spacing through the 038 * array data, or to identify extreme values from the data such as {@code [0.001, 0.999]}. 039 * 040 * <p>This implementation respects the ordering imposed by 041 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs 042 * in the selected positions in the fully sorted values then the result is {@code NaN}. 043 * 044 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values. 045 * 046 * <p>Instances of this class are immutable and thread-safe. 047 * 048 * <p><strong>Support for {@code long} arrays</strong> 049 * 050 * <p>The result on {@code long} values can be returned as a {@code double} or 051 * a {@code long} using a {@link StatisticResult}. 052 * 053 * <p>The {@code double} result is computed within 1 ULP of the exact result. In some 054 * cases this may be outside the range defined by the minimum and maximum of the input 055 * array following rounding to a 53-bit floating point representation. For example a 056 * quantile of an array containing only {@link Long#MAX_VALUE} as a {@code double} is 057 * 2<sup>63</sup>, which is the closest representation of 2<sup>63</sup> - 1. 058 * 059 * <p>The {@code long} result is returned using the nearest whole number. 060 * In the event of ties the result is rounded towards positive infinity. 061 * This value will always be within the range defined by the minimum and maximum 062 * of the input array. Due to interpolation it may be a value not observed in 063 * the input values. 064 * 065 * <p>Interpolation between two {@code long} values requires extended precision 066 * floating-point arithmetic. This can be avoided using a discontinuous {@link EstimationMethod}. 067 * In this case the {@code long} quantile will be a value observed in the input values. 068 * 069 * <p>If the array length {@code n} is zero the result as a {@code double} is 070 * {@code NaN} and the result as a {@code long} will raise an {@link ArithmeticException}. 071 * 072 * <p>Multiple quantile results required as only one of the primitive values can be converted 073 * to a primitive array using a stream, for example: 074 * 075 * <pre>{@code 076 * long[] values = ... 077 * double[] p = Quantile.probabilities(10); 078 * Quantile q = Quantile.withDefaults(); 079 * long[] result = Arrays.stream(q.evaluate(values, p)) 080 * .mapToLong(StatisticResult::getAsLong) 081 * .toArray(); 082 * }</pre> 083 * 084 * @see #with(NaNPolicy) 085 * @see <a href="https://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a> 086 * @since 1.1 087 */ 088public final class Quantile { 089 /** Message when the probability is not in the range {@code [0, 1]}. */ 090 private static final String INVALID_PROBABILITY = "Invalid probability: "; 091 /** Message when no probabilities are provided for the varargs method. */ 092 private static final String NO_PROBABILITIES_SPECIFIED = "No probabilities specified"; 093 /** Message when the size is not valid. */ 094 private static final String INVALID_SIZE = "Invalid size: "; 095 /** Message when the number of probabilities in a range is not valid. */ 096 private static final String INVALID_NUMBER_OF_PROBABILITIES = "Invalid number of probabilities: "; 097 098 /** Default instance. Method 8 is recommended by Hyndman and Fan. */ 099 private static final Quantile DEFAULT = new Quantile(false, NaNPolicy.INCLUDE, EstimationMethod.HF8); 100 101 /** Flag to indicate if the data should be copied. */ 102 private final boolean copy; 103 /** NaN policy for floating point data. */ 104 private final NaNPolicy nanPolicy; 105 /** Transformer for NaN data. */ 106 private final NaNTransformer nanTransformer; 107 /** Estimation type used to determine the value from the quantile. */ 108 private final EstimationMethod estimationType; 109 110 /** 111 * @param copy Flag to indicate if the data should be copied. 112 * @param nanPolicy NaN policy. 113 * @param estimationType Estimation type used to determine the value from the quantile. 114 */ 115 private Quantile(boolean copy, NaNPolicy nanPolicy, EstimationMethod estimationType) { 116 this.copy = copy; 117 this.nanPolicy = nanPolicy; 118 this.estimationType = estimationType; 119 nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy); 120 } 121 122 /** 123 * Return a new instance with the default options. 124 * 125 * <ul> 126 * <li>{@linkplain #withCopy(boolean) Copy = false}</li> 127 * <li>{@linkplain #with(NaNPolicy) NaN policy = include}</li> 128 * <li>{@linkplain #with(EstimationMethod) Estimation method = HF8}</li> 129 * </ul> 130 * 131 * <p>Note: The default options configure for processing in-place and including 132 * {@code NaN} values in the data. This is the most efficient mode and has the 133 * smallest memory consumption. 134 * 135 * @return the quantile implementation 136 * @see #withCopy(boolean) 137 * @see #with(NaNPolicy) 138 * @see #with(EstimationMethod) 139 */ 140 public static Quantile withDefaults() { 141 return DEFAULT; 142 } 143 144 /** 145 * Return an instance with the configured copy behaviour. If {@code false} then 146 * the input array will be modified by the call to evaluate the quantiles; otherwise 147 * the computation uses a copy of the data. 148 * 149 * @param v Value. 150 * @return an instance 151 */ 152 public Quantile withCopy(boolean v) { 153 return new Quantile(v, nanPolicy, estimationType); 154 } 155 156 /** 157 * Return an instance with the configured {@link NaNPolicy}. 158 * 159 * <p>Note: This implementation respects the ordering imposed by 160 * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is 161 * considered greater than all other values, and all {@code NaN} values are equal. The 162 * {@link NaNPolicy} changes the computation of the statistic in the presence of 163 * {@code NaN} values. 164 * 165 * <ul> 166 * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data; 167 * the size of the data <em>includes</em> the {@code NaN} values and the quantile will be 168 * {@code NaN} if any value used for quantile interpolation is {@code NaN}.</li> 169 * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data; 170 * the size of the data <em>excludes</em> the {@code NaN} values and the quantile will 171 * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero 172 * and the result is {@code NaN}.</li> 173 * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN} 174 * values.</li> 175 * </ul> 176 * 177 * <p>Note that the result is identical for all policies if no {@code NaN} values are present. 178 * 179 * @param v Value. 180 * @return an instance 181 */ 182 public Quantile with(NaNPolicy v) { 183 return new Quantile(copy, Objects.requireNonNull(v), estimationType); 184 } 185 186 /** 187 * Return an instance with the configured {@link EstimationMethod}. 188 * 189 * @param v Value. 190 * @return an instance 191 */ 192 public Quantile with(EstimationMethod v) { 193 return new Quantile(copy, nanPolicy, Objects.requireNonNull(v)); 194 } 195 196 /** 197 * Generate {@code n} evenly spaced probabilities in the range {@code [0, 1]}. 198 * 199 * <pre> 200 * 1/(n + 1), 2/(n + 1), ..., n/(n + 1) 201 * </pre> 202 * 203 * @param n Number of probabilities. 204 * @return the probabilities 205 * @throws IllegalArgumentException if {@code n < 1} 206 */ 207 public static double[] probabilities(int n) { 208 checkNumberOfProbabilities(n); 209 final double c1 = n + 1.0; 210 final double[] p = new double[n]; 211 for (int i = 0; i < n; i++) { 212 p[i] = (i + 1.0) / c1; 213 } 214 return p; 215 } 216 217 /** 218 * Generate {@code n} evenly spaced probabilities in the range {@code [p1, p2]}. 219 * 220 * <pre> 221 * w = p2 - p1 222 * p1 + w/(n + 1), p1 + 2w/(n + 1), ..., p1 + nw/(n + 1) 223 * </pre> 224 * 225 * @param n Number of probabilities. 226 * @param p1 Lower probability. 227 * @param p2 Upper probability. 228 * @return the probabilities 229 * @throws IllegalArgumentException if {@code n < 1}; if the probabilities are not in the 230 * range {@code [0, 1]}; or {@code p2 <= p1}. 231 */ 232 public static double[] probabilities(int n, double p1, double p2) { 233 checkProbability(p1); 234 checkProbability(p2); 235 if (p2 <= p1) { 236 throw new IllegalArgumentException("Invalid range: [" + p1 + ", " + p2 + "]"); 237 } 238 final double[] p = probabilities(n); 239 for (int i = 0; i < n; i++) { 240 p[i] = (1 - p[i]) * p1 + p[i] * p2; 241 } 242 return p; 243 } 244 245 /** 246 * Evaluate the {@code p}-th quantile of the values. 247 * 248 * <p>Note: This method may partially sort the input values if not configured to 249 * {@link #withCopy(boolean) copy} the input data. 250 * 251 * <p><strong>Performance</strong> 252 * 253 * <p>It is not recommended to use this method for repeat calls for different quantiles 254 * within the same values. The {@link #evaluate(double[], double...)} method should be used 255 * which provides better performance. 256 * 257 * @param values Values. 258 * @param p Probability for the quantile to compute. 259 * @return the quantile 260 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}; 261 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 262 * @see #evaluate(double[], double...) 263 * @see #with(NaNPolicy) 264 */ 265 public double evaluate(double[] values, double p) { 266 return compute(values, 0, values.length, p); 267 } 268 269 /** 270 * Evaluate the {@code p}-th quantile of the specified range of values. 271 * 272 * <p>Note: This method may partially sort the input values if not configured to 273 * {@link #withCopy(boolean) copy} the input data. 274 * 275 * <p><strong>Performance</strong> 276 * 277 * <p>It is not recommended to use this method for repeat calls for different quantiles 278 * within the same values. The {@link #evaluateRange(double[], int, int, double...)} method should be used 279 * which provides better performance. 280 * 281 * @param values Values. 282 * @param from Inclusive start of the range. 283 * @param to Exclusive end of the range. 284 * @param p Probability for the quantile to compute. 285 * @return the quantile 286 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}; 287 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 288 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 289 * @see #evaluateRange(double[], int, int, double...) 290 * @see #with(NaNPolicy) 291 * @since 1.2 292 */ 293 public double evaluateRange(double[] values, int from, int to, double p) { 294 Statistics.checkFromToIndex(from, to, values.length); 295 return compute(values, from, to, p); 296 } 297 298 /** 299 * Compute the {@code p}-th quantile of the specified range of values. 300 * 301 * @param values Values. 302 * @param from Inclusive start of the range. 303 * @param to Exclusive end of the range. 304 * @param p Probability for the quantile to compute. 305 * @return the quantile 306 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 307 */ 308 private double compute(double[] values, int from, int to, double p) { 309 checkProbability(p); 310 // Floating-point data handling 311 final int[] bounds = new int[2]; 312 final double[] x = nanTransformer.apply(values, from, to, bounds); 313 final int start = bounds[0]; 314 final int end = bounds[1]; 315 final int n = end - start; 316 // Special cases 317 if (n <= 1) { 318 return n == 0 ? Double.NaN : x[start]; 319 } 320 321 final double pos = estimationType.index(p, n); 322 final int ip = (int) pos; 323 final int i = start + ip; 324 325 // Partition and compute 326 if (pos > ip) { 327 Selection.select(x, start, end, new int[] {i, i + 1}); 328 return Interpolation.interpolate(x[i], x[i + 1], pos - ip); 329 } 330 Selection.select(x, start, end, i); 331 return x[i]; 332 } 333 334 /** 335 * Evaluate the {@code p}-th quantiles of the values. 336 * 337 * <p>Note: This method may partially sort the input values if not configured to 338 * {@link #withCopy(boolean) copy} the input data. 339 * 340 * @param values Values. 341 * @param p Probabilities for the quantiles to compute. 342 * @return the quantiles 343 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 344 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 345 * @see #with(NaNPolicy) 346 */ 347 public double[] evaluate(double[] values, double... p) { 348 return compute(values, 0, values.length, p); 349 } 350 351 /** 352 * Evaluate the {@code p}-th quantiles of the specified range of values. 353 * 354 * <p>Note: This method may partially sort the input values if not configured to 355 * {@link #withCopy(boolean) copy} the input data. 356 * 357 * @param values Values. 358 * @param from Inclusive start of the range. 359 * @param to Exclusive end of the range. 360 * @param p Probabilities for the quantiles to compute. 361 * @return the quantiles 362 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 363 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR} 364 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 365 * @see #with(NaNPolicy) 366 * @since 1.2 367 */ 368 public double[] evaluateRange(double[] values, int from, int to, double... p) { 369 Statistics.checkFromToIndex(from, to, values.length); 370 return compute(values, from, to, p); 371 } 372 373 /** 374 * Compute the {@code p}-th quantiles of the specified range of values. 375 * 376 * @param values Values. 377 * @param from Inclusive start of the range. 378 * @param to Exclusive end of the range. 379 * @param p Probabilities for the quantiles to compute. 380 * @return the quantiles 381 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 382 * or no probabilities are specified. 383 */ 384 private double[] compute(double[] values, int from, int to, double... p) { 385 checkProbabilities(p); 386 // Floating-point data handling 387 final int[] bounds = new int[2]; 388 final double[] x = nanTransformer.apply(values, from, to, bounds); 389 final int start = bounds[0]; 390 final int end = bounds[1]; 391 final int n = end - start; 392 // Special cases 393 final double[] q = new double[p.length]; 394 if (n <= 1) { 395 Arrays.fill(q, n == 0 ? Double.NaN : x[start]); 396 return q; 397 } 398 399 // Collect interpolation positions. We use the output q as storage. 400 final int[] indices = computeIndices(n, p, q, start); 401 402 // Partition 403 Selection.select(x, start, end, indices); 404 405 // Compute 406 for (int k = 0; k < p.length; k++) { 407 // ip in [0, n); i in [start, end) 408 final int ip = (int) q[k]; 409 final int i = start + ip; 410 if (q[k] > ip) { 411 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip); 412 } else { 413 q[k] = x[i]; 414 } 415 } 416 return q; 417 } 418 419 /** 420 * Evaluate the {@code p}-th quantile of the values. 421 * 422 * <p>Note: This method may partially sort the input values if not configured to 423 * {@link #withCopy(boolean) copy} the input data. 424 * 425 * <p><strong>Performance</strong> 426 * 427 * <p>It is not recommended to use this method for repeat calls for different quantiles 428 * within the same values. The {@link #evaluate(int[], double...)} method should be used 429 * which provides better performance. 430 * 431 * @param values Values. 432 * @param p Probability for the quantile to compute. 433 * @return the quantile 434 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 435 * @see #evaluate(int[], double...) 436 */ 437 public double evaluate(int[] values, double p) { 438 return compute(values, 0, values.length, p); 439 } 440 441 /** 442 * Evaluate the {@code p}-th quantile of the specified range of values. 443 * 444 * <p>Note: This method may partially sort the input values if not configured to 445 * {@link #withCopy(boolean) copy} the input data. 446 * 447 * <p><strong>Performance</strong> 448 * 449 * <p>It is not recommended to use this method for repeat calls for different quantiles 450 * within the same values. The {@link #evaluateRange(int[], int, int, double...)} method should be used 451 * which provides better performance. 452 * 453 * @param values Values. 454 * @param from Inclusive start of the range. 455 * @param to Exclusive end of the range. 456 * @param p Probability for the quantile to compute. 457 * @return the quantile 458 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 459 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 460 * @see #evaluateRange(int[], int, int, double...) 461 * @since 1.2 462 */ 463 public double evaluateRange(int[] values, int from, int to, double p) { 464 Statistics.checkFromToIndex(from, to, values.length); 465 return compute(values, from, to, p); 466 } 467 468 /** 469 * Compute the {@code p}-th quantile of the specified range of values. 470 * 471 * @param values Values. 472 * @param from Inclusive start of the range. 473 * @param to Exclusive end of the range. 474 * @param p Probability for the quantile to compute. 475 * @return the quantile 476 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 477 */ 478 private double compute(int[] values, int from, int to, double p) { 479 checkProbability(p); 480 final int n = to - from; 481 // Special cases 482 if (n <= 1) { 483 return n == 0 ? Double.NaN : values[from]; 484 } 485 486 // Create the range 487 final int[] x; 488 final int start; 489 final int end; 490 if (copy) { 491 x = Statistics.copy(values, from, to); 492 start = 0; 493 end = n; 494 } else { 495 x = values; 496 start = from; 497 end = to; 498 } 499 500 final double pos = estimationType.index(p, n); 501 final int ip = (int) pos; 502 final int i = start + ip; 503 504 // Partition and compute 505 if (pos > ip) { 506 Selection.select(x, start, end, new int[] {i, i + 1}); 507 return Interpolation.interpolate((double) x[i], (double) x[i + 1], pos - ip); 508 } 509 Selection.select(x, start, end, i); 510 return x[i]; 511 } 512 513 /** 514 * Evaluate the {@code p}-th quantiles of the values. 515 * 516 * <p>Note: This method may partially sort the input values if not configured to 517 * {@link #withCopy(boolean) copy} the input data. 518 * 519 * @param values Values. 520 * @param p Probabilities for the quantiles to compute. 521 * @return the quantiles 522 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 523 * or no probabilities are specified. 524 */ 525 public double[] evaluate(int[] values, double... p) { 526 return compute(values, 0, values.length, p); 527 } 528 529 /** 530 * Evaluate the {@code p}-th quantiles of the specified range of values.. 531 * 532 * <p>Note: This method may partially sort the input values if not configured to 533 * {@link #withCopy(boolean) copy} the input data. 534 * 535 * @param values Values. 536 * @param from Inclusive start of the range. 537 * @param to Exclusive end of the range. 538 * @param p Probabilities for the quantiles to compute. 539 * @return the quantiles 540 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 541 * or no probabilities are specified. 542 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 543 * @since 1.2 544 */ 545 public double[] evaluateRange(int[] values, int from, int to, double... p) { 546 Statistics.checkFromToIndex(from, to, values.length); 547 return compute(values, from, to, p); 548 } 549 550 /** 551 * Evaluate the {@code p}-th quantiles of the specified range of values.. 552 * 553 * <p>Note: This method may partially sort the input values if not configured to 554 * {@link #withCopy(boolean) copy} the input data. 555 * 556 * @param values Values. 557 * @param from Inclusive start of the range. 558 * @param to Exclusive end of the range. 559 * @param p Probabilities for the quantiles to compute. 560 * @return the quantiles 561 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 562 * or no probabilities are specified. 563 */ 564 private double[] compute(int[] values, int from, int to, double... p) { 565 checkProbabilities(p); 566 final int n = to - from; 567 // Special cases 568 final double[] q = new double[p.length]; 569 if (n <= 1) { 570 Arrays.fill(q, n == 0 ? Double.NaN : values[from]); 571 return q; 572 } 573 574 // Create the range 575 final int[] x; 576 final int start; 577 final int end; 578 if (copy) { 579 x = Statistics.copy(values, from, to); 580 start = 0; 581 end = n; 582 } else { 583 x = values; 584 start = from; 585 end = to; 586 } 587 588 // Collect interpolation positions. We use the output q as storage. 589 final int[] indices = computeIndices(n, p, q, start); 590 591 // Partition 592 Selection.select(x, start, end, indices); 593 594 // Compute 595 for (int k = 0; k < p.length; k++) { 596 // ip in [0, n); i in [start, end) 597 final int ip = (int) q[k]; 598 final int i = start + ip; 599 if (q[k] > ip) { 600 q[k] = Interpolation.interpolate((double) x[i], (double) x[i + 1], q[k] - ip); 601 } else { 602 q[k] = x[i]; 603 } 604 } 605 return q; 606 } 607 608 /** 609 * Evaluate the {@code p}-th quantile of the values. 610 * 611 * <p>Note: This method may partially sort the input values if not configured to 612 * {@link #withCopy(boolean) copy} the input data. 613 * 614 * <p><strong>Performance</strong> 615 * 616 * <p>It is not recommended to use this method for repeat calls for different 617 * quantiles within the same values. The {@link #evaluate(long[], double...)} method 618 * should be used which provides better performance. 619 * 620 * @param values Values. 621 * @param p Probability for the quantile to compute. 622 * @return the quantile 623 * @throws IllegalArgumentException if the probability {@code p} is not in the range 624 * {@code [0, 1]} 625 * @see #evaluate(long[], double...) 626 * @since 1.3 627 */ 628 public StatisticResult evaluate(long[] values, double p) { 629 return compute(values, 0, values.length, p); 630 } 631 632 /** 633 * Evaluate the {@code p}-th quantile of the specified range of values. 634 * 635 * <p>Note: This method may partially sort the input values if not configured to 636 * {@link #withCopy(boolean) copy} the input data. 637 * 638 * <p><strong>Performance</strong> 639 * 640 * <p>It is not recommended to use this method for repeat calls for different quantiles 641 * within the same values. The {@link #evaluateRange(long[], int, int, double...)} method should be used 642 * which provides better performance. 643 * 644 * @param values Values. 645 * @param from Inclusive start of the range. 646 * @param to Exclusive end of the range. 647 * @param p Probability for the quantile to compute. 648 * @return the quantile 649 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 650 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 651 * @see #evaluateRange(long[], int, int, double...) 652 * @since 1.3 653 */ 654 public StatisticResult evaluateRange(long[] values, int from, int to, double p) { 655 Statistics.checkFromToIndex(from, to, values.length); 656 return compute(values, from, to, p); 657 } 658 659 /** 660 * Compute the {@code p}-th quantile of the specified range of values. 661 * 662 * @param values Values. 663 * @param from Inclusive start of the range. 664 * @param to Exclusive end of the range. 665 * @param p Probability for the quantile to compute. 666 * @return the quantile 667 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]} 668 * @since 1.3 669 */ 670 private StatisticResult compute(long[] values, int from, int to, double p) { 671 checkProbability(p); 672 final int n = to - from; 673 // Special cases 674 if (n <= 1) { 675 return n == 0 ? 676 () -> Double.NaN : 677 Statistics.createStatisticResult(values[from]); 678 } 679 680 // Create the range 681 final long[] x; 682 final int start; 683 final int end; 684 if (copy) { 685 x = Statistics.copy(values, from, to); 686 start = 0; 687 end = n; 688 } else { 689 x = values; 690 start = from; 691 end = to; 692 } 693 694 final double pos = estimationType.index(p, n); 695 final int ip = (int) pos; 696 final int i = start + ip; 697 698 // Partition and compute 699 if (pos > ip) { 700 Selection.select(x, start, end, new int[] {i, i + 1}); 701 return Interpolation.interpolate(x[i], x[i + 1], pos - ip); 702 } 703 Selection.select(x, start, end, i); 704 return Statistics.createStatisticResult(x[i]); 705 } 706 707 /** 708 * Evaluate the {@code p}-th quantiles of the values. 709 * 710 * <p>Note: This method may partially sort the input values if not configured to 711 * {@link #withCopy(boolean) copy} the input data. 712 * 713 * @param values Values. 714 * @param p Probabilities for the quantiles to compute. 715 * @return the quantiles 716 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 717 * or no probabilities are specified. 718 * @since 1.3 719 */ 720 public StatisticResult[] evaluate(long[] values, double... p) { 721 return compute(values, 0, values.length, p); 722 } 723 724 /** 725 * Evaluate the {@code p}-th quantiles of the specified range of values.. 726 * 727 * <p>Note: This method may partially sort the input values if not configured to 728 * {@link #withCopy(boolean) copy} the input data. 729 * 730 * @param values Values. 731 * @param from Inclusive start of the range. 732 * @param to Exclusive end of the range. 733 * @param p Probabilities for the quantiles to compute. 734 * @return the quantiles 735 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 736 * or no probabilities are specified. 737 * @throws IndexOutOfBoundsException if the sub-range is out of bounds 738 * @since 1.3 739 */ 740 public StatisticResult[] evaluateRange(long[] values, int from, int to, double... p) { 741 Statistics.checkFromToIndex(from, to, values.length); 742 return compute(values, from, to, p); 743 } 744 745 /** 746 * Evaluate the {@code p}-th quantiles of the specified range of values.. 747 * 748 * <p>Note: This method may partially sort the input values if not configured to 749 * {@link #withCopy(boolean) copy} the input data. 750 * 751 * @param values Values. 752 * @param from Inclusive start of the range. 753 * @param to Exclusive end of the range. 754 * @param p Probabilities for the quantiles to compute. 755 * @return the quantiles 756 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]}; 757 * or no probabilities are specified. 758 */ 759 private StatisticResult[] compute(long[] values, int from, int to, double... p) { 760 checkProbabilities(p); 761 final int n = to - from; 762 // Special cases 763 final StatisticResult[] result = new StatisticResult[p.length]; 764 if (n <= 1) { 765 final StatisticResult r = n == 0 ? 766 () -> Double.NaN : 767 Statistics.createStatisticResult(values[from]); 768 Arrays.fill(result, r); 769 return result; 770 } 771 772 // Create the range 773 final long[] x; 774 final int start; 775 final int end; 776 if (copy) { 777 x = Statistics.copy(values, from, to); 778 start = 0; 779 end = n; 780 } else { 781 x = values; 782 start = from; 783 end = to; 784 } 785 786 // Collect interpolation positions 787 final double[] q = new double[p.length]; 788 final int[] indices = computeIndices(n, p, q, start); 789 790 // Partition 791 Selection.select(x, start, end, indices); 792 793 // Compute 794 for (int k = 0; k < p.length; k++) { 795 // ip in [0, n); i in [start, end) 796 final int ip = (int) q[k]; 797 final int i = start + ip; 798 if (q[k] > ip) { 799 result[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip); 800 } else { 801 result[k] = Statistics.createStatisticResult(x[i]); 802 } 803 } 804 return result; 805 } 806 807 /** 808 * Evaluate the {@code p}-th quantile of the sorted values provided as a {@code double}. 809 * 810 * <p>This method can be used when the values of known size are already sorted. 811 * It can be used for primitive types not supported by other evaluation methods. 812 * Numeric types {@code byte}, {@code short} and {@code float} can be converted to 813 * type {@code double} without loss of precision. 814 * 815 * <pre>{@code 816 * short[] x = ... 817 * Arrays.sort(x); 818 * double q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.05); 819 * }</pre> 820 * 821 * <p>If the sorted array is a {@code long} datatype this method can lose information 822 * about the precision of the quantiles due to primitive type conversion. Use 823 * the method {@link #evaluateAsLong(int, IntToLongFunction, double)} to compute 824 * the {@code long} quantile result. 825 * 826 * @param n Size of the values. 827 * @param values Values function. 828 * @param p Probability for the quantile to compute. 829 * @return the quantile 830 * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is 831 * not in the range {@code [0, 1]}. 832 * @see #evaluateAsLong(int, IntToLongFunction, double) 833 */ 834 public double evaluate(int n, IntToDoubleFunction values, double p) { 835 checkSize(n); 836 checkProbability(p); 837 // Special case 838 if (n <= 1) { 839 return n == 0 ? Double.NaN : values.applyAsDouble(0); 840 } 841 final double pos = estimationType.index(p, n); 842 final int i = (int) pos; 843 final double v1 = values.applyAsDouble(i); 844 if (pos > i) { 845 final double v2 = values.applyAsDouble(i + 1); 846 return Interpolation.interpolate(v1, v2, pos - i); 847 } 848 return v1; 849 } 850 851 /** 852 * Evaluate the {@code p}-th quantiles of the sorted values provided as a {@code double}. 853 * 854 * <p>This method can be used when the values of known size are already sorted. 855 * It can be used for primitive types not supported by other evaluation methods. 856 * Numeric types {@code byte}, {@code short} and {@code float} can be converted to 857 * type {@code double} without loss of precision. 858 * 859 * <pre>{@code 860 * short[] x = ... 861 * Arrays.sort(x); 862 * double[] q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.25, 0.5, 0.75); 863 * }</pre> 864 * 865 * <p>If the sorted array is a {@code long} datatype this method can lose information 866 * about the precision of the quantiles due to primitive type conversion. Use 867 * the method {@link #evaluateAsLong(int, IntToLongFunction, double...)} to compute 868 * the {@code long} quantile result. 869 * 870 * @param n Size of the values. 871 * @param values Values function. 872 * @param p Probabilities for the quantiles to compute. 873 * @return the quantiles 874 * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is 875 * not in the range {@code [0, 1]}; or no probabilities are specified. 876 * @see #evaluateAsLong(int, IntToLongFunction, double...) 877 */ 878 public double[] evaluate(int n, IntToDoubleFunction values, double... p) { 879 checkSize(n); 880 checkProbabilities(p); 881 // Special case 882 final double[] q = new double[p.length]; 883 if (n <= 1) { 884 Arrays.fill(q, n == 0 ? Double.NaN : values.applyAsDouble(0)); 885 return q; 886 } 887 for (int k = 0; k < p.length; k++) { 888 final double pos = estimationType.index(p[k], n); 889 final int i = (int) pos; 890 final double v1 = values.applyAsDouble(i); 891 if (pos > i) { 892 final double v2 = values.applyAsDouble(i + 1); 893 q[k] = Interpolation.interpolate(v1, v2, pos - i); 894 } else { 895 q[k] = v1; 896 } 897 } 898 return q; 899 } 900 901 /** 902 * Evaluate the {@code p}-th quantile of the sorted values provided as a {@code long}. 903 * 904 * <p>This method can be used when the values of known size are already sorted. 905 * 906 * <pre>{@code 907 * long[] x = ... 908 * Arrays.sort(x); 909 * StatisticResult q = Quantile.withDefaults() 910 * .evaluateAsLong(x.length, i -> x[i], 0.05); 911 * }</pre> 912 * 913 * <p>Note: It is not recommended to sort data for use only in the quantile computation. 914 * The {@link #evaluate(long[], double)} method will partially sort the data as required 915 * and in most cases will be more efficient. 916 * 917 * @param n Size of the values. 918 * @param values Values function. 919 * @param p Probability for the quantile to compute. 920 * @return the quantile 921 * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is 922 * not in the range {@code [0, 1]}. 923 * @since 1.3 924 */ 925 public StatisticResult evaluateAsLong(int n, IntToLongFunction values, double p) { 926 checkSize(n); 927 checkProbability(p); 928 // Special case 929 if (n <= 1) { 930 return n == 0 ? 931 () -> Double.NaN : 932 Statistics.createStatisticResult(values.applyAsLong(0)); 933 } 934 final double pos = estimationType.index(p, n); 935 final int i = (int) pos; 936 final long v1 = values.applyAsLong(i); 937 if (pos > i) { 938 final long v2 = values.applyAsLong(i + 1); 939 return Interpolation.interpolate(v1, v2, pos - i); 940 } 941 return Statistics.createStatisticResult(v1); 942 } 943 944 /** 945 * Evaluate the {@code p}-th quantiles of the sorted values provided as a {@code long}. 946 * 947 * <p>This method can be used when the values of known size are already sorted. 948 * 949 * <pre>{@code 950 * long[] x = ... 951 * Arrays.sort(x); 952 * StatisticResult[] q = Quantile.withDefaults() 953 * .evaluateAsLong(x.length, i -> x[i], 0.25, 0.5, 0.75); 954 * }</pre> 955 * 956 * <p>Note: It is not recommended to sort data for use only in the quantile computation. 957 * The {@link #evaluate(long[], double...)} method will partially sort the data as required 958 * and in most cases will be more efficient. 959 * 960 * @param n Size of the values. 961 * @param values Values function. 962 * @param p Probabilities for the quantiles to compute. 963 * @return the quantiles 964 * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is 965 * not in the range {@code [0, 1]}; or no probabilities are specified. 966 * @since 1.3 967 */ 968 public StatisticResult[] evaluateAsLong(int n, IntToLongFunction values, double... p) { 969 checkSize(n); 970 checkProbabilities(p); 971 // Special case 972 final StatisticResult[] result = new StatisticResult[p.length]; 973 if (n <= 1) { 974 final StatisticResult r = n == 0 ? 975 () -> Double.NaN : 976 Statistics.createStatisticResult(values.applyAsLong(0)); 977 Arrays.fill(result, r); 978 return result; 979 } 980 for (int k = 0; k < p.length; k++) { 981 final double pos = estimationType.index(p[k], n); 982 final int i = (int) pos; 983 final long v1 = values.applyAsLong(i); 984 if (pos > i) { 985 final long v2 = values.applyAsLong(i + 1); 986 result[k] = Interpolation.interpolate(v1, v2, pos - i); 987 } else { 988 result[k] = Statistics.createStatisticResult(v1); 989 } 990 } 991 return result; 992 } 993 994 /** 995 * Check the probability {@code p} is in the range {@code [0, 1]}. 996 * 997 * @param p Probability for the quantile to compute. 998 * @throws IllegalArgumentException if the probability is not in the range {@code [0, 1]} 999 */ 1000 private static void checkProbability(double p) { 1001 // Logic negation will detect NaN 1002 if (!(p >= 0 && p <= 1)) { 1003 throw new IllegalArgumentException(INVALID_PROBABILITY + p); 1004 } 1005 } 1006 1007 /** 1008 * Check the probabilities {@code p} are in the range {@code [0, 1]}. 1009 * 1010 * @param p Probabilities for the quantiles to compute. 1011 * @throws IllegalArgumentException if any probabilities {@code p} is not in the range {@code [0, 1]}; 1012 * or no probabilities are specified. 1013 */ 1014 private static void checkProbabilities(double... p) { 1015 if (p.length == 0) { 1016 throw new IllegalArgumentException(NO_PROBABILITIES_SPECIFIED); 1017 } 1018 for (final double pp : p) { 1019 checkProbability(pp); 1020 } 1021 } 1022 1023 /** 1024 * Check the {@code size} is positive. 1025 * 1026 * @param n Size of the values. 1027 * @throws IllegalArgumentException if {@code size < 0} 1028 */ 1029 private static void checkSize(int n) { 1030 if (n < 0) { 1031 throw new IllegalArgumentException(INVALID_SIZE + n); 1032 } 1033 } 1034 1035 /** 1036 * Check the number of probabilities {@code n} is strictly positive. 1037 * 1038 * @param n Number of probabilities. 1039 * @throws IllegalArgumentException if {@code c < 1} 1040 */ 1041 private static void checkNumberOfProbabilities(int n) { 1042 if (n < 1) { 1043 throw new IllegalArgumentException(INVALID_NUMBER_OF_PROBABILITIES + n); 1044 } 1045 } 1046 1047 /** 1048 * Compute the indices required for quantile interpolation. 1049 * 1050 * <p>The zero-based interpolation index in {@code [0, n)} is 1051 * saved into the working array {@code q} for each {@code p}. 1052 * 1053 * <p>The indices are incremented by the provided {@code offset} to allow 1054 * addressing sub-ranges of a larger array. 1055 * 1056 * @param n Size of the data. 1057 * @param p Probabilities for the quantiles to compute. 1058 * @param q Working array for quantiles in {@code [0, n)}. 1059 * @param offset Array offset. 1060 * @return the indices in {@code [offset, offset + n)} 1061 */ 1062 private int[] computeIndices(int n, double[] p, double[] q, int offset) { 1063 final int[] indices = new int[p.length << 1]; 1064 int count = 0; 1065 for (int k = 0; k < p.length; k++) { 1066 final double pos = estimationType.index(p[k], n); 1067 q[k] = pos; 1068 final int i = (int) pos; 1069 indices[count++] = offset + i; 1070 if (pos > i) { 1071 // Require the next index for interpolation 1072 indices[count++] = offset + i + 1; 1073 } 1074 } 1075 if (count < indices.length) { 1076 return Arrays.copyOf(indices, count); 1077 } 1078 return indices; 1079 } 1080 1081 /** 1082 * Enumerates estimation methods for a quantile. Provides the nine quantile algorithms 1083 * defined in Hyndman and Fan (1996)[1] as {@code HF1 - HF9}. 1084 * 1085 * <p>Samples quantiles are defined by: 1086 * 1087 * <p>\[ Q(p) = (1 - \gamma) x_j + \gamma x_{j+1} \] 1088 * 1089 * <p>where \( \frac{j-m}{n} \leq p \le \frac{j-m+1}{n} \), \( x_j \) is the \( j \)th 1090 * order statistic, \( n \) is the sample size, the value of \( \gamma \) is a function 1091 * of \( j = \lfloor np+m \rfloor \) and \( g = np + m - j \), and \( m \) is a constant 1092 * determined by the sample quantile type. 1093 * 1094 * <p>Note that the real-valued position \( np + m \) is a 1-based index and 1095 * \( j \in [1, n] \). If the real valued position is computed as beyond the lowest or 1096 * highest values in the sample, this implementation will return the minimum or maximum 1097 * observation respectively. 1098 * 1099 * <p>Types 1, 2, and 3 are discontinuous functions of \( p \); types 4 to 9 are continuous 1100 * functions of \( p \). 1101 * 1102 * <p>For the continuous functions, the probability \( p_k \) is provided for the \( k \)-th order 1103 * statistic in size \( n \). Samples quantiles are equivalently obtained to \( Q(p) \) by 1104 * linear interpolation between points \( (p_k, x_k) \) and \( (p_{k+1}, x_{k+1}) \) for 1105 * any \( p_k \leq p \leq p_{k+1} \). 1106 * 1107 * <ol> 1108 * <li>Hyndman and Fan (1996) 1109 * <i>Sample Quantiles in Statistical Packages.</i> 1110 * The American Statistician, 50, 361-365. 1111 * <a href="https://www.jstor.org/stable/2684934">doi.org/10.2307/2684934</a></li> 1112 * <li><a href="https://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a></li> 1113 * </ol> 1114 */ 1115 public enum EstimationMethod { 1116 /** 1117 * Inverse of the empirical distribution function. 1118 * 1119 * <p>\( m = 0 \). \( \gamma = 0 \) if \( g = 0 \), and 1 otherwise. 1120 */ 1121 HF1 { 1122 @Override 1123 double position0(double p, int n) { 1124 // position = np + 0. This is 1-based so adjust to 0-based. 1125 return Math.ceil(n * p) - 1; 1126 } 1127 }, 1128 /** 1129 * Similar to {@link #HF1} with averaging at discontinuities. 1130 * 1131 * <p>\( m = 0 \). \( \gamma = 0.5 \) if \( g = 0 \), and 1 otherwise. 1132 */ 1133 HF2 { 1134 @Override 1135 double position0(double p, int n) { 1136 final double pos = n * p; 1137 // Average at discontinuities 1138 final int j = (int) pos; 1139 final double g = pos - j; 1140 if (g == 0) { 1141 return j - 0.5; 1142 } 1143 // As HF1 : ceil(j + g) - 1 1144 return j; 1145 } 1146 }, 1147 /** 1148 * The observation closest to \( np \). Ties are resolved to the nearest even order statistic. 1149 * 1150 * <p>\( m = -1/2 \). \( \gamma = 0 \) if \( g = 0 \) and \( j \) is even, and 1 otherwise. 1151 */ 1152 HF3 { 1153 @Override 1154 double position0(double p, int n) { 1155 // Let rint do the work for ties to even 1156 return Math.rint(n * p) - 1; 1157 } 1158 }, 1159 /** 1160 * Linear interpolation of the inverse of the empirical CDF. 1161 * 1162 * <p>\( m = 0 \). \( p_k = \frac{k}{n} \). 1163 */ 1164 HF4 { 1165 @Override 1166 double position0(double p, int n) { 1167 // np + 0 - 1 1168 return n * p - 1; 1169 } 1170 }, 1171 /** 1172 * A piecewise linear function where the knots are the values midway through the steps of 1173 * the empirical CDF. Proposed by Hazen (1914) and popular amongst hydrologists. 1174 * 1175 * <p>\( m = 1/2 \). \( p_k = \frac{k - 1/2}{n} \). 1176 */ 1177 HF5 { 1178 @Override 1179 double position0(double p, int n) { 1180 // np + 0.5 - 1 1181 return n * p - 0.5; 1182 } 1183 }, 1184 /** 1185 * Linear interpolation of the expectations for the order statistics for the uniform 1186 * distribution on [0,1]. Proposed by Weibull (1939). 1187 * 1188 * <p>\( m = p \). \( p_k = \frac{k}{n + 1} \). 1189 * 1190 * <p>This method computes the quantile as per the Apache Commons Math Percentile 1191 * legacy implementation. 1192 */ 1193 HF6 { 1194 @Override 1195 double position0(double p, int n) { 1196 // np + p - 1 1197 return (n + 1) * p - 1; 1198 } 1199 }, 1200 /** 1201 * Linear interpolation of the modes for the order statistics for the uniform 1202 * distribution on [0,1]. Proposed by Gumbull (1939). 1203 * 1204 * <p>\( m = 1 - p \). \( p_k = \frac{k - 1}{n - 1} \). 1205 */ 1206 HF7 { 1207 @Override 1208 double position0(double p, int n) { 1209 // np + 1-p - 1 1210 return (n - 1) * p; 1211 } 1212 }, 1213 /** 1214 * Linear interpolation of the approximate medians for order statistics. 1215 * 1216 * <p>\( m = (p + 1)/3 \). \( p_k = \frac{k - 1/3}{n + 1/3} \). 1217 * 1218 * <p>As per Hyndman and Fan (1996) this approach is most recommended as it provides 1219 * an approximate median-unbiased estimate regardless of distribution. 1220 */ 1221 HF8 { 1222 @Override 1223 double position0(double p, int n) { 1224 return n * p + (p + 1) / 3 - 1; 1225 } 1226 }, 1227 /** 1228 * Quantile estimates are approximately unbiased for the expected order statistics if 1229 * \( x \) is normally distributed. 1230 * 1231 * <p>\( m = p/4 + 3/8 \). \( p_k = \frac{k - 3/8}{n + 1/4} \). 1232 */ 1233 HF9 { 1234 @Override 1235 double position0(double p, int n) { 1236 // np + p/4 + 3/8 - 1 1237 return (n + 0.25) * p - 0.625; 1238 } 1239 }; 1240 1241 /** 1242 * Finds the real-valued position for calculation of the quantile. 1243 * 1244 * <p>Return {@code i + g} such that the quantile value from sorted data is: 1245 * 1246 * <p>value = data[i] + g * (data[i+1] - data[i]) 1247 * 1248 * <p>Warning: Interpolation should not use {@code data[i+1]} unless {@code g != 0}. 1249 * 1250 * <p>Note: In contrast to the definition of Hyndman and Fan in the class header 1251 * which uses a 1-based position, this is a zero based index. This change is for 1252 * convenience when addressing array positions. 1253 * 1254 * @param p p<sup>th</sup> quantile. 1255 * @param n Size. 1256 * @return a real-valued position (0-based) into the range {@code [0, n)} 1257 */ 1258 abstract double position0(double p, int n); 1259 1260 /** 1261 * Finds the index {@code i} and fractional part {@code g} of a real-valued position 1262 * to interpolate the quantile. 1263 * 1264 * <p>Return {@code i + g} such that the quantile value from sorted data is: 1265 * 1266 * <p>value = data[i] + g * (data[i+1] - data[i]) 1267 * 1268 * <p>Note: Interpolation should not use {@code data[i+1]} unless {@code g != 0}. 1269 * 1270 * @param p p<sup>th</sup> quantile. 1271 * @param n Size. 1272 * @return index (in [0, n-1]) 1273 */ 1274 final double index(double p, int n) { 1275 final double pos = position0(p, n); 1276 // Bounds check in [0, n-1] 1277 if (pos < 0) { 1278 return 0; 1279 } 1280 if (pos > n - 1.0) { 1281 return n - 1.0; 1282 } 1283 return pos; 1284 } 1285 } 1286}