1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.statistics.descriptive;
18
19 import java.util.Arrays;
20 import java.util.Objects;
21 import java.util.function.IntToDoubleFunction;
22 import org.apache.commons.numbers.arrays.Selection;
23
24 /**
25 * Provides quantile computation.
26 *
27 * <p>For values of length {@code n}:
28 * <ul>
29 * <li>The result is {@code NaN} if {@code n = 0}.
30 * <li>The result is {@code values[0]} if {@code n = 1}.
31 * <li>Otherwise the result is computed using the {@link EstimationMethod}.
32 * </ul>
33 *
34 * <p>Computation of multiple quantiles and will handle duplicate and unordered
35 * probabilities. Passing ordered probabilities is recommended if the order is already
36 * known as this can improve efficiency; for example using uniform spacing through the
37 * array data, or to identify extreme values from the data such as {@code [0.001, 0.999]}.
38 *
39 * <p>This implementation respects the ordering imposed by
40 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs
41 * in the selected positions in the fully sorted values then the result is {@code NaN}.
42 *
43 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values.
44 *
45 * <p>Instances of this class are immutable and thread-safe.
46 *
47 * @see #with(NaNPolicy)
48 * @see <a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a>
49 * @since 1.1
50 */
51 public final class Quantile {
52 /** Message when the probability is not in the range {@code [0, 1]}. */
53 private static final String INVALID_PROBABILITY = "Invalid probability: ";
54 /** Message when no probabilities are provided for the varargs method. */
55 private static final String NO_PROBABILITIES_SPECIFIED = "No probabilities specified";
56 /** Message when the size is not valid. */
57 private static final String INVALID_SIZE = "Invalid size: ";
58 /** Message when the number of probabilities in a range is not valid. */
59 private static final String INVALID_NUMBER_OF_PROBABILITIES = "Invalid number of probabilities: ";
60
61 /** Default instance. Method 8 is recommended by Hyndman and Fan. */
62 private static final Quantile DEFAULT = new Quantile(false, NaNPolicy.INCLUDE, EstimationMethod.HF8);
63
64 /** Flag to indicate if the data should be copied. */
65 private final boolean copy;
66 /** NaN policy for floating point data. */
67 private final NaNPolicy nanPolicy;
68 /** Transformer for NaN data. */
69 private final NaNTransformer nanTransformer;
70 /** Estimation type used to determine the value from the quantile. */
71 private final EstimationMethod estimationType;
72
73 /**
74 * @param copy Flag to indicate if the data should be copied.
75 * @param nanPolicy NaN policy.
76 * @param estimationType Estimation type used to determine the value from the quantile.
77 */
78 private Quantile(boolean copy, NaNPolicy nanPolicy, EstimationMethod estimationType) {
79 this.copy = copy;
80 this.nanPolicy = nanPolicy;
81 this.estimationType = estimationType;
82 nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy);
83 }
84
85 /**
86 * Return a new instance with the default options.
87 *
88 * <ul>
89 * <li>{@linkplain #withCopy(boolean) Copy = false}
90 * <li>{@linkplain #with(NaNPolicy) NaN policy = include}
91 * <li>{@linkplain #with(EstimationMethod) Estimation method = HF8}
92 * </ul>
93 *
94 * <p>Note: The default options configure for processing in-place and including
95 * {@code NaN} values in the data. This is the most efficient mode and has the
96 * smallest memory consumption.
97 *
98 * @return the quantile implementation
99 * @see #withCopy(boolean)
100 * @see #with(NaNPolicy)
101 * @see #with(EstimationMethod)
102 */
103 public static Quantile withDefaults() {
104 return DEFAULT;
105 }
106
107 /**
108 * Return an instance with the configured copy behaviour. If {@code false} then
109 * the input array will be modified by the call to evaluate the quantiles; otherwise
110 * the computation uses a copy of the data.
111 *
112 * @param v Value.
113 * @return an instance
114 */
115 public Quantile withCopy(boolean v) {
116 return new Quantile(v, nanPolicy, estimationType);
117 }
118
119 /**
120 * Return an instance with the configured {@link NaNPolicy}.
121 *
122 * <p>Note: This implementation respects the ordering imposed by
123 * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is
124 * considered greater than all other values, and all {@code NaN} values are equal. The
125 * {@link NaNPolicy} changes the computation of the statistic in the presence of
126 * {@code NaN} values.
127 *
128 * <ul>
129 * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data;
130 * the size of the data <em>includes</em> the {@code NaN} values and the quantile will be
131 * {@code NaN} if any value used for quantile interpolation is {@code NaN}.
132 * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data;
133 * the size of the data <em>excludes</em> the {@code NaN} values and the quantile will
134 * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero
135 * and the result is {@code NaN}.
136 * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN}
137 * values.
138 * </ul>
139 *
140 * <p>Note that the result is identical for all policies if no {@code NaN} values are present.
141 *
142 * @param v Value.
143 * @return an instance
144 */
145 public Quantile with(NaNPolicy v) {
146 return new Quantile(copy, Objects.requireNonNull(v), estimationType);
147 }
148
149 /**
150 * Return an instance with the configured {@link EstimationMethod}.
151 *
152 * @param v Value.
153 * @return an instance
154 */
155 public Quantile with(EstimationMethod v) {
156 return new Quantile(copy, nanPolicy, Objects.requireNonNull(v));
157 }
158
159 /**
160 * Generate {@code n} evenly spaced probabilities in the range {@code [0, 1]}.
161 *
162 * <pre>
163 * 1/(n + 1), 2/(n + 1), ..., n/(n + 1)
164 * </pre>
165 *
166 * @param n Number of probabilities.
167 * @return the probabilities
168 * @throws IllegalArgumentException if {@code n < 1}
169 */
170 public static double[] probabilities(int n) {
171 checkNumberOfProbabilities(n);
172 final double c1 = n + 1.0;
173 final double[] p = new double[n];
174 for (int i = 0; i < n; i++) {
175 p[i] = (i + 1.0) / c1;
176 }
177 return p;
178 }
179
180 /**
181 * Generate {@code n} evenly spaced probabilities in the range {@code [p1, p2]}.
182 *
183 * <pre>
184 * w = p2 - p1
185 * p1 + w/(n + 1), p1 + 2w/(n + 1), ..., p1 + nw/(n + 1)
186 * </pre>
187 *
188 * @param n Number of probabilities.
189 * @param p1 Lower probability.
190 * @param p2 Upper probability.
191 * @return the probabilities
192 * @throws IllegalArgumentException if {@code n < 1}; if the probabilities are not in the
193 * range {@code [0, 1]}; or {@code p2 <= p1}.
194 */
195 public static double[] probabilities(int n, double p1, double p2) {
196 checkProbability(p1);
197 checkProbability(p2);
198 if (p2 <= p1) {
199 throw new IllegalArgumentException("Invalid range: [" + p1 + ", " + p2 + "]");
200 }
201 final double[] p = probabilities(n);
202 for (int i = 0; i < n; i++) {
203 p[i] = (1 - p[i]) * p1 + p[i] * p2;
204 }
205 return p;
206 }
207
208 /**
209 * Evaluate the {@code p}-th quantile of the values.
210 *
211 * <p>Note: This method may partially sort the input values if not configured to
212 * {@link #withCopy(boolean) copy} the input data.
213 *
214 * <p><strong>Performance</strong>
215 *
216 * <p>It is not recommended to use this method for repeat calls for different quantiles
217 * within the same values. The {@link #evaluate(double[], double...)} method should be used
218 * which provides better performance.
219 *
220 * @param values Values.
221 * @param p Probability for the quantile to compute.
222 * @return the quantile
223 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]};
224 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
225 * @see #evaluate(double[], double...)
226 * @see #with(NaNPolicy)
227 */
228 public double evaluate(double[] values, double p) {
229 return compute(values, 0, values.length, p);
230 }
231
232 /**
233 * Evaluate the {@code p}-th quantile of the specified range of values.
234 *
235 * <p>Note: This method may partially sort the input values if not configured to
236 * {@link #withCopy(boolean) copy} the input data.
237 *
238 * <p><strong>Performance</strong>
239 *
240 * <p>It is not recommended to use this method for repeat calls for different quantiles
241 * within the same values. The {@link #evaluateRange(double[], int, int, double...)} method should be used
242 * which provides better performance.
243 *
244 * @param values Values.
245 * @param from Inclusive start of the range.
246 * @param to Exclusive end of the range.
247 * @param p Probability for the quantile to compute.
248 * @return the quantile
249 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]};
250 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
251 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
252 * @see #evaluateRange(double[], int, int, double...)
253 * @see #with(NaNPolicy)
254 * @since 1.2
255 */
256 public double evaluateRange(double[] values, int from, int to, double p) {
257 Statistics.checkFromToIndex(from, to, values.length);
258 return compute(values, from, to, p);
259 }
260
261 /**
262 * Compute the {@code p}-th quantile of the specified range of values.
263 *
264 * @param values Values.
265 * @param from Inclusive start of the range.
266 * @param to Exclusive end of the range.
267 * @param p Probability for the quantile to compute.
268 * @return the quantile
269 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
270 */
271 private double compute(double[] values, int from, int to, double p) {
272 checkProbability(p);
273 // Floating-point data handling
274 final int[] bounds = new int[2];
275 final double[] x = nanTransformer.apply(values, from, to, bounds);
276 final int start = bounds[0];
277 final int end = bounds[1];
278 final int n = end - start;
279 // Special cases
280 if (n <= 1) {
281 return n == 0 ? Double.NaN : x[start];
282 }
283
284 final double pos = estimationType.index(p, n);
285 final int ip = (int) pos;
286 final int i = start + ip;
287
288 // Partition and compute
289 if (pos > ip) {
290 Selection.select(x, start, end, new int[] {i, i + 1});
291 return Interpolation.interpolate(x[i], x[i + 1], pos - ip);
292 }
293 Selection.select(x, start, end, i);
294 return x[i];
295 }
296
297 /**
298 * Evaluate the {@code p}-th quantiles of the values.
299 *
300 * <p>Note: This method may partially sort the input values if not configured to
301 * {@link #withCopy(boolean) copy} the input data.
302 *
303 * @param values Values.
304 * @param p Probabilities for the quantiles to compute.
305 * @return the quantiles
306 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
307 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
308 * @see #with(NaNPolicy)
309 */
310 public double[] evaluate(double[] values, double... p) {
311 return compute(values, 0, values.length, p);
312 }
313
314 /**
315 * Evaluate the {@code p}-th quantiles of the specified range of values.
316 *
317 * <p>Note: This method may partially sort the input values if not configured to
318 * {@link #withCopy(boolean) copy} the input data.
319 *
320 * @param values Values.
321 * @param from Inclusive start of the range.
322 * @param to Exclusive end of the range.
323 * @param p Probabilities for the quantiles to compute.
324 * @return the quantiles
325 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
326 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
327 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
328 * @see #with(NaNPolicy)
329 * @since 1.2
330 */
331 public double[] evaluateRange(double[] values, int from, int to, double... p) {
332 Statistics.checkFromToIndex(from, to, values.length);
333 return compute(values, from, to, p);
334 }
335
336 /**
337 * Compute the {@code p}-th quantiles of the specified range of values.
338 *
339 * @param values Values.
340 * @param from Inclusive start of the range.
341 * @param to Exclusive end of the range.
342 * @param p Probabilities for the quantiles to compute.
343 * @return the quantiles
344 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
345 * or no probabilities are specified.
346 */
347 private double[] compute(double[] values, int from, int to, double... p) {
348 checkProbabilities(p);
349 // Floating-point data handling
350 final int[] bounds = new int[2];
351 final double[] x = nanTransformer.apply(values, from, to, bounds);
352 final int start = bounds[0];
353 final int end = bounds[1];
354 final int n = end - start;
355 // Special cases
356 final double[] q = new double[p.length];
357 if (n <= 1) {
358 Arrays.fill(q, n == 0 ? Double.NaN : x[start]);
359 return q;
360 }
361
362 // Collect interpolation positions. We use the output q as storage.
363 final int[] indices = computeIndices(n, p, q, start);
364
365 // Partition
366 Selection.select(x, start, end, indices);
367
368 // Compute
369 for (int k = 0; k < p.length; k++) {
370 // ip in [0, n); i in [start, end)
371 final int ip = (int) q[k];
372 final int i = start + ip;
373 if (q[k] > ip) {
374 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip);
375 } else {
376 q[k] = x[i];
377 }
378 }
379 return q;
380 }
381
382 /**
383 * Evaluate the {@code p}-th quantile of the values.
384 *
385 * <p>Note: This method may partially sort the input values if not configured to
386 * {@link #withCopy(boolean) copy} the input data.
387 *
388 * <p><strong>Performance</strong>
389 *
390 * <p>It is not recommended to use this method for repeat calls for different quantiles
391 * within the same values. The {@link #evaluate(int[], double...)} method should be used
392 * which provides better performance.
393 *
394 * @param values Values.
395 * @param p Probability for the quantile to compute.
396 * @return the quantile
397 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
398 * @see #evaluate(int[], double...)
399 */
400 public double evaluate(int[] values, double p) {
401 return compute(values, 0, values.length, p);
402 }
403
404 /**
405 * Evaluate the {@code p}-th quantile of the specified range of values.
406 *
407 * <p>Note: This method may partially sort the input values if not configured to
408 * {@link #withCopy(boolean) copy} the input data.
409 *
410 * <p><strong>Performance</strong>
411 *
412 * <p>It is not recommended to use this method for repeat calls for different quantiles
413 * within the same values. The {@link #evaluateRange(int[], int, int, double...)} method should be used
414 * which provides better performance.
415 *
416 * @param values Values.
417 * @param from Inclusive start of the range.
418 * @param to Exclusive end of the range.
419 * @param p Probability for the quantile to compute.
420 * @return the quantile
421 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
422 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
423 * @see #evaluateRange(int[], int, int, double...)
424 * @since 1.2
425 */
426 public double evaluateRange(int[] values, int from, int to, double p) {
427 Statistics.checkFromToIndex(from, to, values.length);
428 return compute(values, from, to, p);
429 }
430
431 /**
432 * Compute the {@code p}-th quantile of the specified range of values.
433 *
434 * @param values Values.
435 * @param from Inclusive start of the range.
436 * @param to Exclusive end of the range.
437 * @param p Probability for the quantile to compute.
438 * @return the quantile
439 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
440 */
441 private double compute(int[] values, int from, int to, double p) {
442 checkProbability(p);
443 final int n = to - from;
444 // Special cases
445 if (n <= 1) {
446 return n == 0 ? Double.NaN : values[from];
447 }
448
449 // Create the range
450 final int[] x;
451 final int start;
452 final int end;
453 if (copy) {
454 x = Statistics.copy(values, from, to);
455 start = 0;
456 end = n;
457 } else {
458 x = values;
459 start = from;
460 end = to;
461 }
462
463 final double pos = estimationType.index(p, n);
464 final int ip = (int) pos;
465 final int i = start + ip;
466
467 // Partition and compute
468 if (pos > ip) {
469 Selection.select(x, start, end, new int[] {i, i + 1});
470 return Interpolation.interpolate(x[i], x[i + 1], pos - ip);
471 }
472 Selection.select(x, start, end, i);
473 return x[i];
474 }
475
476 /**
477 * Evaluate the {@code p}-th quantiles of the values.
478 *
479 * <p>Note: This method may partially sort the input values if not configured to
480 * {@link #withCopy(boolean) copy} the input data.
481 *
482 * @param values Values.
483 * @param p Probabilities for the quantiles to compute.
484 * @return the quantiles
485 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
486 * or no probabilities are specified.
487 */
488 public double[] evaluate(int[] values, double... p) {
489 return compute(values, 0, values.length, p);
490 }
491
492 /**
493 * Evaluate the {@code p}-th quantiles of the specified range of values..
494 *
495 * <p>Note: This method may partially sort the input values if not configured to
496 * {@link #withCopy(boolean) copy} the input data.
497 *
498 * @param values Values.
499 * @param from Inclusive start of the range.
500 * @param to Exclusive end of the range.
501 * @param p Probabilities for the quantiles to compute.
502 * @return the quantiles
503 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
504 * or no probabilities are specified.
505 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
506 * @since 1.2
507 */
508 public double[] evaluateRange(int[] values, int from, int to, double... p) {
509 Statistics.checkFromToIndex(from, to, values.length);
510 return compute(values, from, to, p);
511 }
512
513 /**
514 * Evaluate the {@code p}-th quantiles of the specified range of values..
515 *
516 * <p>Note: This method may partially sort the input values if not configured to
517 * {@link #withCopy(boolean) copy} the input data.
518 *
519 * @param values Values.
520 * @param from Inclusive start of the range.
521 * @param to Exclusive end of the range.
522 * @param p Probabilities for the quantiles to compute.
523 * @return the quantiles
524 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
525 * or no probabilities are specified.
526 */
527 private double[] compute(int[] values, int from, int to, double... p) {
528 checkProbabilities(p);
529 final int n = to - from;
530 // Special cases
531 final double[] q = new double[p.length];
532 if (n <= 1) {
533 Arrays.fill(q, n == 0 ? Double.NaN : values[from]);
534 return q;
535 }
536
537 // Create the range
538 final int[] x;
539 final int start;
540 final int end;
541 if (copy) {
542 x = Statistics.copy(values, from, to);
543 start = 0;
544 end = n;
545 } else {
546 x = values;
547 start = from;
548 end = to;
549 }
550
551 // Collect interpolation positions. We use the output q as storage.
552 final int[] indices = computeIndices(n, p, q, start);
553
554 // Partition
555 Selection.select(x, start, end, indices);
556
557 // Compute
558 for (int k = 0; k < p.length; k++) {
559 // ip in [0, n); i in [start, end)
560 final int ip = (int) q[k];
561 final int i = start + ip;
562 if (q[k] > ip) {
563 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip);
564 } else {
565 q[k] = x[i];
566 }
567 }
568 return q;
569 }
570
571 /**
572 * Evaluate the {@code p}-th quantile of the values.
573 *
574 * <p>This method can be used when the values of known size are already sorted.
575 *
576 * <pre>{@code
577 * short[] x = ...
578 * Arrays.sort(x);
579 * double q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.05);
580 * }</pre>
581 *
582 * @param n Size of the values.
583 * @param values Values function.
584 * @param p Probability for the quantile to compute.
585 * @return the quantile
586 * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is
587 * not in the range {@code [0, 1]}.
588 */
589 public double evaluate(int n, IntToDoubleFunction values, double p) {
590 checkSize(n);
591 checkProbability(p);
592 // Special case
593 if (n <= 1) {
594 return n == 0 ? Double.NaN : values.applyAsDouble(0);
595 }
596 final double pos = estimationType.index(p, n);
597 final int i = (int) pos;
598 final double v1 = values.applyAsDouble(i);
599 if (pos > i) {
600 final double v2 = values.applyAsDouble(i + 1);
601 return Interpolation.interpolate(v1, v2, pos - i);
602 }
603 return v1;
604 }
605
606 /**
607 * Evaluate the {@code p}-th quantiles of the values.
608 *
609 * <p>This method can be used when the values of known size are already sorted.
610 *
611 * <pre>{@code
612 * short[] x = ...
613 * Arrays.sort(x);
614 * double[] q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.25, 0.5, 0.75);
615 * }</pre>
616 *
617 * @param n Size of the values.
618 * @param values Values function.
619 * @param p Probabilities for the quantiles to compute.
620 * @return the quantiles
621 * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is
622 * not in the range {@code [0, 1]}; or no probabilities are specified.
623 */
624 public double[] evaluate(int n, IntToDoubleFunction values, double... p) {
625 checkSize(n);
626 checkProbabilities(p);
627 // Special case
628 final double[] q = new double[p.length];
629 if (n <= 1) {
630 Arrays.fill(q, n == 0 ? Double.NaN : values.applyAsDouble(0));
631 return q;
632 }
633 for (int k = 0; k < p.length; k++) {
634 final double pos = estimationType.index(p[k], n);
635 final int i = (int) pos;
636 final double v1 = values.applyAsDouble(i);
637 if (pos > i) {
638 final double v2 = values.applyAsDouble(i + 1);
639 q[k] = Interpolation.interpolate(v1, v2, pos - i);
640 } else {
641 q[k] = v1;
642 }
643 }
644 return q;
645 }
646
647 /**
648 * Check the probability {@code p} is in the range {@code [0, 1]}.
649 *
650 * @param p Probability for the quantile to compute.
651 * @throws IllegalArgumentException if the probability is not in the range {@code [0, 1]}
652 */
653 private static void checkProbability(double p) {
654 // Logic negation will detect NaN
655 if (!(p >= 0 && p <= 1)) {
656 throw new IllegalArgumentException(INVALID_PROBABILITY + p);
657 }
658 }
659
660 /**
661 * Check the probabilities {@code p} are in the range {@code [0, 1]}.
662 *
663 * @param p Probabilities for the quantiles to compute.
664 * @throws IllegalArgumentException if any probabilities {@code p} is not in the range {@code [0, 1]};
665 * or no probabilities are specified.
666 */
667 private static void checkProbabilities(double... p) {
668 if (p.length == 0) {
669 throw new IllegalArgumentException(NO_PROBABILITIES_SPECIFIED);
670 }
671 for (final double pp : p) {
672 checkProbability(pp);
673 }
674 }
675
676 /**
677 * Check the {@code size} is positive.
678 *
679 * @param n Size of the values.
680 * @throws IllegalArgumentException if {@code size < 0}
681 */
682 private static void checkSize(int n) {
683 if (n < 0) {
684 throw new IllegalArgumentException(INVALID_SIZE + n);
685 }
686 }
687
688 /**
689 * Check the number of probabilities {@code n} is strictly positive.
690 *
691 * @param n Number of probabilities.
692 * @throws IllegalArgumentException if {@code c < 1}
693 */
694 private static void checkNumberOfProbabilities(int n) {
695 if (n < 1) {
696 throw new IllegalArgumentException(INVALID_NUMBER_OF_PROBABILITIES + n);
697 }
698 }
699
700 /**
701 * Compute the indices required for quantile interpolation.
702 *
703 * <p>The zero-based interpolation index in {@code [0, n)} is
704 * saved into the working array {@code q} for each {@code p}.
705 *
706 * <p>The indices are incremented by the provided {@code offset} to allow
707 * addressing sub-ranges of a larger array.
708 *
709 * @param n Size of the data.
710 * @param p Probabilities for the quantiles to compute.
711 * @param q Working array for quantiles in {@code [0, n)}.
712 * @param offset Array offset.
713 * @return the indices in {@code [offset, offset + n)}
714 */
715 private int[] computeIndices(int n, double[] p, double[] q, int offset) {
716 final int[] indices = new int[p.length << 1];
717 int count = 0;
718 for (int k = 0; k < p.length; k++) {
719 final double pos = estimationType.index(p[k], n);
720 q[k] = pos;
721 final int i = (int) pos;
722 indices[count++] = offset + i;
723 if (pos > i) {
724 // Require the next index for interpolation
725 indices[count++] = offset + i + 1;
726 }
727 }
728 if (count < indices.length) {
729 return Arrays.copyOf(indices, count);
730 }
731 return indices;
732 }
733
734 /**
735 * Estimation methods for a quantile. Provides the nine quantile algorithms
736 * defined in Hyndman and Fan (1996)[1] as {@code HF1 - HF9}.
737 *
738 * <p>Samples quantiles are defined by:
739 *
740 * <p>\[ Q(p) = (1 - \gamma) x_j + \gamma x_{j+1} \]
741 *
742 * <p>where \( \frac{j-m}{n} \leq p \le \frac{j-m+1}{n} \), \( x_j \) is the \( j \)th
743 * order statistic, \( n \) is the sample size, the value of \( \gamma \) is a function
744 * of \( j = \lfloor np+m \rfloor \) and \( g = np + m - j \), and \( m \) is a constant
745 * determined by the sample quantile type.
746 *
747 * <p>Note that the real-valued position \( np + m \) is a 1-based index and
748 * \( j \in [1, n] \). If the real valued position is computed as beyond the lowest or
749 * highest values in the sample, this implementation will return the minimum or maximum
750 * observation respectively.
751 *
752 * <p>Types 1, 2, and 3 are discontinuous functions of \( p \); types 4 to 9 are continuous
753 * functions of \( p \).
754 *
755 * <p>For the continuous functions, the probability \( p_k \) is provided for the \( k \)-th order
756 * statistic in size \( n \). Samples quantiles are equivalently obtained to \( Q(p) \) by
757 * linear interpolation between points \( (p_k, x_k) \) and \( (p_{k+1}, x_{k+1}) \) for
758 * any \( p_k \leq p \leq p_{k+1} \).
759 *
760 * <ol>
761 * <li>Hyndman and Fan (1996)
762 * <i>Sample Quantiles in Statistical Packages.</i>
763 * The American Statistician, 50, 361-365.
764 * <a href="https://www.jstor.org/stable/2684934">doi.org/10.2307/2684934</a>
765 * <li><a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a>
766 * </ol>
767 */
768 public enum EstimationMethod {
769 /**
770 * Inverse of the empirical distribution function.
771 *
772 * <p>\( m = 0 \). \( \gamma = 0 \) if \( g = 0 \), and 1 otherwise.
773 */
774 HF1 {
775 @Override
776 double position0(double p, int n) {
777 // position = np + 0. This is 1-based so adjust to 0-based.
778 return Math.ceil(n * p) - 1;
779 }
780 },
781 /**
782 * Similar to {@link #HF1} with averaging at discontinuities.
783 *
784 * <p>\( m = 0 \). \( \gamma = 0.5 \) if \( g = 0 \), and 1 otherwise.
785 */
786 HF2 {
787 @Override
788 double position0(double p, int n) {
789 final double pos = n * p;
790 // Average at discontinuities
791 final int j = (int) pos;
792 final double g = pos - j;
793 if (g == 0) {
794 return j - 0.5;
795 }
796 // As HF1 : ceil(j + g) - 1
797 return j;
798 }
799 },
800 /**
801 * The observation closest to \( np \). Ties are resolved to the nearest even order statistic.
802 *
803 * <p>\( m = -1/2 \). \( \gamma = 0 \) if \( g = 0 \) and \( j \) is even, and 1 otherwise.
804 */
805 HF3 {
806 @Override
807 double position0(double p, int n) {
808 // Let rint do the work for ties to even
809 return Math.rint(n * p) - 1;
810 }
811 },
812 /**
813 * Linear interpolation of the inverse of the empirical CDF.
814 *
815 * <p>\( m = 0 \). \( p_k = \frac{k}{n} \).
816 */
817 HF4 {
818 @Override
819 double position0(double p, int n) {
820 // np + 0 - 1
821 return n * p - 1;
822 }
823 },
824 /**
825 * A piecewise linear function where the knots are the values midway through the steps of
826 * the empirical CDF. Proposed by Hazen (1914) and popular amongst hydrologists.
827 *
828 * <p>\( m = 1/2 \). \( p_k = \frac{k - 1/2}{n} \).
829 */
830 HF5 {
831 @Override
832 double position0(double p, int n) {
833 // np + 0.5 - 1
834 return n * p - 0.5;
835 }
836 },
837 /**
838 * Linear interpolation of the expectations for the order statistics for the uniform
839 * distribution on [0,1]. Proposed by Weibull (1939).
840 *
841 * <p>\( m = p \). \( p_k = \frac{k}{n + 1} \).
842 *
843 * <p>This method computes the quantile as per the Apache Commons Math Percentile
844 * legacy implementation.
845 */
846 HF6 {
847 @Override
848 double position0(double p, int n) {
849 // np + p - 1
850 return (n + 1) * p - 1;
851 }
852 },
853 /**
854 * Linear interpolation of the modes for the order statistics for the uniform
855 * distribution on [0,1]. Proposed by Gumbull (1939).
856 *
857 * <p>\( m = 1 - p \). \( p_k = \frac{k - 1}{n - 1} \).
858 */
859 HF7 {
860 @Override
861 double position0(double p, int n) {
862 // np + 1-p - 1
863 return (n - 1) * p;
864 }
865 },
866 /**
867 * Linear interpolation of the approximate medians for order statistics.
868 *
869 * <p>\( m = (p + 1)/3 \). \( p_k = \frac{k - 1/3}{n + 1/3} \).
870 *
871 * <p>As per Hyndman and Fan (1996) this approach is most recommended as it provides
872 * an approximate median-unbiased estimate regardless of distribution.
873 */
874 HF8 {
875 @Override
876 double position0(double p, int n) {
877 return n * p + (p + 1) / 3 - 1;
878 }
879 },
880 /**
881 * Quantile estimates are approximately unbiased for the expected order statistics if
882 * \( x \) is normally distributed.
883 *
884 * <p>\( m = p/4 + 3/8 \). \( p_k = \frac{k - 3/8}{n + 1/4} \).
885 */
886 HF9 {
887 @Override
888 double position0(double p, int n) {
889 // np + p/4 + 3/8 - 1
890 return (n + 0.25) * p - 0.625;
891 }
892 };
893
894 /**
895 * Finds the real-valued position for calculation of the quantile.
896 *
897 * <p>Return {@code i + g} such that the quantile value from sorted data is:
898 *
899 * <p>value = data[i] + g * (data[i+1] - data[i])
900 *
901 * <p>Warning: Interpolation should not use {@code data[i+1]} unless {@code g != 0}.
902 *
903 * <p>Note: In contrast to the definition of Hyndman and Fan in the class header
904 * which uses a 1-based position, this is a zero based index. This change is for
905 * convenience when addressing array positions.
906 *
907 * @param p p<sup>th</sup> quantile.
908 * @param n Size.
909 * @return a real-valued position (0-based) into the range {@code [0, n)}
910 */
911 abstract double position0(double p, int n);
912
913 /**
914 * Finds the index {@code i} and fractional part {@code g} of a real-valued position
915 * to interpolate the quantile.
916 *
917 * <p>Return {@code i + g} such that the quantile value from sorted data is:
918 *
919 * <p>value = data[i] + g * (data[i+1] - data[i])
920 *
921 * <p>Note: Interpolation should not use {@code data[i+1]} unless {@code g != 0}.
922 *
923 * @param p p<sup>th</sup> quantile.
924 * @param n Size.
925 * @return index (in [0, n-1])
926 */
927 final double index(double p, int n) {
928 final double pos = position0(p, n);
929 // Bounds check in [0, n-1]
930 if (pos < 0) {
931 return 0;
932 }
933 if (pos > n - 1.0) {
934 return n - 1.0;
935 }
936 return pos;
937 }
938 }
939 }