Quantile.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.statistics.descriptive;

  18. import java.util.Arrays;
  19. import java.util.Objects;
  20. import java.util.function.IntToDoubleFunction;
  21. import org.apache.commons.numbers.arrays.Selection;

  22. /**
  23.  * Provides quantile computation.
  24.  *
  25.  * <p>For values of length {@code n}:
  26.  * <ul>
  27.  * <li>The result is {@code NaN} if {@code n = 0}.
  28.  * <li>The result is {@code values[0]} if {@code n = 1}.
  29.  * <li>Otherwise the result is computed using the {@link EstimationMethod}.
  30.  * </ul>
  31.  *
  32.  * <p>Computation of multiple quantiles and will handle duplicate and unordered
  33.  * probabilities. Passing ordered probabilities is recommended if the order is already
  34.  * known as this can improve efficiency; for example using uniform spacing through the
  35.  * array data, or to identify extreme values from the data such as {@code [0.001, 0.999]}.
  36.  *
  37.  * <p>This implementation respects the ordering imposed by
  38.  * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs
  39.  * in the selected positions in the fully sorted values then the result is {@code NaN}.
  40.  *
  41.  * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values.
  42.  *
  43.  * <p>Instances of this class are immutable and thread-safe.
  44.  *
  45.  * @see #with(NaNPolicy)
  46.  * @see <a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a>
  47.  * @since 1.1
  48.  */
  49. public final class Quantile {
  50.     /** Message when the probability is not in the range {@code [0, 1]}. */
  51.     private static final String INVALID_PROBABILITY = "Invalid probability: ";
  52.     /** Message when no probabilities are provided for the varargs method. */
  53.     private static final String NO_PROBABILITIES_SPECIFIED = "No probabilities specified";
  54.     /** Message when the size is not valid. */
  55.     private static final String INVALID_SIZE = "Invalid size: ";
  56.     /** Message when the number of probabilities in a range is not valid. */
  57.     private static final String INVALID_NUMBER_OF_PROBABILITIES = "Invalid number of probabilities: ";

  58.     /** Default instance. Method 8 is recommended by Hyndman and Fan. */
  59.     private static final Quantile DEFAULT = new Quantile(false, NaNPolicy.INCLUDE, EstimationMethod.HF8);

  60.     /** Flag to indicate if the data should be copied. */
  61.     private final boolean copy;
  62.     /** NaN policy for floating point data. */
  63.     private final NaNPolicy nanPolicy;
  64.     /** Transformer for NaN data. */
  65.     private final NaNTransformer nanTransformer;
  66.     /** Estimation type used to determine the value from the quantile. */
  67.     private final EstimationMethod estimationType;

  68.     /**
  69.      * @param copy Flag to indicate if the data should be copied.
  70.      * @param nanPolicy NaN policy.
  71.      * @param estimationType Estimation type used to determine the value from the quantile.
  72.      */
  73.     private Quantile(boolean copy, NaNPolicy nanPolicy, EstimationMethod estimationType) {
  74.         this.copy = copy;
  75.         this.nanPolicy = nanPolicy;
  76.         this.estimationType = estimationType;
  77.         nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy);
  78.     }

  79.     /**
  80.      * Return a new instance with the default options.
  81.      *
  82.      * <ul>
  83.      * <li>{@linkplain #withCopy(boolean) Copy = false}
  84.      * <li>{@linkplain #with(NaNPolicy) NaN policy = include}
  85.      * <li>{@linkplain #with(EstimationMethod) Estimation method = HF8}
  86.      * </ul>
  87.      *
  88.      * <p>Note: The default options configure for processing in-place and including
  89.      * {@code NaN} values in the data. This is the most efficient mode and has the
  90.      * smallest memory consumption.
  91.      *
  92.      * @return the quantile implementation
  93.      * @see #withCopy(boolean)
  94.      * @see #with(NaNPolicy)
  95.      * @see #with(EstimationMethod)
  96.      */
  97.     public static Quantile withDefaults() {
  98.         return DEFAULT;
  99.     }

  100.     /**
  101.      * Return an instance with the configured copy behaviour. If {@code false} then
  102.      * the input array will be modified by the call to evaluate the quantiles; otherwise
  103.      * the computation uses a copy of the data.
  104.      *
  105.      * @param v Value.
  106.      * @return an instance
  107.      */
  108.     public Quantile withCopy(boolean v) {
  109.         return new Quantile(v, nanPolicy, estimationType);
  110.     }

  111.     /**
  112.      * Return an instance with the configured {@link NaNPolicy}.
  113.      *
  114.      * <p>Note: This implementation respects the ordering imposed by
  115.      * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is
  116.      * considered greater than all other values, and all {@code NaN} values are equal. The
  117.      * {@link NaNPolicy} changes the computation of the statistic in the presence of
  118.      * {@code NaN} values.
  119.      *
  120.      * <ul>
  121.      * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data;
  122.      * the size of the data <em>includes</em> the {@code NaN} values and the quantile will be
  123.      * {@code NaN} if any value used for quantile interpolation is {@code NaN}.
  124.      * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data;
  125.      * the size of the data <em>excludes</em> the {@code NaN} values and the quantile will
  126.      * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero
  127.      * and the result is {@code NaN}.
  128.      * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN}
  129.      * values.
  130.      * </ul>
  131.      *
  132.      * <p>Note that the result is identical for all policies if no {@code NaN} values are present.
  133.      *
  134.      * @param v Value.
  135.      * @return an instance
  136.      */
  137.     public Quantile with(NaNPolicy v) {
  138.         return new Quantile(copy, Objects.requireNonNull(v), estimationType);
  139.     }

  140.     /**
  141.      * Return an instance with the configured {@link EstimationMethod}.
  142.      *
  143.      * @param v Value.
  144.      * @return an instance
  145.      */
  146.     public Quantile with(EstimationMethod v) {
  147.         return new Quantile(copy, nanPolicy, Objects.requireNonNull(v));
  148.     }

  149.     /**
  150.      * Generate {@code n} evenly spaced probabilities in the range {@code [0, 1]}.
  151.      *
  152.      * <pre>
  153.      * 1/(n + 1), 2/(n + 1), ..., n/(n + 1)
  154.      * </pre>
  155.      *
  156.      * @param n Number of probabilities.
  157.      * @return the probabilities
  158.      * @throws IllegalArgumentException if {@code n < 1}
  159.      */
  160.     public static double[] probabilities(int n) {
  161.         checkNumberOfProbabilities(n);
  162.         final double c1 = n + 1.0;
  163.         final double[] p = new double[n];
  164.         for (int i = 0; i < n; i++) {
  165.             p[i] = (i + 1.0) / c1;
  166.         }
  167.         return p;
  168.     }

  169.     /**
  170.      * Generate {@code n} evenly spaced probabilities in the range {@code [p1, p2]}.
  171.      *
  172.      * <pre>
  173.      * w = p2 - p1
  174.      * p1 + w/(n + 1), p1 + 2w/(n + 1), ..., p1 + nw/(n + 1)
  175.      * </pre>
  176.      *
  177.      * @param n Number of probabilities.
  178.      * @param p1 Lower probability.
  179.      * @param p2 Upper probability.
  180.      * @return the probabilities
  181.      * @throws IllegalArgumentException if {@code n < 1}; if the probabilities are not in the
  182.      * range {@code [0, 1]}; or {@code p2 <= p1}.
  183.      */
  184.     public static double[] probabilities(int n, double p1, double p2) {
  185.         checkProbability(p1);
  186.         checkProbability(p2);
  187.         if (p2 <= p1) {
  188.             throw new IllegalArgumentException("Invalid range: [" + p1 + ", " + p2 + "]");
  189.         }
  190.         final double[] p = probabilities(n);
  191.         for (int i = 0; i < n; i++) {
  192.             p[i] = (1 - p[i]) * p1 + p[i] * p2;
  193.         }
  194.         return p;
  195.     }

  196.     /**
  197.      * Evaluate the {@code p}-th quantile of the values.
  198.      *
  199.      * <p>Note: This method may partially sort the input values if not configured to
  200.      * {@link #withCopy(boolean) copy} the input data.
  201.      *
  202.      * <p><strong>Performance</strong>
  203.      *
  204.      * <p>It is not recommended to use this method for repeat calls for different quantiles
  205.      * within the same values. The {@link #evaluate(double[], double...)} method should be used
  206.      * which provides better performance.
  207.      *
  208.      * @param values Values.
  209.      * @param p Probability for the quantile to compute.
  210.      * @return the quantile
  211.      * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]};
  212.      * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
  213.      * @see #evaluate(double[], double...)
  214.      * @see #with(NaNPolicy)
  215.      */
  216.     public double evaluate(double[] values, double p) {
  217.         return compute(values, 0, values.length, p);
  218.     }

  219.     /**
  220.      * Evaluate the {@code p}-th quantile of the specified range of values.
  221.      *
  222.      * <p>Note: This method may partially sort the input values if not configured to
  223.      * {@link #withCopy(boolean) copy} the input data.
  224.      *
  225.      * <p><strong>Performance</strong>
  226.      *
  227.      * <p>It is not recommended to use this method for repeat calls for different quantiles
  228.      * within the same values. The {@link #evaluateRange(double[], int, int, double...)} method should be used
  229.      * which provides better performance.
  230.      *
  231.      * @param values Values.
  232.      * @param from Inclusive start of the range.
  233.      * @param to Exclusive end of the range.
  234.      * @param p Probability for the quantile to compute.
  235.      * @return the quantile
  236.      * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]};
  237.      * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
  238.      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
  239.      * @see #evaluateRange(double[], int, int, double...)
  240.      * @see #with(NaNPolicy)
  241.      * @since 1.2
  242.      */
  243.     public double evaluateRange(double[] values, int from, int to, double p) {
  244.         Statistics.checkFromToIndex(from, to, values.length);
  245.         return compute(values, from, to, p);
  246.     }

  247.     /**
  248.      * Compute the {@code p}-th quantile of the specified range of values.
  249.      *
  250.      * @param values Values.
  251.      * @param from Inclusive start of the range.
  252.      * @param to Exclusive end of the range.
  253.      * @param p Probability for the quantile to compute.
  254.      * @return the quantile
  255.      * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
  256.      */
  257.     private double compute(double[] values, int from, int to, double p) {
  258.         checkProbability(p);
  259.         // Floating-point data handling
  260.         final int[] bounds = new int[2];
  261.         final double[] x = nanTransformer.apply(values, from, to, bounds);
  262.         final int start = bounds[0];
  263.         final int end = bounds[1];
  264.         final int n = end - start;
  265.         // Special cases
  266.         if (n <= 1) {
  267.             return n == 0 ? Double.NaN : x[start];
  268.         }

  269.         final double pos = estimationType.index(p, n);
  270.         final int ip = (int) pos;
  271.         final int i = start + ip;

  272.         // Partition and compute
  273.         if (pos > ip) {
  274.             Selection.select(x, start, end, new int[] {i, i + 1});
  275.             return Interpolation.interpolate(x[i], x[i + 1], pos - ip);
  276.         }
  277.         Selection.select(x, start, end, i);
  278.         return x[i];
  279.     }

  280.     /**
  281.      * Evaluate the {@code p}-th quantiles of the values.
  282.      *
  283.      * <p>Note: This method may partially sort the input values if not configured to
  284.      * {@link #withCopy(boolean) copy} the input data.
  285.      *
  286.      * @param values Values.
  287.      * @param p Probabilities for the quantiles to compute.
  288.      * @return the quantiles
  289.      * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
  290.      * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
  291.      * @see #with(NaNPolicy)
  292.      */
  293.     public double[] evaluate(double[] values, double... p) {
  294.         return compute(values, 0, values.length, p);
  295.     }

  296.     /**
  297.      * Evaluate the {@code p}-th quantiles of the specified range of values.
  298.      *
  299.      * <p>Note: This method may partially sort the input values if not configured to
  300.      * {@link #withCopy(boolean) copy} the input data.
  301.      *
  302.      * @param values Values.
  303.      * @param from Inclusive start of the range.
  304.      * @param to Exclusive end of the range.
  305.      * @param p Probabilities for the quantiles to compute.
  306.      * @return the quantiles
  307.      * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
  308.      * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
  309.      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
  310.      * @see #with(NaNPolicy)
  311.      * @since 1.2
  312.      */
  313.     public double[] evaluateRange(double[] values, int from, int to, double... p) {
  314.         Statistics.checkFromToIndex(from, to, values.length);
  315.         return compute(values, from, to, p);
  316.     }

  317.     /**
  318.      * Compute the {@code p}-th quantiles of the specified range of values.
  319.      *
  320.      * @param values Values.
  321.      * @param from Inclusive start of the range.
  322.      * @param to Exclusive end of the range.
  323.      * @param p Probabilities for the quantiles to compute.
  324.      * @return the quantiles
  325.      * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
  326.      * or no probabilities are specified.
  327.      */
  328.     private double[] compute(double[] values, int from, int to, double... p) {
  329.         checkProbabilities(p);
  330.         // Floating-point data handling
  331.         final int[] bounds = new int[2];
  332.         final double[] x = nanTransformer.apply(values, from, to, bounds);
  333.         final int start = bounds[0];
  334.         final int end = bounds[1];
  335.         final int n = end - start;
  336.         // Special cases
  337.         final double[] q = new double[p.length];
  338.         if (n <= 1) {
  339.             Arrays.fill(q, n == 0 ? Double.NaN : x[start]);
  340.             return q;
  341.         }

  342.         // Collect interpolation positions. We use the output q as storage.
  343.         final int[] indices = computeIndices(n, p, q, start);

  344.         // Partition
  345.         Selection.select(x, start, end, indices);

  346.         // Compute
  347.         for (int k = 0; k < p.length; k++) {
  348.             // ip in [0, n); i in [start, end)
  349.             final int ip = (int) q[k];
  350.             final int i = start + ip;
  351.             if (q[k] > ip) {
  352.                 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip);
  353.             } else {
  354.                 q[k] = x[i];
  355.             }
  356.         }
  357.         return q;
  358.     }

  359.     /**
  360.      * Evaluate the {@code p}-th quantile of the values.
  361.      *
  362.      * <p>Note: This method may partially sort the input values if not configured to
  363.      * {@link #withCopy(boolean) copy} the input data.
  364.      *
  365.      * <p><strong>Performance</strong>
  366.      *
  367.      * <p>It is not recommended to use this method for repeat calls for different quantiles
  368.      * within the same values. The {@link #evaluate(int[], double...)} method should be used
  369.      * which provides better performance.
  370.      *
  371.      * @param values Values.
  372.      * @param p Probability for the quantile to compute.
  373.      * @return the quantile
  374.      * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
  375.      * @see #evaluate(int[], double...)
  376.      */
  377.     public double evaluate(int[] values, double p) {
  378.         return compute(values, 0, values.length, p);
  379.     }

  380.     /**
  381.      * Evaluate the {@code p}-th quantile of the specified range of values.
  382.      *
  383.      * <p>Note: This method may partially sort the input values if not configured to
  384.      * {@link #withCopy(boolean) copy} the input data.
  385.      *
  386.      * <p><strong>Performance</strong>
  387.      *
  388.      * <p>It is not recommended to use this method for repeat calls for different quantiles
  389.      * within the same values. The {@link #evaluateRange(int[], int, int, double...)} method should be used
  390.      * which provides better performance.
  391.      *
  392.      * @param values Values.
  393.      * @param from Inclusive start of the range.
  394.      * @param to Exclusive end of the range.
  395.      * @param p Probability for the quantile to compute.
  396.      * @return the quantile
  397.      * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
  398.      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
  399.      * @see #evaluateRange(int[], int, int, double...)
  400.      * @since 1.2
  401.      */
  402.     public double evaluateRange(int[] values, int from, int to, double p) {
  403.         Statistics.checkFromToIndex(from, to, values.length);
  404.         return compute(values, from, to, p);
  405.     }

  406.     /**
  407.      * Compute the {@code p}-th quantile of the specified range of values.
  408.      *
  409.      * @param values Values.
  410.      * @param from Inclusive start of the range.
  411.      * @param to Exclusive end of the range.
  412.      * @param p Probability for the quantile to compute.
  413.      * @return the quantile
  414.      * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
  415.      */
  416.     private double compute(int[] values, int from, int to, double p) {
  417.         checkProbability(p);
  418.         final int n = to - from;
  419.         // Special cases
  420.         if (n <= 1) {
  421.             return n == 0 ? Double.NaN : values[from];
  422.         }

  423.         // Create the range
  424.         final int[] x;
  425.         final int start;
  426.         final int end;
  427.         if (copy) {
  428.             x = Statistics.copy(values, from, to);
  429.             start = 0;
  430.             end = n;
  431.         } else {
  432.             x = values;
  433.             start = from;
  434.             end = to;
  435.         }

  436.         final double pos = estimationType.index(p, n);
  437.         final int ip = (int) pos;
  438.         final int i = start + ip;

  439.         // Partition and compute
  440.         if (pos > ip) {
  441.             Selection.select(x, start, end, new int[] {i, i + 1});
  442.             return Interpolation.interpolate(x[i], x[i + 1], pos - ip);
  443.         }
  444.         Selection.select(x, start, end, i);
  445.         return x[i];
  446.     }

  447.     /**
  448.      * Evaluate the {@code p}-th quantiles of the values.
  449.      *
  450.      * <p>Note: This method may partially sort the input values if not configured to
  451.      * {@link #withCopy(boolean) copy} the input data.
  452.      *
  453.      * @param values Values.
  454.      * @param p Probabilities for the quantiles to compute.
  455.      * @return the quantiles
  456.      * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
  457.      * or no probabilities are specified.
  458.      */
  459.     public double[] evaluate(int[] values, double... p) {
  460.         return compute(values, 0, values.length, p);
  461.     }

  462.     /**
  463.      * Evaluate the {@code p}-th quantiles of the specified range of values..
  464.      *
  465.      * <p>Note: This method may partially sort the input values if not configured to
  466.      * {@link #withCopy(boolean) copy} the input data.
  467.      *
  468.      * @param values Values.
  469.      * @param from Inclusive start of the range.
  470.      * @param to Exclusive end of the range.
  471.      * @param p Probabilities for the quantiles to compute.
  472.      * @return the quantiles
  473.      * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
  474.      * or no probabilities are specified.
  475.      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
  476.      * @since 1.2
  477.      */
  478.     public double[] evaluateRange(int[] values, int from, int to, double... p) {
  479.         Statistics.checkFromToIndex(from, to, values.length);
  480.         return compute(values, from, to, p);
  481.     }

  482.     /**
  483.      * Evaluate the {@code p}-th quantiles of the specified range of values..
  484.      *
  485.      * <p>Note: This method may partially sort the input values if not configured to
  486.      * {@link #withCopy(boolean) copy} the input data.
  487.      *
  488.      * @param values Values.
  489.      * @param from Inclusive start of the range.
  490.      * @param to Exclusive end of the range.
  491.      * @param p Probabilities for the quantiles to compute.
  492.      * @return the quantiles
  493.      * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
  494.      * or no probabilities are specified.
  495.      */
  496.     private double[] compute(int[] values, int from, int to, double... p) {
  497.         checkProbabilities(p);
  498.         final int n = to - from;
  499.         // Special cases
  500.         final double[] q = new double[p.length];
  501.         if (n <= 1) {
  502.             Arrays.fill(q, n == 0 ? Double.NaN : values[from]);
  503.             return q;
  504.         }

  505.         // Create the range
  506.         final int[] x;
  507.         final int start;
  508.         final int end;
  509.         if (copy) {
  510.             x = Statistics.copy(values, from, to);
  511.             start = 0;
  512.             end = n;
  513.         } else {
  514.             x = values;
  515.             start = from;
  516.             end = to;
  517.         }

  518.         // Collect interpolation positions. We use the output q as storage.
  519.         final int[] indices = computeIndices(n, p, q, start);

  520.         // Partition
  521.         Selection.select(x, start, end, indices);

  522.         // Compute
  523.         for (int k = 0; k < p.length; k++) {
  524.             // ip in [0, n); i in [start, end)
  525.             final int ip = (int) q[k];
  526.             final int i = start + ip;
  527.             if (q[k] > ip) {
  528.                 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip);
  529.             } else {
  530.                 q[k] = x[i];
  531.             }
  532.         }
  533.         return q;
  534.     }

  535.     /**
  536.      * Evaluate the {@code p}-th quantile of the values.
  537.      *
  538.      * <p>This method can be used when the values of known size are already sorted.
  539.      *
  540.      * <pre>{@code
  541.      * short[] x = ...
  542.      * Arrays.sort(x);
  543.      * double q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.05);
  544.      * }</pre>
  545.      *
  546.      * @param n Size of the values.
  547.      * @param values Values function.
  548.      * @param p Probability for the quantile to compute.
  549.      * @return the quantile
  550.      * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is
  551.      * not in the range {@code [0, 1]}.
  552.      */
  553.     public double evaluate(int n, IntToDoubleFunction values, double p) {
  554.         checkSize(n);
  555.         checkProbability(p);
  556.         // Special case
  557.         if (n <= 1) {
  558.             return n == 0 ? Double.NaN : values.applyAsDouble(0);
  559.         }
  560.         final double pos = estimationType.index(p, n);
  561.         final int i = (int) pos;
  562.         final double v1 = values.applyAsDouble(i);
  563.         if (pos > i) {
  564.             final double v2 = values.applyAsDouble(i + 1);
  565.             return Interpolation.interpolate(v1, v2, pos - i);
  566.         }
  567.         return v1;
  568.     }

  569.     /**
  570.      * Evaluate the {@code p}-th quantiles of the values.
  571.      *
  572.      * <p>This method can be used when the values of known size are already sorted.
  573.      *
  574.      * <pre>{@code
  575.      * short[] x = ...
  576.      * Arrays.sort(x);
  577.      * double[] q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.25, 0.5, 0.75);
  578.      * }</pre>
  579.      *
  580.      * @param n Size of the values.
  581.      * @param values Values function.
  582.      * @param p Probabilities for the quantiles to compute.
  583.      * @return the quantiles
  584.      * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is
  585.      * not in the range {@code [0, 1]}; or no probabilities are specified.
  586.      */
  587.     public double[] evaluate(int n, IntToDoubleFunction values, double... p) {
  588.         checkSize(n);
  589.         checkProbabilities(p);
  590.         // Special case
  591.         final double[] q = new double[p.length];
  592.         if (n <= 1) {
  593.             Arrays.fill(q, n == 0 ? Double.NaN : values.applyAsDouble(0));
  594.             return q;
  595.         }
  596.         for (int k = 0; k < p.length; k++) {
  597.             final double pos = estimationType.index(p[k], n);
  598.             final int i = (int) pos;
  599.             final double v1 = values.applyAsDouble(i);
  600.             if (pos > i) {
  601.                 final double v2 = values.applyAsDouble(i + 1);
  602.                 q[k] = Interpolation.interpolate(v1, v2, pos - i);
  603.             } else {
  604.                 q[k] = v1;
  605.             }
  606.         }
  607.         return q;
  608.     }

  609.     /**
  610.      * Check the probability {@code p} is in the range {@code [0, 1]}.
  611.      *
  612.      * @param p Probability for the quantile to compute.
  613.      * @throws IllegalArgumentException if the probability is not in the range {@code [0, 1]}
  614.      */
  615.     private static void checkProbability(double p) {
  616.         // Logic negation will detect NaN
  617.         if (!(p >= 0 && p <= 1)) {
  618.             throw new IllegalArgumentException(INVALID_PROBABILITY + p);
  619.         }
  620.     }

  621.     /**
  622.      * Check the probabilities {@code p} are in the range {@code [0, 1]}.
  623.      *
  624.      * @param p Probabilities for the quantiles to compute.
  625.      * @throws IllegalArgumentException if any probabilities {@code p} is not in the range {@code [0, 1]};
  626.      * or no probabilities are specified.
  627.      */
  628.     private static void checkProbabilities(double... p) {
  629.         if (p.length == 0) {
  630.             throw new IllegalArgumentException(NO_PROBABILITIES_SPECIFIED);
  631.         }
  632.         for (final double pp : p) {
  633.             checkProbability(pp);
  634.         }
  635.     }

  636.     /**
  637.      * Check the {@code size} is positive.
  638.      *
  639.      * @param n Size of the values.
  640.      * @throws IllegalArgumentException if {@code size < 0}
  641.      */
  642.     private static void checkSize(int n) {
  643.         if (n < 0) {
  644.             throw new IllegalArgumentException(INVALID_SIZE + n);
  645.         }
  646.     }

  647.     /**
  648.      * Check the number of probabilities {@code n} is strictly positive.
  649.      *
  650.      * @param n Number of probabilities.
  651.      * @throws IllegalArgumentException if {@code c < 1}
  652.      */
  653.     private static void checkNumberOfProbabilities(int n) {
  654.         if (n < 1) {
  655.             throw new IllegalArgumentException(INVALID_NUMBER_OF_PROBABILITIES + n);
  656.         }
  657.     }

  658.     /**
  659.      * Compute the indices required for quantile interpolation.
  660.      *
  661.      * <p>The zero-based interpolation index in {@code [0, n)} is
  662.      * saved into the working array {@code q} for each {@code p}.
  663.      *
  664.      * <p>The indices are incremented by the provided {@code offset} to allow
  665.      * addressing sub-ranges of a larger array.
  666.      *
  667.      * @param n Size of the data.
  668.      * @param p Probabilities for the quantiles to compute.
  669.      * @param q Working array for quantiles in {@code [0, n)}.
  670.      * @param offset Array offset.
  671.      * @return the indices in {@code [offset, offset + n)}
  672.      */
  673.     private int[] computeIndices(int n, double[] p, double[] q, int offset) {
  674.         final int[] indices = new int[p.length << 1];
  675.         int count = 0;
  676.         for (int k = 0; k < p.length; k++) {
  677.             final double pos = estimationType.index(p[k], n);
  678.             q[k] = pos;
  679.             final int i = (int) pos;
  680.             indices[count++] = offset + i;
  681.             if (pos > i) {
  682.                 // Require the next index for interpolation
  683.                 indices[count++] = offset + i + 1;
  684.             }
  685.         }
  686.         if (count < indices.length) {
  687.             return Arrays.copyOf(indices, count);
  688.         }
  689.         return indices;
  690.     }

  691.     /**
  692.      * Estimation methods for a quantile. Provides the nine quantile algorithms
  693.      * defined in Hyndman and Fan (1996)[1] as {@code HF1 - HF9}.
  694.      *
  695.      * <p>Samples quantiles are defined by:
  696.      *
  697.      * <p>\[ Q(p) = (1 - \gamma) x_j + \gamma x_{j+1} \]
  698.      *
  699.      * <p>where \( \frac{j-m}{n} \leq p \le \frac{j-m+1}{n} \), \( x_j \) is the \( j \)th
  700.      * order statistic, \( n \) is the sample size, the value of \( \gamma \) is a function
  701.      * of \( j = \lfloor np+m \rfloor \) and \( g = np + m - j \), and \( m \) is a constant
  702.      * determined by the sample quantile type.
  703.      *
  704.      * <p>Note that the real-valued position \( np + m \) is a 1-based index and
  705.      * \( j \in [1, n] \). If the real valued position is computed as beyond the lowest or
  706.      * highest values in the sample, this implementation will return the minimum or maximum
  707.      * observation respectively.
  708.      *
  709.      * <p>Types 1, 2, and 3 are discontinuous functions of \( p \); types 4 to 9 are continuous
  710.      * functions of \( p \).
  711.      *
  712.      * <p>For the continuous functions, the probability \( p_k \) is provided for the \( k \)-th order
  713.      * statistic in size \( n \). Samples quantiles are equivalently obtained to \( Q(p) \) by
  714.      * linear interpolation between points \( (p_k, x_k) \) and \( (p_{k+1}, x_{k+1}) \) for
  715.      * any \( p_k \leq p \leq p_{k+1} \).
  716.      *
  717.      * <ol>
  718.      * <li>Hyndman and Fan (1996)
  719.      *     <i>Sample Quantiles in Statistical Packages.</i>
  720.      *     The American Statistician, 50, 361-365.
  721.      *     <a href="https://www.jstor.org/stable/2684934">doi.org/10.2307/2684934</a>
  722.      * <li><a href="http://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a>
  723.      * </ol>
  724.      */
  725.     public enum EstimationMethod {
  726.         /**
  727.          * Inverse of the empirical distribution function.
  728.          *
  729.          * <p>\( m = 0 \). \( \gamma = 0 \) if \( g = 0 \), and 1 otherwise.
  730.          */
  731.         HF1 {
  732.             @Override
  733.             double position0(double p, int n) {
  734.                 // position = np + 0. This is 1-based so adjust to 0-based.
  735.                 return Math.ceil(n * p) - 1;
  736.             }
  737.         },
  738.         /**
  739.          * Similar to {@link #HF1} with averaging at discontinuities.
  740.          *
  741.          * <p>\( m = 0 \). \( \gamma = 0.5 \) if \( g = 0 \), and 1 otherwise.
  742.          */
  743.         HF2 {
  744.             @Override
  745.             double position0(double p, int n) {
  746.                 final double pos = n * p;
  747.                 // Average at discontinuities
  748.                 final int j = (int) pos;
  749.                 final double g = pos - j;
  750.                 if (g == 0) {
  751.                     return j - 0.5;
  752.                 }
  753.                 // As HF1 : ceil(j + g) - 1
  754.                 return j;
  755.             }
  756.         },
  757.         /**
  758.          * The observation closest to \( np \). Ties are resolved to the nearest even order statistic.
  759.          *
  760.          * <p>\( m = -1/2 \). \( \gamma = 0 \) if \( g = 0 \) and \( j \) is even, and 1 otherwise.
  761.          */
  762.         HF3 {
  763.             @Override
  764.             double position0(double p, int n) {
  765.                 // Let rint do the work for ties to even
  766.                 return Math.rint(n * p) - 1;
  767.             }
  768.         },
  769.         /**
  770.          * Linear interpolation of the inverse of the empirical CDF.
  771.          *
  772.          * <p>\( m = 0 \). \( p_k = \frac{k}{n} \).
  773.          */
  774.         HF4 {
  775.             @Override
  776.             double position0(double p, int n) {
  777.                 // np + 0 - 1
  778.                 return n * p - 1;
  779.             }
  780.         },
  781.         /**
  782.          * A piecewise linear function where the knots are the values midway through the steps of
  783.          * the empirical CDF. Proposed by Hazen (1914) and popular amongst hydrologists.
  784.          *
  785.          * <p>\( m = 1/2 \). \( p_k = \frac{k - 1/2}{n} \).
  786.          */
  787.         HF5 {
  788.             @Override
  789.             double position0(double p, int n) {
  790.                 // np + 0.5 - 1
  791.                 return n * p - 0.5;
  792.             }
  793.         },
  794.         /**
  795.          * Linear interpolation of the expectations for the order statistics for the uniform
  796.          * distribution on [0,1]. Proposed by Weibull (1939).
  797.          *
  798.          * <p>\( m = p \). \( p_k = \frac{k}{n + 1} \).
  799.          *
  800.          * <p>This method computes the quantile as per the Apache Commons Math Percentile
  801.          * legacy implementation.
  802.          */
  803.         HF6 {
  804.             @Override
  805.             double position0(double p, int n) {
  806.                 // np + p - 1
  807.                 return (n + 1) * p - 1;
  808.             }
  809.         },
  810.         /**
  811.          * Linear interpolation of the modes for the order statistics for the uniform
  812.          * distribution on [0,1]. Proposed by Gumbull (1939).
  813.          *
  814.          * <p>\( m = 1 - p \). \( p_k = \frac{k - 1}{n - 1} \).
  815.          */
  816.         HF7 {
  817.             @Override
  818.             double position0(double p, int n) {
  819.                 // np + 1-p - 1
  820.                 return (n - 1) * p;
  821.             }
  822.         },
  823.         /**
  824.          * Linear interpolation of the approximate medians for order statistics.
  825.          *
  826.          * <p>\( m = (p + 1)/3 \). \( p_k = \frac{k - 1/3}{n + 1/3} \).
  827.          *
  828.          * <p>As per Hyndman and Fan (1996) this approach is most recommended as it provides
  829.          * an approximate median-unbiased estimate regardless of distribution.
  830.          */
  831.         HF8 {
  832.             @Override
  833.             double position0(double p, int n) {
  834.                 return n * p + (p + 1) / 3 - 1;
  835.             }
  836.         },
  837.         /**
  838.          * Quantile estimates are approximately unbiased for the expected order statistics if
  839.          * \( x \) is normally distributed.
  840.          *
  841.          * <p>\( m = p/4 + 3/8 \). \( p_k = \frac{k - 3/8}{n + 1/4} \).
  842.          */
  843.         HF9 {
  844.             @Override
  845.             double position0(double p, int n) {
  846.                 // np + p/4 + 3/8 - 1
  847.                 return (n + 0.25) * p - 0.625;
  848.             }
  849.         };

  850.         /**
  851.          * Finds the real-valued position for calculation of the quantile.
  852.          *
  853.          * <p>Return {@code i + g} such that the quantile value from sorted data is:
  854.          *
  855.          * <p>value = data[i] + g * (data[i+1] - data[i])
  856.          *
  857.          * <p>Warning: Interpolation should not use {@code data[i+1]} unless {@code g != 0}.
  858.          *
  859.          * <p>Note: In contrast to the definition of Hyndman and Fan in the class header
  860.          * which uses a 1-based position, this is a zero based index. This change is for
  861.          * convenience when addressing array positions.
  862.          *
  863.          * @param p p<sup>th</sup> quantile.
  864.          * @param n Size.
  865.          * @return a real-valued position (0-based) into the range {@code [0, n)}
  866.          */
  867.         abstract double position0(double p, int n);

  868.         /**
  869.          * Finds the index {@code i} and fractional part {@code g} of a real-valued position
  870.          * to interpolate the quantile.
  871.          *
  872.          * <p>Return {@code i + g} such that the quantile value from sorted data is:
  873.          *
  874.          * <p>value = data[i] + g * (data[i+1] - data[i])
  875.          *
  876.          * <p>Note: Interpolation should not use {@code data[i+1]} unless {@code g != 0}.
  877.          *
  878.          * @param p p<sup>th</sup> quantile.
  879.          * @param n Size.
  880.          * @return index (in [0, n-1])
  881.          */
  882.         final double index(double p, int n) {
  883.             final double pos = position0(p, n);
  884.             // Bounds check in [0, n-1]
  885.             if (pos < 0) {
  886.                 return 0;
  887.             }
  888.             if (pos > n - 1.0) {
  889.                 return n - 1.0;
  890.             }
  891.             return pos;
  892.         }
  893.     }
  894. }