001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math3.stat.descriptive.rank;
018    
019    import java.io.Serializable;
020    import java.util.Arrays;
021    
022    import org.apache.commons.math3.exception.MathIllegalArgumentException;
023    import org.apache.commons.math3.exception.NullArgumentException;
024    import org.apache.commons.math3.exception.OutOfRangeException;
025    import org.apache.commons.math3.exception.util.LocalizedFormats;
026    import org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic;
027    import org.apache.commons.math3.util.FastMath;
028    import org.apache.commons.math3.util.MathUtils;
029    
030    /**
031     * Provides percentile computation.
032     * <p>
033     * There are several commonly used methods for estimating percentiles (a.k.a.
034     * quantiles) based on sample data.  For large samples, the different methods
035     * agree closely, but when sample sizes are small, different methods will give
036     * significantly different results.  The algorithm implemented here works as follows:
037     * <ol>
038     * <li>Let <code>n</code> be the length of the (sorted) array and
039     * <code>0 < p <= 100</code> be the desired percentile.</li>
040     * <li>If <code> n = 1 </code> return the unique array element (regardless of
041     * the value of <code>p</code>); otherwise </li>
042     * <li>Compute the estimated percentile position
043     * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>
044     * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
045     * part of <code>pos</code>).</li>
046     * <li> If <code>pos < 1</code> return the smallest element in the array.</li>
047     * <li> Else if <code>pos >= n</code> return the largest element in the array.</li>
048     * <li> Else let <code>lower</code> be the element in position
049     * <code>floor(pos)</code> in the array and let <code>upper</code> be the
050     * next element in the array.  Return <code>lower + d * (upper - lower)</code>
051     * </li>
052     * </ol></p>
053     * <p>
054     * To compute percentiles, the data must be at least partially ordered.  Input
055     * arrays are copied and recursively partitioned using an ordering definition.
056     * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
057     * by {@link java.lang.Double#compareTo(Double)}.  This ordering makes
058     * <code>Double.NaN</code> larger than any other value (including
059     * <code>Double.POSITIVE_INFINITY</code>).  Therefore, for example, the median
060     * (50th percentile) of
061     * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p>
062     * <p>
063     * Since percentile estimation usually involves interpolation between array
064     * elements, arrays containing  <code>NaN</code> or infinite values will often
065     * result in <code>NaN</code> or infinite values returned.</p>
066     * <p>
067     * Since 2.2, Percentile uses only selection instead of complete sorting
068     * and caches selection algorithm state between calls to the various
069     * {@code evaluate} methods. This greatly improves efficiency, both for a single
070     * percentile and multiple percentile computations. To maximize performance when
071     * multiple percentiles are computed based on the same data, users should set the
072     * data array once using either one of the {@link #evaluate(double[], double)} or
073     * {@link #setData(double[])} methods and thereafter {@link #evaluate(double)}
074     * with just the percentile provided.
075     * </p>
076     * <p>
077     * <strong>Note that this implementation is not synchronized.</strong> If
078     * multiple threads access an instance of this class concurrently, and at least
079     * one of the threads invokes the <code>increment()</code> or
080     * <code>clear()</code> method, it must be synchronized externally.</p>
081     *
082     * @version $Id: Percentile.java 1416643 2012-12-03 19:37:14Z tn $
083     */
084    public class Percentile extends AbstractUnivariateStatistic implements Serializable {
085    
086        /** Serializable version identifier */
087        private static final long serialVersionUID = -8091216485095130416L;
088    
089        /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
090        private static final int MIN_SELECT_SIZE = 15;
091    
092        /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
093        private static final int MAX_CACHED_LEVELS = 10;
094    
095        /** Determines what percentile is computed when evaluate() is activated
096         * with no quantile argument */
097        private double quantile = 0.0;
098    
099        /** Cached pivots. */
100        private int[] cachedPivots;
101    
102        /**
103         * Constructs a Percentile with a default quantile
104         * value of 50.0.
105         */
106        public Percentile() {
107            // No try-catch or advertised exception here - arg is valid
108            this(50.0);
109        }
110    
111        /**
112         * Constructs a Percentile with the specific quantile value.
113         * @param p the quantile
114         * @throws MathIllegalArgumentException  if p is not greater than 0 and less
115         * than or equal to 100
116         */
117        public Percentile(final double p) throws MathIllegalArgumentException {
118            setQuantile(p);
119            cachedPivots = null;
120        }
121    
122        /**
123         * Copy constructor, creates a new {@code Percentile} identical
124         * to the {@code original}
125         *
126         * @param original the {@code Percentile} instance to copy
127         * @throws NullArgumentException if original is null
128         */
129        public Percentile(Percentile original) throws NullArgumentException {
130            copy(original, this);
131        }
132    
133        /** {@inheritDoc} */
134        @Override
135        public void setData(final double[] values) {
136            if (values == null) {
137                cachedPivots = null;
138            } else {
139                cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
140                Arrays.fill(cachedPivots, -1);
141            }
142            super.setData(values);
143        }
144    
145        /** {@inheritDoc} */
146        @Override
147        public void setData(final double[] values, final int begin, final int length)
148        throws MathIllegalArgumentException {
149            if (values == null) {
150                cachedPivots = null;
151            } else {
152                cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
153                Arrays.fill(cachedPivots, -1);
154            }
155            super.setData(values, begin, length);
156        }
157    
158        /**
159         * Returns the result of evaluating the statistic over the stored data.
160         * <p>
161         * The stored array is the one which was set by previous calls to
162         * {@link #setData(double[])}
163         * </p>
164         * @param p the percentile value to compute
165         * @return the value of the statistic applied to the stored data
166         * @throws MathIllegalArgumentException if p is not a valid quantile value
167         * (p must be greater than 0 and less than or equal to 100)
168         */
169        public double evaluate(final double p) throws MathIllegalArgumentException {
170            return evaluate(getDataRef(), p);
171        }
172    
173        /**
174         * Returns an estimate of the <code>p</code>th percentile of the values
175         * in the <code>values</code> array.
176         * <p>
177         * Calls to this method do not modify the internal <code>quantile</code>
178         * state of this statistic.</p>
179         * <p>
180         * <ul>
181         * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
182         * <code>0</code></li>
183         * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
184         *  if <code>values</code> has length <code>1</code></li>
185         * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
186         * is null or p is not a valid quantile value (p must be greater than 0
187         * and less than or equal to 100) </li>
188         * </ul></p>
189         * <p>
190         * See {@link Percentile} for a description of the percentile estimation
191         * algorithm used.</p>
192         *
193         * @param values input array of values
194         * @param p the percentile value to compute
195         * @return the percentile value or Double.NaN if the array is empty
196         * @throws MathIllegalArgumentException if <code>values</code> is null
197         *     or p is invalid
198         */
199        public double evaluate(final double[] values, final double p)
200        throws MathIllegalArgumentException {
201            test(values, 0, 0);
202            return evaluate(values, 0, values.length, p);
203        }
204    
205        /**
206         * Returns an estimate of the <code>quantile</code>th percentile of the
207         * designated values in the <code>values</code> array.  The quantile
208         * estimated is determined by the <code>quantile</code> property.
209         * <p>
210         * <ul>
211         * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
212         * <li>Returns (for any value of <code>quantile</code>)
213         * <code>values[begin]</code> if <code>length = 1 </code></li>
214         * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
215         * is null, or <code>start</code> or <code>length</code> is invalid</li>
216         * </ul></p>
217         * <p>
218         * See {@link Percentile} for a description of the percentile estimation
219         * algorithm used.</p>
220         *
221         * @param values the input array
222         * @param start index of the first array element to include
223         * @param length the number of elements to include
224         * @return the percentile value
225         * @throws MathIllegalArgumentException if the parameters are not valid
226         *
227         */
228        @Override
229        public double evaluate(final double[] values, final int start, final int length)
230        throws MathIllegalArgumentException {
231            return evaluate(values, start, length, quantile);
232        }
233    
234         /**
235         * Returns an estimate of the <code>p</code>th percentile of the values
236         * in the <code>values</code> array, starting with the element in (0-based)
237         * position <code>begin</code> in the array and including <code>length</code>
238         * values.
239         * <p>
240         * Calls to this method do not modify the internal <code>quantile</code>
241         * state of this statistic.</p>
242         * <p>
243         * <ul>
244         * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
245         * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
246         *  if <code>length = 1 </code></li>
247         * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
248         *  is null , <code>begin</code> or <code>length</code> is invalid, or
249         * <code>p</code> is not a valid quantile value (p must be greater than 0
250         * and less than or equal to 100)</li>
251         * </ul></p>
252         * <p>
253         * See {@link Percentile} for a description of the percentile estimation
254         * algorithm used.</p>
255         *
256         * @param values array of input values
257         * @param p  the percentile to compute
258         * @param begin  the first (0-based) element to include in the computation
259         * @param length  the number of array elements to include
260         * @return  the percentile value
261         * @throws MathIllegalArgumentException if the parameters are not valid or the
262         * input array is null
263         */
264        public double evaluate(final double[] values, final int begin,
265                final int length, final double p) throws MathIllegalArgumentException {
266    
267            test(values, begin, length);
268    
269            if ((p > 100) || (p <= 0)) {
270                throw new OutOfRangeException(
271                        LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
272            }
273            if (length == 0) {
274                return Double.NaN;
275            }
276            if (length == 1) {
277                return values[begin]; // always return single value for n = 1
278            }
279            double n = length;
280            double pos = p * (n + 1) / 100;
281            double fpos = FastMath.floor(pos);
282            int intPos = (int) fpos;
283            double dif = pos - fpos;
284            double[] work;
285            int[] pivotsHeap;
286            if (values == getDataRef()) {
287                work = getDataRef();
288                pivotsHeap = cachedPivots;
289            } else {
290                work = new double[length];
291                System.arraycopy(values, begin, work, 0, length);
292                pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
293                Arrays.fill(pivotsHeap, -1);
294            }
295    
296            if (pos < 1) {
297                return select(work, pivotsHeap, 0);
298            }
299            if (pos >= n) {
300                return select(work, pivotsHeap, length - 1);
301            }
302            double lower = select(work, pivotsHeap, intPos - 1);
303            double upper = select(work, pivotsHeap, intPos);
304            return lower + dif * (upper - lower);
305        }
306    
307        /**
308         * Select the k<sup>th</sup> smallest element from work array
309         * @param work work array (will be reorganized during the call)
310         * @param pivotsHeap set of pivot index corresponding to elements that
311         * are already at their sorted location, stored as an implicit heap
312         * (i.e. a sorted binary tree stored in a flat array, where the
313         * children of a node at index n are at indices 2n+1 for the left
314         * child and 2n+2 for the right child, with 0-based indices)
315         * @param k index of the desired element
316         * @return k<sup>th</sup> smallest element
317         */
318        private double select(final double[] work, final int[] pivotsHeap, final int k) {
319    
320            int begin = 0;
321            int end   = work.length;
322            int node  = 0;
323    
324            while (end - begin > MIN_SELECT_SIZE) {
325    
326                final int pivot;
327                if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
328                    // the pivot has already been found in a previous call
329                    // and the array has already been partitioned around it
330                    pivot = pivotsHeap[node];
331                } else {
332                    // select a pivot and partition work array around it
333                    pivot = partition(work, begin, end, medianOf3(work, begin, end));
334                    if (node < pivotsHeap.length) {
335                        pivotsHeap[node] =  pivot;
336                    }
337                }
338    
339                if (k == pivot) {
340                    // the pivot was exactly the element we wanted
341                    return work[k];
342                } else if (k < pivot) {
343                    // the element is in the left partition
344                    end  = pivot;
345                    node = FastMath.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
346                } else {
347                    // the element is in the right partition
348                    begin = pivot + 1;
349                    node  = FastMath.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
350                }
351    
352            }
353    
354            // the element is somewhere in the small sub-array
355            // sort the sub-array using insertion sort
356            insertionSort(work, begin, end);
357            return work[k];
358    
359        }
360    
361        /** Select a pivot index as the median of three
362         * @param work data array
363         * @param begin index of the first element of the slice
364         * @param end index after the last element of the slice
365         * @return the index of the median element chosen between the
366         * first, the middle and the last element of the array slice
367         */
368        int medianOf3(final double[] work, final int begin, final int end) {
369    
370            final int inclusiveEnd = end - 1;
371            final int    middle    = begin + (inclusiveEnd - begin) / 2;
372            final double wBegin    = work[begin];
373            final double wMiddle   = work[middle];
374            final double wEnd      = work[inclusiveEnd];
375    
376            if (wBegin < wMiddle) {
377                if (wMiddle < wEnd) {
378                    return middle;
379                } else {
380                    return (wBegin < wEnd) ? inclusiveEnd : begin;
381                }
382            } else {
383                if (wBegin < wEnd) {
384                    return begin;
385                } else {
386                    return (wMiddle < wEnd) ? inclusiveEnd : middle;
387                }
388            }
389    
390        }
391    
392        /**
393         * Partition an array slice around a pivot
394         * <p>
395         * Partitioning exchanges array elements such that all elements
396         * smaller than pivot are before it and all elements larger than
397         * pivot are after it
398         * </p>
399         * @param work data array
400         * @param begin index of the first element of the slice
401         * @param end index after the last element of the slice
402         * @param pivot initial index of the pivot
403         * @return index of the pivot after partition
404         */
405        private int partition(final double[] work, final int begin, final int end, final int pivot) {
406    
407            final double value = work[pivot];
408            work[pivot] = work[begin];
409    
410            int i = begin + 1;
411            int j = end - 1;
412            while (i < j) {
413                while ((i < j) && (work[j] > value)) {
414                    --j;
415                }
416                while ((i < j) && (work[i] < value)) {
417                    ++i;
418                }
419    
420                if (i < j) {
421                    final double tmp = work[i];
422                    work[i++] = work[j];
423                    work[j--] = tmp;
424                }
425            }
426    
427            if ((i >= end) || (work[i] > value)) {
428                --i;
429            }
430            work[begin] = work[i];
431            work[i]     = value;
432            return i;
433    
434        }
435    
436        /**
437         * Sort in place a (small) array slice using insertion sort
438         * @param work array to sort
439         * @param begin index of the first element of the slice to sort
440         * @param end index after the last element of the slice to sort
441         */
442        private void insertionSort(final double[] work, final int begin, final int end) {
443            for (int j = begin + 1; j < end; j++) {
444                final double saved = work[j];
445                int i = j - 1;
446                while ((i >= begin) && (saved < work[i])) {
447                    work[i + 1] = work[i];
448                    i--;
449                }
450                work[i + 1] = saved;
451            }
452        }
453    
454        /**
455         * Returns the value of the quantile field (determines what percentile is
456         * computed when evaluate() is called with no quantile argument).
457         *
458         * @return quantile
459         */
460        public double getQuantile() {
461            return quantile;
462        }
463    
464        /**
465         * Sets the value of the quantile field (determines what percentile is
466         * computed when evaluate() is called with no quantile argument).
467         *
468         * @param p a value between 0 < p <= 100
469         * @throws MathIllegalArgumentException  if p is not greater than 0 and less
470         * than or equal to 100
471         */
472        public void setQuantile(final double p) throws MathIllegalArgumentException {
473            if (p <= 0 || p > 100) {
474                throw new OutOfRangeException(
475                        LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
476            }
477            quantile = p;
478        }
479    
480        /**
481         * {@inheritDoc}
482         */
483        @Override
484        public Percentile copy() {
485            Percentile result = new Percentile();
486            //No try-catch or advertised exception because args are guaranteed non-null
487            copy(this, result);
488            return result;
489        }
490    
491        /**
492         * Copies source to dest.
493         * <p>Neither source nor dest can be null.</p>
494         *
495         * @param source Percentile to copy
496         * @param dest Percentile to copy to
497         * @throws NullArgumentException if either source or dest is null
498         */
499        public static void copy(Percentile source, Percentile dest)
500            throws NullArgumentException {
501            MathUtils.checkNotNull(source);
502            MathUtils.checkNotNull(dest);
503            dest.setData(source.getDataRef());
504            if (source.cachedPivots != null) {
505                System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
506            }
507            dest.quantile = source.quantile;
508        }
509    
510    }