View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math3.stat.descriptive.rank;
18  
19  import java.io.Serializable;
20  import java.util.Arrays;
21  
22  import org.apache.commons.math3.exception.MathIllegalArgumentException;
23  import org.apache.commons.math3.exception.NullArgumentException;
24  import org.apache.commons.math3.exception.OutOfRangeException;
25  import org.apache.commons.math3.exception.util.LocalizedFormats;
26  import org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic;
27  import org.apache.commons.math3.util.FastMath;
28  import org.apache.commons.math3.util.MathUtils;
29  
30  /**
31   * Provides percentile computation.
32   * <p>
33   * There are several commonly used methods for estimating percentiles (a.k.a.
34   * quantiles) based on sample data.  For large samples, the different methods
35   * agree closely, but when sample sizes are small, different methods will give
36   * significantly different results.  The algorithm implemented here works as follows:
37   * <ol>
38   * <li>Let <code>n</code> be the length of the (sorted) array and
39   * <code>0 < p <= 100</code> be the desired percentile.</li>
40   * <li>If <code> n = 1 </code> return the unique array element (regardless of
41   * the value of <code>p</code>); otherwise </li>
42   * <li>Compute the estimated percentile position
43   * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>
44   * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
45   * part of <code>pos</code>).</li>
46   * <li> If <code>pos < 1</code> return the smallest element in the array.</li>
47   * <li> Else if <code>pos >= n</code> return the largest element in the array.</li>
48   * <li> Else let <code>lower</code> be the element in position
49   * <code>floor(pos)</code> in the array and let <code>upper</code> be the
50   * next element in the array.  Return <code>lower + d * (upper - lower)</code>
51   * </li>
52   * </ol></p>
53   * <p>
54   * To compute percentiles, the data must be at least partially ordered.  Input
55   * arrays are copied and recursively partitioned using an ordering definition.
56   * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
57   * by {@link java.lang.Double#compareTo(Double)}.  This ordering makes
58   * <code>Double.NaN</code> larger than any other value (including
59   * <code>Double.POSITIVE_INFINITY</code>).  Therefore, for example, the median
60   * (50th percentile) of
61   * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p>
62   * <p>
63   * Since percentile estimation usually involves interpolation between array
64   * elements, arrays containing  <code>NaN</code> or infinite values will often
65   * result in <code>NaN</code> or infinite values returned.</p>
66   * <p>
67   * Since 2.2, Percentile uses only selection instead of complete sorting
68   * and caches selection algorithm state between calls to the various
69   * {@code evaluate} methods. This greatly improves efficiency, both for a single
70   * percentile and multiple percentile computations. To maximize performance when
71   * multiple percentiles are computed based on the same data, users should set the
72   * data array once using either one of the {@link #evaluate(double[], double)} or
73   * {@link #setData(double[])} methods and thereafter {@link #evaluate(double)}
74   * with just the percentile provided.
75   * </p>
76   * <p>
77   * <strong>Note that this implementation is not synchronized.</strong> If
78   * multiple threads access an instance of this class concurrently, and at least
79   * one of the threads invokes the <code>increment()</code> or
80   * <code>clear()</code> method, it must be synchronized externally.</p>
81   *
82   * @version $Id: Percentile.java 1416643 2012-12-03 19:37:14Z tn $
83   */
84  public class Percentile extends AbstractUnivariateStatistic implements Serializable {
85  
86      /** Serializable version identifier */
87      private static final long serialVersionUID = -8091216485095130416L;
88  
89      /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
90      private static final int MIN_SELECT_SIZE = 15;
91  
92      /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
93      private static final int MAX_CACHED_LEVELS = 10;
94  
95      /** Determines what percentile is computed when evaluate() is activated
96       * with no quantile argument */
97      private double quantile = 0.0;
98  
99      /** Cached pivots. */
100     private int[] cachedPivots;
101 
102     /**
103      * Constructs a Percentile with a default quantile
104      * value of 50.0.
105      */
106     public Percentile() {
107         // No try-catch or advertised exception here - arg is valid
108         this(50.0);
109     }
110 
111     /**
112      * Constructs a Percentile with the specific quantile value.
113      * @param p the quantile
114      * @throws MathIllegalArgumentException  if p is not greater than 0 and less
115      * than or equal to 100
116      */
117     public Percentile(final double p) throws MathIllegalArgumentException {
118         setQuantile(p);
119         cachedPivots = null;
120     }
121 
122     /**
123      * Copy constructor, creates a new {@code Percentile} identical
124      * to the {@code original}
125      *
126      * @param original the {@code Percentile} instance to copy
127      * @throws NullArgumentException if original is null
128      */
129     public Percentile(Percentile original) throws NullArgumentException {
130         copy(original, this);
131     }
132 
133     /** {@inheritDoc} */
134     @Override
135     public void setData(final double[] values) {
136         if (values == null) {
137             cachedPivots = null;
138         } else {
139             cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
140             Arrays.fill(cachedPivots, -1);
141         }
142         super.setData(values);
143     }
144 
145     /** {@inheritDoc} */
146     @Override
147     public void setData(final double[] values, final int begin, final int length)
148     throws MathIllegalArgumentException {
149         if (values == null) {
150             cachedPivots = null;
151         } else {
152             cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
153             Arrays.fill(cachedPivots, -1);
154         }
155         super.setData(values, begin, length);
156     }
157 
158     /**
159      * Returns the result of evaluating the statistic over the stored data.
160      * <p>
161      * The stored array is the one which was set by previous calls to
162      * {@link #setData(double[])}
163      * </p>
164      * @param p the percentile value to compute
165      * @return the value of the statistic applied to the stored data
166      * @throws MathIllegalArgumentException if p is not a valid quantile value
167      * (p must be greater than 0 and less than or equal to 100)
168      */
169     public double evaluate(final double p) throws MathIllegalArgumentException {
170         return evaluate(getDataRef(), p);
171     }
172 
173     /**
174      * Returns an estimate of the <code>p</code>th percentile of the values
175      * in the <code>values</code> array.
176      * <p>
177      * Calls to this method do not modify the internal <code>quantile</code>
178      * state of this statistic.</p>
179      * <p>
180      * <ul>
181      * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
182      * <code>0</code></li>
183      * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
184      *  if <code>values</code> has length <code>1</code></li>
185      * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
186      * is null or p is not a valid quantile value (p must be greater than 0
187      * and less than or equal to 100) </li>
188      * </ul></p>
189      * <p>
190      * See {@link Percentile} for a description of the percentile estimation
191      * algorithm used.</p>
192      *
193      * @param values input array of values
194      * @param p the percentile value to compute
195      * @return the percentile value or Double.NaN if the array is empty
196      * @throws MathIllegalArgumentException if <code>values</code> is null
197      *     or p is invalid
198      */
199     public double evaluate(final double[] values, final double p)
200     throws MathIllegalArgumentException {
201         test(values, 0, 0);
202         return evaluate(values, 0, values.length, p);
203     }
204 
205     /**
206      * Returns an estimate of the <code>quantile</code>th percentile of the
207      * designated values in the <code>values</code> array.  The quantile
208      * estimated is determined by the <code>quantile</code> property.
209      * <p>
210      * <ul>
211      * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
212      * <li>Returns (for any value of <code>quantile</code>)
213      * <code>values[begin]</code> if <code>length = 1 </code></li>
214      * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
215      * is null, or <code>start</code> or <code>length</code> is invalid</li>
216      * </ul></p>
217      * <p>
218      * See {@link Percentile} for a description of the percentile estimation
219      * algorithm used.</p>
220      *
221      * @param values the input array
222      * @param start index of the first array element to include
223      * @param length the number of elements to include
224      * @return the percentile value
225      * @throws MathIllegalArgumentException if the parameters are not valid
226      *
227      */
228     @Override
229     public double evaluate(final double[] values, final int start, final int length)
230     throws MathIllegalArgumentException {
231         return evaluate(values, start, length, quantile);
232     }
233 
234      /**
235      * Returns an estimate of the <code>p</code>th percentile of the values
236      * in the <code>values</code> array, starting with the element in (0-based)
237      * position <code>begin</code> in the array and including <code>length</code>
238      * values.
239      * <p>
240      * Calls to this method do not modify the internal <code>quantile</code>
241      * state of this statistic.</p>
242      * <p>
243      * <ul>
244      * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
245      * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
246      *  if <code>length = 1 </code></li>
247      * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
248      *  is null , <code>begin</code> or <code>length</code> is invalid, or
249      * <code>p</code> is not a valid quantile value (p must be greater than 0
250      * and less than or equal to 100)</li>
251      * </ul></p>
252      * <p>
253      * See {@link Percentile} for a description of the percentile estimation
254      * algorithm used.</p>
255      *
256      * @param values array of input values
257      * @param p  the percentile to compute
258      * @param begin  the first (0-based) element to include in the computation
259      * @param length  the number of array elements to include
260      * @return  the percentile value
261      * @throws MathIllegalArgumentException if the parameters are not valid or the
262      * input array is null
263      */
264     public double evaluate(final double[] values, final int begin,
265             final int length, final double p) throws MathIllegalArgumentException {
266 
267         test(values, begin, length);
268 
269         if ((p > 100) || (p <= 0)) {
270             throw new OutOfRangeException(
271                     LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
272         }
273         if (length == 0) {
274             return Double.NaN;
275         }
276         if (length == 1) {
277             return values[begin]; // always return single value for n = 1
278         }
279         double n = length;
280         double pos = p * (n + 1) / 100;
281         double fpos = FastMath.floor(pos);
282         int intPos = (int) fpos;
283         double dif = pos - fpos;
284         double[] work;
285         int[] pivotsHeap;
286         if (values == getDataRef()) {
287             work = getDataRef();
288             pivotsHeap = cachedPivots;
289         } else {
290             work = new double[length];
291             System.arraycopy(values, begin, work, 0, length);
292             pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
293             Arrays.fill(pivotsHeap, -1);
294         }
295 
296         if (pos < 1) {
297             return select(work, pivotsHeap, 0);
298         }
299         if (pos >= n) {
300             return select(work, pivotsHeap, length - 1);
301         }
302         double lower = select(work, pivotsHeap, intPos - 1);
303         double upper = select(work, pivotsHeap, intPos);
304         return lower + dif * (upper - lower);
305     }
306 
307     /**
308      * Select the k<sup>th</sup> smallest element from work array
309      * @param work work array (will be reorganized during the call)
310      * @param pivotsHeap set of pivot index corresponding to elements that
311      * are already at their sorted location, stored as an implicit heap
312      * (i.e. a sorted binary tree stored in a flat array, where the
313      * children of a node at index n are at indices 2n+1 for the left
314      * child and 2n+2 for the right child, with 0-based indices)
315      * @param k index of the desired element
316      * @return k<sup>th</sup> smallest element
317      */
318     private double select(final double[] work, final int[] pivotsHeap, final int k) {
319 
320         int begin = 0;
321         int end   = work.length;
322         int node  = 0;
323 
324         while (end - begin > MIN_SELECT_SIZE) {
325 
326             final int pivot;
327             if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
328                 // the pivot has already been found in a previous call
329                 // and the array has already been partitioned around it
330                 pivot = pivotsHeap[node];
331             } else {
332                 // select a pivot and partition work array around it
333                 pivot = partition(work, begin, end, medianOf3(work, begin, end));
334                 if (node < pivotsHeap.length) {
335                     pivotsHeap[node] =  pivot;
336                 }
337             }
338 
339             if (k == pivot) {
340                 // the pivot was exactly the element we wanted
341                 return work[k];
342             } else if (k < pivot) {
343                 // the element is in the left partition
344                 end  = pivot;
345                 node = FastMath.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
346             } else {
347                 // the element is in the right partition
348                 begin = pivot + 1;
349                 node  = FastMath.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
350             }
351 
352         }
353 
354         // the element is somewhere in the small sub-array
355         // sort the sub-array using insertion sort
356         insertionSort(work, begin, end);
357         return work[k];
358 
359     }
360 
361     /** Select a pivot index as the median of three
362      * @param work data array
363      * @param begin index of the first element of the slice
364      * @param end index after the last element of the slice
365      * @return the index of the median element chosen between the
366      * first, the middle and the last element of the array slice
367      */
368     int medianOf3(final double[] work, final int begin, final int end) {
369 
370         final int inclusiveEnd = end - 1;
371         final int    middle    = begin + (inclusiveEnd - begin) / 2;
372         final double wBegin    = work[begin];
373         final double wMiddle   = work[middle];
374         final double wEnd      = work[inclusiveEnd];
375 
376         if (wBegin < wMiddle) {
377             if (wMiddle < wEnd) {
378                 return middle;
379             } else {
380                 return (wBegin < wEnd) ? inclusiveEnd : begin;
381             }
382         } else {
383             if (wBegin < wEnd) {
384                 return begin;
385             } else {
386                 return (wMiddle < wEnd) ? inclusiveEnd : middle;
387             }
388         }
389 
390     }
391 
392     /**
393      * Partition an array slice around a pivot
394      * <p>
395      * Partitioning exchanges array elements such that all elements
396      * smaller than pivot are before it and all elements larger than
397      * pivot are after it
398      * </p>
399      * @param work data array
400      * @param begin index of the first element of the slice
401      * @param end index after the last element of the slice
402      * @param pivot initial index of the pivot
403      * @return index of the pivot after partition
404      */
405     private int partition(final double[] work, final int begin, final int end, final int pivot) {
406 
407         final double value = work[pivot];
408         work[pivot] = work[begin];
409 
410         int i = begin + 1;
411         int j = end - 1;
412         while (i < j) {
413             while ((i < j) && (work[j] > value)) {
414                 --j;
415             }
416             while ((i < j) && (work[i] < value)) {
417                 ++i;
418             }
419 
420             if (i < j) {
421                 final double tmp = work[i];
422                 work[i++] = work[j];
423                 work[j--] = tmp;
424             }
425         }
426 
427         if ((i >= end) || (work[i] > value)) {
428             --i;
429         }
430         work[begin] = work[i];
431         work[i]     = value;
432         return i;
433 
434     }
435 
436     /**
437      * Sort in place a (small) array slice using insertion sort
438      * @param work array to sort
439      * @param begin index of the first element of the slice to sort
440      * @param end index after the last element of the slice to sort
441      */
442     private void insertionSort(final double[] work, final int begin, final int end) {
443         for (int j = begin + 1; j < end; j++) {
444             final double saved = work[j];
445             int i = j - 1;
446             while ((i >= begin) && (saved < work[i])) {
447                 work[i + 1] = work[i];
448                 i--;
449             }
450             work[i + 1] = saved;
451         }
452     }
453 
454     /**
455      * Returns the value of the quantile field (determines what percentile is
456      * computed when evaluate() is called with no quantile argument).
457      *
458      * @return quantile
459      */
460     public double getQuantile() {
461         return quantile;
462     }
463 
464     /**
465      * Sets the value of the quantile field (determines what percentile is
466      * computed when evaluate() is called with no quantile argument).
467      *
468      * @param p a value between 0 < p <= 100
469      * @throws MathIllegalArgumentException  if p is not greater than 0 and less
470      * than or equal to 100
471      */
472     public void setQuantile(final double p) throws MathIllegalArgumentException {
473         if (p <= 0 || p > 100) {
474             throw new OutOfRangeException(
475                     LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
476         }
477         quantile = p;
478     }
479 
480     /**
481      * {@inheritDoc}
482      */
483     @Override
484     public Percentile copy() {
485         Percentile result = new Percentile();
486         //No try-catch or advertised exception because args are guaranteed non-null
487         copy(this, result);
488         return result;
489     }
490 
491     /**
492      * Copies source to dest.
493      * <p>Neither source nor dest can be null.</p>
494      *
495      * @param source Percentile to copy
496      * @param dest Percentile to copy to
497      * @throws NullArgumentException if either source or dest is null
498      */
499     public static void copy(Percentile source, Percentile dest)
500         throws NullArgumentException {
501         MathUtils.checkNotNull(source);
502         MathUtils.checkNotNull(dest);
503         dest.setData(source.getDataRef());
504         if (source.cachedPivots != null) {
505             System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
506         }
507         dest.quantile = source.quantile;
508     }
509 
510 }