View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.numbers.examples.jmh.arrays;
18  
19  /**
20   * Support for creating {@link SearchableInterval}, {@link SearchableInterval2} and
21   * {@link UpdatingInterval} implementations.
22   *
23   * @since 1.2
24   */
25  final class IndexIntervals {
26      /** Size to perform key analysis. This avoids key analysis for a small number of keys. */
27      private static final int KEY_ANALYSIS_SIZE = 10;
28      /** The upper threshold to use a modified insertion sort to find unique indices. */
29      private static final int INDICES_INSERTION_SORT_SIZE = 20;
30  
31      /** Size to use a {@link BinarySearchKeyInterval}. Note that the
32       * {@link ScanningKeyInterval} uses points within the range to fast-forward
33       * scanning which improves performance significantly for a few hundred indices.
34       * Performance is similar when indices are in the thousands. Binary search is
35       * much faster when there are multiple thousands of indices. */
36      private static final int BINARY_SEARCH_SIZE = 2048;
37  
38      /** No instances. */
39      private IndexIntervals() {}
40  
41      /**
42       * Returns an interval that covers all indices ({@code [0, MAX_VALUE)}).
43       *
44       * <p>When used with a partition algorithm will cause a full sort
45       * of the range between the bounds {@code [ka, kb]}.
46       *
47       * @return the interval
48       */
49      static SearchableInterval anyIndex() {
50          return AnyIndex.INSTANCE;
51      }
52  
53      /**
54       * Returns an interval that covers all indices ({@code [0, MAX_VALUE)}).
55       *
56       * <p>When used with a partition algorithm will cause a full sort
57       * of the range between the bounds {@code [ka, kb]}.
58       *
59       * @return the interval
60       */
61      static SearchableInterval2 anyIndex2() {
62          return AnyIndex.INSTANCE;
63      }
64  
65      /**
66       * Returns an interval that covers a single index {@code k}. The interval cannot
67       * be split or the bounds updated.
68       *
69       * @param k Index.
70       * @return the interval
71       */
72      static UpdatingInterval interval(int k) {
73          return new PointInterval(k);
74      }
75  
76      /**
77       * Returns an interval that covers all indices {@code [left, right]}.
78       * This method will sort the input bound to ensure {@code left <= right}.
79       *
80       * <p>When used with a partition algorithm will cause a full sort
81       * of the range between the bounds {@code [left, right]}.
82       *
83       * @param left Left bound (inclusive).
84       * @param right Right bound (inclusive).
85       * @return the interval
86       */
87      static UpdatingInterval interval(int left, int right) {
88          // Sort the bound
89          final int l = left < right ? left : right;
90          final int r = left < right ? right : left;
91          return new RangeInterval(l, r);
92      }
93  
94      /**
95       * Returns an interval that covers the specified indices {@code k}.
96       *
97       * @param k Indices.
98       * @param n Count of indices (must be strictly positive).
99       * @return the interval
100      */
101     static SearchableInterval createSearchableInterval(int[] k, int n) {
102         // Note: A typical use case is to have a few indices. Thus the heuristics
103         // in this method should be very fast when n is small. Here we skip them
104         // completely when the number of keys is tiny.
105 
106         if (n > KEY_ANALYSIS_SIZE) {
107             // Here we use a simple test based on the number of comparisons required
108             // to perform the expected next/previous look-ups.
109             // It is expected that we can cut n keys a maximum of n-1 times.
110             // Each cut requires a scan next/previous to divide the interval into two intervals:
111             //
112             //            cut
113             //             |
114             //        k1--------k2---------k3---- ... ---------kn    initial interval
115             //         <--| find previous
116             //    find next |-->
117             //        k1        k2---------k3---- ... ---------kn    divided intervals
118             //
119             // A ScanningKeyIndexInterval will scan n keys in both directions using n comparisons
120             // (if next takes m comparisons then previous will take n - m comparisons): Order(n^2)
121             // An IndexSet will scan from the cut location and find a match in time proportional to
122             // the index density. Average density is (size / n) and the scanning covers 64
123             // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)
124 
125             // Get the range. This will throw an exception if there are no indices.
126             int min = k[n - 1];
127             int max = min;
128             for (int i = n - 1; --i >= 0;) {
129                 min = Math.min(min, k[i]);
130                 max = Math.max(max, k[i]);
131             }
132 
133             // Transition when n * n ~ size / 32
134             // Benchmarking shows this is a reasonable approximation when size is small.
135             // Speed of the IndexSet is approximately independent of n and proportional to size.
136             // Large size observes degrading performance more than expected from a linear relationship.
137             // Note the memory required is approximately (size / 8) bytes.
138             // We introduce a penalty for each 4x increase over size = 2^20 (== 128KiB).
139             // n * n = size/32 * 2^log4(size / 2^20)
140 
141             // Transition point: n = sqrt(size/32)
142             // size n
143             // 2^10 5.66
144             // 2^15 32.0
145             // 2^20 181.0
146 
147             // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
148             // size n
149             // 2^22 512.0
150             // 2^24 1448.2
151             // 2^28 11585
152             // 2^31 55108
153 
154             final int size = max - min + 1;
155 
156             // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
157             // At 2^31 the shift reduces to 0.
158             int shift = 5;
159             if (size > (1 << 20)) {
160                 // log4(size/2^20) == (log2(size) - 20) / 2
161                 shift -= (ceilLog2(size) - 20) >>> 1;
162             }
163 
164             if ((long) n * n > (size >> shift)) {
165                 // Do not call IndexSet.of(k, n) which repeats the min/max search
166                 // (especially given n is likely to be large).
167                 final IndexSet interval = IndexSet.ofRange(min, max);
168                 for (int i = n; --i >= 0;) {
169                     interval.set(k[i]);
170                 }
171                 return interval;
172             }
173 
174             // Switch to binary search above a threshold.
175             // Note this invalidates the speed assumptions based on the number of comparisons.
176             // Benchmarking shows this is useful when the keys are in the thousands so this
177             // would be used when data size is in the millions.
178             if (n > BINARY_SEARCH_SIZE) {
179                 final int unique = Sorting.sortIndices2(k, n);
180                 return BinarySearchKeyInterval.of(k, unique);
181             }
182 
183             // Fall-though to the ScanningKeyIndexInterval...
184         }
185 
186         // This is the typical use case.
187         // Here n is small, or small compared to the min/max range of indices.
188         // Use a special method to sort unique indices (detects already sorted indices).
189         final int unique = Sorting.sortIndices2(k, n);
190 
191         return ScanningKeyInterval.of(k, unique);
192     }
193 
194     /**
195      * Returns an interval that covers the specified indices {@code k}.
196      *
197      * @param k Indices.
198      * @param n Count of indices (must be strictly positive).
199      * @return the interval
200      */
201     static UpdatingInterval createUpdatingInterval(int[] k, int n) {
202         // Note: A typical use case is to have a few indices. Thus the heuristics
203         // in this method should be very fast when n is small.
204         // We have a choice between a KeyUpdatingInterval which requires
205         // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
206         // The purpose of the heuristics is to avoid a very bad choice of data structure,
207         // rather than choosing the best data structure in all situations. As long as the
208         // choice is reasonable the speed will not impact a partition algorithm.
209 
210         // Simple cases
211         if (n < 3) {
212             if (n == 1 || k[0] == k[1]) {
213                 // 1 unique value
214                 return IndexIntervals.interval(k[0]);
215             }
216             // 2 unique values
217             if (Math.abs(k[0] - k[1]) == 1) {
218                 // Small range
219                 return IndexIntervals.interval(k[0], k[1]);
220             }
221             // 2 well separated values
222             if (k[1] < k[0]) {
223                 final int v = k[0];
224                 k[0] = k[1];
225                 k[1] = v;
226             }
227             return KeyUpdatingInterval.of(k, 2);
228         }
229 
230         // Strategy: Must be fast on already ascending data.
231         // Note: The recommended way to generate a lot of partition indices is to
232         // generate in sequence.
233 
234         // n <= small:
235         //   Modified insertion sort (naturally finds ascending data)
236         // n > small:
237         //   Look for ascending sequence and compact
238         // else:
239         //   Remove duplicates using an order(1) data structure and sort
240 
241         if (n <= INDICES_INSERTION_SORT_SIZE) {
242             final int unique = Sorting.sortIndicesInsertionSort(k, n);
243             return KeyUpdatingInterval.of(k, unique);
244         }
245 
246         if (isAscending(k, n)) {
247             // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
248             // BitIndexUpdatingInterval depending on data length but not significantly
249             // slower and the difference is lost in the time taken for partitioning.
250             // So always use the keys.
251             final int unique = compressDuplicates(k, n);
252             return KeyUpdatingInterval.of(k, unique);
253         }
254 
255         // At least 20 indices that are partially unordered.
256 
257         // Find min/max to understand the range.
258         int min = k[n - 1];
259         int max = min;
260         for (int i = n - 1; --i >= 0;) {
261             min = Math.min(min, k[i]);
262             max = Math.max(max, k[i]);
263         }
264 
265         // Here we use a simple test based on the number of comparisons required
266         // to perform the expected next/previous look-ups after a split.
267         // It is expected that we can cut n keys a maximum of n-1 times.
268         // Each cut requires a scan next/previous to divide the interval into two intervals:
269         //
270         //            cut
271         //             |
272         //        k1--------k2---------k3---- ... ---------kn    initial interval
273         //         <--| find previous
274         //    find next |-->
275         //        k1        k2---------k3---- ... ---------kn    divided intervals
276         //
277         // An BitSet will scan from the cut location and find a match in time proportional to
278         // the index density. Average density is (size / n) and the scanning covers 64
279         // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)
280 
281         // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
282         // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)
283 
284         // Transition when n * n ~ size / 32
285         // Benchmarking shows this is a reasonable approximation when size < 2^20.
286         // The speed of the bit keys is approximately independent of n and proportional to size.
287         // Large size observes degrading performance of the bit keys vs sorted keys.
288         // We introduce a penalty for each 4x increase over size = 2^20.
289         // n * n = size/32 * 2^log4(size / 2^20)
290         // The transition point still favours the bit keys when sorted keys would be faster.
291         // However the difference is held within 4x and the BitSet type structure is still fast
292         // enough to be negligible against the speed of partitioning.
293 
294         // Transition point: n = sqrt(size/32)
295         // size n
296         // 2^10 5.66
297         // 2^15 32.0
298         // 2^20 181.0
299 
300         // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
301         // size n
302         // 2^22 512.0
303         // 2^24 1448.2
304         // 2^28 11585
305         // 2^31 55108
306 
307         final int size = max - min + 1;
308 
309         // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
310         // At 2^31 the shift reduces to 0.
311         int shift = 5;
312         if (size > (1 << 20)) {
313             // log4(size/2^20) == (log2(size) - 20) / 2
314             shift -= (ceilLog2(size) - 20) >>> 1;
315         }
316 
317         if ((long) n * n > (size >> shift)) {
318             // Do not call BitIndexUpdatingInterval.of(k, n) which repeats the min/max search
319             // (especially given n is likely to be large).
320             final BitIndexUpdatingInterval interval = BitIndexUpdatingInterval.ofRange(min, max);
321             for (int i = n; --i >= 0;) {
322                 interval.set(k[i]);
323             }
324             return interval;
325         }
326 
327         // Sort with a hash set to filter indices
328         final int unique = Sorting.sortIndicesHashIndexSet(k, n);
329         return KeyUpdatingInterval.of(k, unique);
330     }
331 
332     /**
333      * Test the data is in ascending order: {@code data[i] <= data[i+1]} for all {@code i}.
334      * Data is assumed to be at least length 1.
335      *
336      * @param data Data.
337      * @param n Length of data.
338      * @return true if ascending
339      */
340     private static boolean isAscending(int[] data, int n) {
341         for (int i = 0; ++i < n;) {
342             if (data[i] < data[i - 1]) {
343                 // descending
344                 return false;
345             }
346         }
347         return true;
348     }
349 
350     /**
351      * Compress duplicates in the ascending data.
352      *
353      * <p>Warning: Requires {@code n > 0}.
354      *
355      * @param data Indices.
356      * @param n Number of indices.
357      * @return the number of unique indices
358      */
359     private static int compressDuplicates(int[] data, int n) {
360         // Compress to remove duplicates
361         int last = 0;
362         int top = data[0];
363         for (int i = 0; ++i < n;) {
364             final int v = data[i];
365             if (v == top) {
366                 continue;
367             }
368             top = v;
369             data[++last] = v;
370         }
371         return last + 1;
372     }
373 
374     /**
375      * Compute {@code ceil(log2(x))}. This is valid for all strictly positive {@code x}.
376      *
377      * <p>Returns -1 for {@code x = 0} in place of -infinity.
378      *
379      * @param x Value.
380      * @return {@code ceil(log2(x))}
381      */
382     private static int ceilLog2(int x) {
383         return 32 - Integer.numberOfLeadingZeros(x - 1);
384     }
385 
386     /**
387      * {@link SearchableInterval} for range {@code [0, MAX_VALUE)}.
388      */
389     private static final class AnyIndex implements SearchableInterval, SearchableInterval2 {
390         /** Singleton instance. */
391         static final AnyIndex INSTANCE = new AnyIndex();
392 
393         @Override
394         public int left() {
395             return 0;
396         }
397 
398         @Override
399         public int right() {
400             return Integer.MAX_VALUE - 1;
401         }
402 
403         @Override
404         public int previousIndex(int k) {
405             return k;
406         }
407 
408         @Override
409         public int nextIndex(int k) {
410             return k;
411         }
412 
413         @Override
414         public int split(int ka, int kb, int[] upper) {
415             upper[0] = kb + 1;
416             return ka - 1;
417         }
418 
419         // IndexInterval2
420         // This is exactly the same as IndexInterval as the pointers i are the same as the keys k
421 
422         @Override
423         public int start() {
424             return 0;
425         }
426 
427         @Override
428         public int end() {
429             return Integer.MAX_VALUE - 1;
430         }
431 
432         @Override
433         public int index(int i) {
434             return i;
435         }
436 
437         @Override
438         public int previous(int i, int k) {
439             return k;
440         }
441 
442         @Override
443         public int next(int i, int k) {
444             return k;
445         }
446 
447         @Override
448         public int split(int i1, int i2, int ka, int kb, int[] upper) {
449             upper[0] = kb + 1;
450             return ka - 1;
451         }
452     }
453 
454     /**
455      * {@link UpdatingInterval} for a single {@code index}.
456      */
457     static final class PointInterval implements UpdatingInterval {
458         /** Left/right bound of the interval. */
459         private final int index;
460 
461         /**
462          * @param k Left/right bound.
463          */
464         PointInterval(int k) {
465             this.index = k;
466         }
467 
468         @Override
469         public int left() {
470             return index;
471         }
472 
473         @Override
474         public int right() {
475             return index;
476         }
477 
478         // Note: An UpdatingInterval is only required to update when a target index
479         // is within [left, right]. This is not possible for a single point.
480 
481         @Override
482         public int updateLeft(int k) {
483             throw new UnsupportedOperationException("updateLeft should not be called");
484         }
485 
486         @Override
487         public int updateRight(int k) {
488             throw new UnsupportedOperationException("updateRight should not be called");
489         }
490 
491         @Override
492         public UpdatingInterval splitLeft(int ka, int kb) {
493             throw new UnsupportedOperationException("splitLeft should not be called");
494         }
495 
496         @Override
497         public UpdatingInterval splitRight(int ka, int kb) {
498             throw new UnsupportedOperationException("splitRight should not be called");
499         }
500     }
501 
502     /**
503      * {@link UpdatingInterval} for range {@code [left, right]}.
504      */
505     static final class RangeInterval implements UpdatingInterval {
506         /** Left bound of the interval. */
507         private int left;
508         /** Right bound of the interval. */
509         private int right;
510 
511         /**
512          * @param left Left bound.
513          * @param right Right bound.
514          */
515         RangeInterval(int left, int right) {
516             this.left = left;
517             this.right = right;
518         }
519 
520         @Override
521         public int left() {
522             return left;
523         }
524 
525         @Override
526         public int right() {
527             return right;
528         }
529 
530         @Override
531         public int updateLeft(int k) {
532             // Assume left < k <= right
533             left = k;
534             return k;
535         }
536 
537         @Override
538         public int updateRight(int k) {
539             // Assume left <= k < right
540             right = k;
541             return k;
542         }
543 
544         @Override
545         public UpdatingInterval splitLeft(int ka, int kb) {
546             // Assume left < ka <= kb < right
547             final int lower = left;
548             left = kb + 1;
549             return new RangeInterval(lower, ka - 1);
550         }
551 
552         @Override
553         public UpdatingInterval splitRight(int ka, int kb) {
554             // Assume left < ka <= kb < right
555             final int upper = right;
556             right = ka - 1;
557             return new RangeInterval(kb + 1, upper);
558         }
559     }
560 }