View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.numbers.arrays;
18  
19  /**
20   * Support for creating {@link UpdatingInterval} implementations and validating indices.
21   *
22   * @since 1.2
23   */
24  final class IndexSupport {
25      /** The upper threshold to use a modified insertion sort to find unique indices. */
26      private static final int INSERTION_SORT_SIZE = 20;
27  
28      /** No instances. */
29      private IndexSupport() {}
30  
31      /**
32       * Returns an interval that covers the specified indices {@code k}.
33       *
34       * @param k Indices.
35       * @param n Count of indices (must be strictly positive).
36       * @return the interval
37       */
38      static UpdatingInterval createUpdatingInterval(int[] k, int n) {
39          // Note: A typical use case is to have a few indices. Thus the heuristics
40          // in this method should be very fast when n is small.
41          // We have a choice between a KeyUpdatingInterval which requires
42          // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
43          // The purpose of the heuristics is to avoid a very bad choice of data structure,
44          // rather than choosing the best data structure in all situations. As long as the
45          // choice is reasonable the speed will not impact a partition algorithm.
46  
47          // Simple cases
48          if (n == 2) {
49              if (k[0] == k[1]) {
50                  return newUpdatingInterval(k, 1);
51              }
52              if (k[1] < k[0]) {
53                  final int v = k[0];
54                  k[0] = k[1];
55                  k[1] = v;
56              }
57              return newUpdatingInterval(k, 2);
58          }
59  
60          // Strategy: Must be fast on already ascending data.
61          // Note: The recommended way to generate a lot of partition indices is to
62          // generate in sequence.
63  
64          // n <= small:
65          //   Modified insertion sort (naturally finds ascending data)
66          // n > small:
67          //   Look for ascending sequence and compact
68          // else:
69          //   Remove duplicates using an order(1) data structure and sort
70  
71          if (n <= INSERTION_SORT_SIZE) {
72              final int unique = Sorting.insertionSortIndices(k, n);
73              return newUpdatingInterval(k, unique);
74          }
75  
76          if (isAscending(k, n)) {
77              // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
78              // BitIndexUpdatingInterval depending on data length but not significantly
79              // slower and the difference is lost in the time taken for partitioning.
80              // So always use the keys.
81              final int unique = compressDuplicates(k, n);
82              return newUpdatingInterval(k, unique);
83          }
84  
85          // At least 20 indices that are partially unordered.
86  
87          // Find min/max to understand the range.
88          int min = k[n - 1];
89          int max = min;
90          for (int i = n - 2; i >= 0; i--) {
91              min = Math.min(min, k[i]);
92              max = Math.max(max, k[i]);
93          }
94  
95          // Here we use a simple test based on the number of comparisons required
96          // to perform the expected next/previous look-ups after a split.
97          // It is expected that we can cut n keys a maximum of n-1 times.
98          // Each cut requires a scan next/previous to divide the interval into two intervals:
99          //
100         //            cut
101         //             |
102         //        k1--------k2---------k3---- ... ---------kn    initial interval
103         //         <--| find previous
104         //    find next |-->
105         //        k1        k2---------k3---- ... ---------kn    divided intervals
106         //
107         // An BitSet will scan from the cut location and find a match in time proportional to
108         // the index density. Average density is (size / n) and the scanning covers 64
109         // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)
110 
111         // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
112         // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)
113 
114         // Transition when n * n ~ size / 32
115         // Benchmarking shows this is a reasonable approximation when size < 2^20.
116         // The speed of the bit keys is approximately independent of n and proportional to size.
117         // Large size observes degrading performance of the bit keys vs sorted keys.
118         // We introduce a penalty for each 4x increase over size = 2^20.
119         // n * n = size/32 * 2^log4(size / 2^20)
120         // The transition point still favours the bit keys when sorted keys would be faster.
121         // However the difference is held within 4x and the BitSet type structure is still fast
122         // enough to be negligible against the speed of partitioning.
123 
124         // Transition point: n = sqrt(size/32)
125         // size n
126         // 2^10 5.66
127         // 2^15 32.0
128         // 2^20 181.0
129 
130         // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
131         // size n
132         // 2^22 512.0
133         // 2^24 1448.2
134         // 2^28 11585
135         // 2^31 55108
136 
137         final int size = max - min + 1;
138 
139         // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
140         // At 2^31 the shift reduces to 0.
141         int shift = 5;
142         if (size > (1 << 20)) {
143             // log4(size/2^20) == (log2(size) - 20) / 2
144             shift -= (ceilLog2(size) - 20) >>> 1;
145         }
146 
147         if ((long) n * n > (size >> shift)) {
148             final BitIndexUpdatingInterval interval = new BitIndexUpdatingInterval(min, max);
149             for (int i = n; --i >= 0;) {
150                 interval.set(k[i]);
151             }
152             return interval;
153         }
154 
155         // Sort with a hash set to filter indices
156         final int unique = Sorting.sortIndices(k, n);
157         return new KeyUpdatingInterval(k, unique);
158     }
159 
160     /**
161      * Test the data is in ascending order: {@code data[i] <= data[i+1]} for all {@code i}.
162      * Data is assumed to be at least length 1.
163      *
164      * @param data Data.
165      * @param n Length of data.
166      * @return true if ascending
167      */
168     private static boolean isAscending(int[] data, int n) {
169         for (int i = 1; i < n; i++) {
170             if (data[i] < data[i - 1]) {
171                 // descending
172                 return false;
173             }
174         }
175         return true;
176     }
177 
178     /**
179      * Compress duplicates in the ascending data.
180      *
181      * <p>Warning: Requires {@code n > 0}.
182      *
183      * @param data Indices.
184      * @param n Number of indices.
185      * @return the number of unique indices
186      */
187     private static int compressDuplicates(int[] data, int n) {
188         // Compress to remove duplicates
189         int last = 0;
190         int top = data[0];
191         for (int i = 1; i < n; i++) {
192             final int v = data[i];
193             if (v == top) {
194                 continue;
195             }
196             top = v;
197             data[++last] = v;
198         }
199         return last + 1;
200     }
201 
202     /**
203      * Compute {@code ceil(log2(x))}. This is valid for all strictly positive {@code x}.
204      *
205      * <p>Returns -1 for {@code x = 0} in place of -infinity.
206      *
207      * @param x Value.
208      * @return {@code ceil(log2(x))}
209      */
210     private static int ceilLog2(int x) {
211         return 32 - Integer.numberOfLeadingZeros(x - 1);
212     }
213 
214     /**
215      * Returns an interval that covers the specified indices {@code k}.
216      * The indices must be sorted.
217      *
218      * @param k Indices.
219      * @param n Count of indices (must be strictly positive).
220      * @throws IndexOutOfBoundsException if any index {@code k} is not within the
221      * sub-range {@code [left, right]}
222      * @return the interval
223      */
224     private static UpdatingInterval newUpdatingInterval(int[] k, int n) {
225         return new KeyUpdatingInterval(k, n);
226     }
227 
228     /**
229      * Count the number of indices. Returns a negative value if the indices are sorted.
230      *
231      * @param keys Keys.
232      * @param n Count of indices.
233      * @return the count of (sorted) indices
234      */
235     static int countIndices(UpdatingInterval keys, int n) {
236         if (keys instanceof KeyUpdatingInterval) {
237             return -((KeyUpdatingInterval) keys).size();
238         }
239         return n;
240     }
241 
242     /**
243      * Checks if the sub-range from fromIndex (inclusive) to toIndex (exclusive) is
244      * within the bounds of range from 0 (inclusive) to length (exclusive).
245      *
246      * <p>This function provides the functionality of
247      * {@code java.utils.Objects.checkFromToIndex} introduced in JDK 9. The <a
248      * href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Objects.html#checkFromToIndex(int,int,int)">Objects</a>
249      * javadoc has been reproduced for reference. The return value has been changed
250      * to void.
251      *
252      * <p>The sub-range is defined to be out of bounds if any of the following
253      * inequalities is true:
254      * <ul>
255      * <li>{@code fromIndex < 0}</li>
256      * <li>{@code fromIndex > toIndex}</li>
257      * <li>{@code toIndex > length}</li>
258      * <li>{@code length < 0}, which is implied from the former inequalities</li>
259      * </ul>
260      *
261      * @param fromIndex Lower-bound (inclusive) of the sub-range.
262      * @param toIndex Upper-bound (exclusive) of the sub-range.
263      * @param length Upper-bound (exclusive) of the range.
264      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
265      */
266     static void checkFromToIndex(int fromIndex, int toIndex, int length) {
267         // Checks as documented above
268         if (fromIndex < 0 || fromIndex > toIndex || toIndex > length) {
269             throw new IndexOutOfBoundsException(
270                 msgRangeOutOfBounds(fromIndex, toIndex, length));
271         }
272     }
273 
274     /**
275      * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
276      *
277      * @param fromIndex Lower-bound (inclusive) of the sub-range.
278      * @param toIndex Upper-bound (exclusive) of the sub-range.
279      * @param k Indices.
280      * @throws IndexOutOfBoundsException if any index is out of bounds
281      */
282     static void checkIndices(int fromIndex, int toIndex, int[] k) {
283         for (final int i : k) {
284             checkIndex(fromIndex, toIndex, i);
285         }
286     }
287 
288     /**
289      * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
290      *
291      * @param fromIndex Lower-bound (inclusive) of the sub-range.
292      * @param toIndex Upper-bound (exclusive) of the sub-range.
293      * @param index Index.
294      * @throws IndexOutOfBoundsException if the index is out of bounds
295      */
296     static void checkIndex(int fromIndex, int toIndex, int index) {
297         if (index < fromIndex || index >= toIndex) {
298             throw new IndexOutOfBoundsException(
299                 msgIndexOutOfBounds(fromIndex, toIndex, index));
300         }
301     }
302 
303     // Message formatting moved to separate methods to assist inlining of the validation methods.
304 
305     /**
306      * Format a message when range [from, to) is not entirely within the length.
307      *
308      * @param fromIndex Lower-bound (inclusive) of the sub-range.
309      * @param toIndex Upper-bound (exclusive) of the sub-range.
310      * @param length Upper-bound (exclusive) of the range.
311      * @return the message
312      */
313     private static String msgRangeOutOfBounds(int fromIndex, int toIndex, int length) {
314         return String.format("Range [%d, %d) out of bounds for length %d", fromIndex, toIndex, length);
315     }
316 
317     /**
318      * Format a message when index is not within range [from, to).
319      *
320      * @param fromIndex Lower-bound (inclusive) of the sub-range.
321      * @param toIndex Upper-bound (exclusive) of the sub-range.
322      * @param index Index.
323      * @return the message
324      */
325     private static String msgIndexOutOfBounds(int fromIndex, int toIndex, int index) {
326         return String.format("Index %d out of bounds for range [%d, %d)", index, fromIndex, toIndex);
327     }
328 }