// IndexSupport.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.numbers.arrays;
/**
 * Support for creating {@link UpdatingInterval} implementations and validating indices.
 *
 * @since 1.2
 */
final class IndexSupport {
    /** The upper threshold to use a modified insertion sort to find unique indices. */
    private static final int INSERTION_SORT_SIZE = 20;

    /** No instances. */
    private IndexSupport() {}

    /**
     * Returns an interval that covers the specified indices {@code k}.
     *
     * <p>Warning: This method does not validate the indices. The bounds {@code left}
     * and {@code right} are accepted for API compatibility but are not read by this
     * implementation; callers that require range validation must perform it beforehand
     * (e.g. using {@link #checkIndices(int, int, int[])}).
     *
     * <p>The first {@code n} entries of {@code k} may be reordered and de-duplicated
     * in place.
     *
     * @param left Lower bound of data (inclusive). Currently unused.
     * @param right Upper bound of data (inclusive). Currently unused.
     * @param k Indices.
     * @param n Count of indices (must be strictly positive).
     * @return the interval
     */
    static UpdatingInterval createUpdatingInterval(int left, int right, int[] k, int n) {
        // Note: A typical use case is to have a few indices. Thus the heuristics
        // in this method should be very fast when n is small.
        // We have a choice between a KeyUpdatingInterval which requires
        // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
        // The purpose of the heuristics is to avoid a very bad choice of data structure,
        // rather than choosing the best data structure in all situations. As long as the
        // choice is reasonable the speed will not impact a partition algorithm.

        // Simple cases
        if (n == 2) {
            if (k[0] == k[1]) {
                // Duplicate pair: a single unique key
                return newUpdatingInterval(k, 1);
            }
            if (k[1] < k[0]) {
                // Swap into ascending order
                final int v = k[0];
                k[0] = k[1];
                k[1] = v;
            }
            return newUpdatingInterval(k, 2);
        }

        // Strategy: Must be fast on already ascending data.
        // Note: The recommended way to generate a lot of partition indices is to
        // generate in sequence.

        // n <= small:
        //   Modified insertion sort (naturally finds ascending data)
        // n > small:
        //   Look for ascending sequence and compact
        // else:
        //   Remove duplicates using an order(1) data structure and sort

        if (n <= INSERTION_SORT_SIZE) {
            final int unique = Sorting.insertionSortIndices(k, n);
            return newUpdatingInterval(k, unique);
        }

        if (isAscending(k, n)) {
            // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
            // BitIndexUpdatingInterval depending on data length but not significantly
            // slower and the difference is lost in the time taken for partitioning.
            // So always use the keys.
            final int unique = compressDuplicates(k, n);
            return newUpdatingInterval(k, unique);
        }

        // More than INSERTION_SORT_SIZE (20) indices that are partially unordered.

        // Find min/max to understand the range.
        int min = k[n - 1];
        int max = min;
        for (int i = n - 1; --i >= 0;) {
            min = Math.min(min, k[i]);
            max = Math.max(max, k[i]);
        }

        // Here we use a simple test based on the number of comparisons required
        // to perform the expected next/previous look-ups after a split.
        // It is expected that we can cut n keys a maximum of n-1 times.
        // Each cut requires a scan next/previous to divide the interval into two intervals:
        //
        //            cut
        //             |
        //        k1--------k2---------k3---- ... ---------kn    initial interval
        //         <--| find previous
        //    find next |-->
        //        k1        k2---------k3---- ... ---------kn    divided intervals
        //
        // A BitSet will scan from the cut location and find a match in time proportional to
        // the index density. Average density is (size / n) and the scanning covers 64
        // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)

        // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
        // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)

        // Transition when n * n ~ size / 32
        // Benchmarking shows this is a reasonable approximation when size < 2^20.
        // The speed of the bit keys is approximately independent of n and proportional to size.
        // Large size observes degrading performance of the bit keys vs sorted keys.
        // We introduce a penalty for each 4x increase over size = 2^20.
        // n * n = size/32 * 2^log4(size / 2^20)
        // The transition point still favours the bit keys when sorted keys would be faster.
        // However the difference is held within 4x and the BitSet type structure is still fast
        // enough to be negligible against the speed of partitioning.

        // Transition point: n = sqrt(size/32)
        // size n
        // 2^10 5.66
        // 2^15 32.0
        // 2^20 181.0

        // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
        // size n
        // 2^22 512.0
        // 2^24 1448.2
        // 2^28 11585
        // 2^31 55108

        final int size = max - min + 1;

        // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
        // At 2^31 the shift reduces to 0.
        int shift = 5;
        if (size > (1 << 20)) {
            // log4(size/2^20) == (log2(size) - 20) / 2
            shift -= (ceilLog2(size) - 20) >>> 1;
        }

        // Use long arithmetic for n * n to avoid int overflow for large n
        if ((long) n * n > (size >> shift)) {
            final BitIndexUpdatingInterval interval = new BitIndexUpdatingInterval(min, max);
            for (int i = n; --i >= 0;) {
                interval.set(k[i]);
            }
            return interval;
        }

        // Sort with a hash set to filter indices
        final int unique = Sorting.sortIndices(k, n);
        return newUpdatingInterval(k, unique);
    }

    /**
     * Test the data is in ascending order: {@code data[i] <= data[i+1]} for all {@code i}.
     * Data is assumed to be at least length 1.
     *
     * @param data Data.
     * @param n Length of data.
     * @return true if ascending
     */
    private static boolean isAscending(int[] data, int n) {
        // Pre-increment in the condition: the loop body observes i = 1, ..., n-1
        for (int i = 0; ++i < n;) {
            if (data[i] < data[i - 1]) {
                // descending
                return false;
            }
        }
        return true;
    }

    /**
     * Compress duplicates in the ascending data. The unique values are packed into the
     * start of the array in their original order.
     *
     * <p>Warning: Requires {@code n > 0}.
     *
     * @param data Indices.
     * @param n Number of indices.
     * @return the number of unique indices
     */
    private static int compressDuplicates(int[] data, int n) {
        // Compress to remove duplicates.
        // 'last' is the position of the most recently stored unique value;
        // 'top' is that value.
        int last = 0;
        int top = data[0];
        for (int i = 0; ++i < n;) {
            final int v = data[i];
            if (v == top) {
                continue;
            }
            top = v;
            data[++last] = v;
        }
        return last + 1;
    }

    /**
     * Compute {@code ceil(log2(x))}. This is valid for all strictly positive {@code x}.
     *
     * <p>Warning: Non-positive input is not supported and the result is not a logarithm;
     * for example {@code x = 0} returns 32.
     *
     * @param x Value (must be strictly positive).
     * @return {@code ceil(log2(x))}
     */
    private static int ceilLog2(int x) {
        // (x - 1) has its highest set bit at position ceil(log2(x)) - 1 for x > 1,
        // and is zero for x = 1 (32 leading zeros => result 0).
        return 32 - Integer.numberOfLeadingZeros(x - 1);
    }

    /**
     * Returns an interval that covers the specified indices {@code k}.
     * The indices must be sorted. No validation of the indices is performed by this method.
     *
     * @param k Indices.
     * @param n Count of indices (must be strictly positive).
     * @return the interval
     */
    private static UpdatingInterval newUpdatingInterval(int[] k, int n) {
        return new KeyUpdatingInterval(k, n);
    }

    /**
     * Count the number of indices. Returns a negative value if the indices are sorted.
     *
     * <p>If {@code keys} is a {@link KeyUpdatingInterval} the result is the negation of
     * its {@code size()}; otherwise the supplied count {@code n} is returned unchanged.
     *
     * @param keys Keys.
     * @param n Count of indices.
     * @return the count of (sorted) indices
     */
    static int countIndices(UpdatingInterval keys, int n) {
        if (keys instanceof KeyUpdatingInterval) {
            return -((KeyUpdatingInterval) keys).size();
        }
        return n;
    }

    /**
     * Checks if the sub-range from fromIndex (inclusive) to toIndex (exclusive) is
     * within the bounds of range from 0 (inclusive) to length (exclusive).
     *
     * <p>This function provides the functionality of
     * {@code java.util.Objects.checkFromToIndex} introduced in JDK 9. The <a
     * href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Objects.html#checkFromToIndex(int,int,int)">Objects</a>
     * javadoc has been reproduced for reference. The return value has been changed
     * to void.
     *
     * <p>The sub-range is defined to be out of bounds if any of the following
     * inequalities is true:
     * <ul>
     * <li>{@code fromIndex < 0}
     * <li>{@code fromIndex > toIndex}
     * <li>{@code toIndex > length}
     * <li>{@code length < 0}, which is implied from the former inequalities
     * </ul>
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param length Upper-bound (exclusive) of the range.
     * @throws IndexOutOfBoundsException if the sub-range is out of bounds
     */
    static void checkFromToIndex(int fromIndex, int toIndex, int length) {
        // Checks as documented above
        if (fromIndex < 0 || fromIndex > toIndex || toIndex > length) {
            throw new IndexOutOfBoundsException(
                msgRangeOutOfBounds(fromIndex, toIndex, length));
        }
    }

    /**
     * Checks if every index in {@code k} is within the half-open interval
     * {@code [fromIndex, toIndex)}.
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param k Indices.
     * @throws IndexOutOfBoundsException if any index is out of bounds
     */
    static void checkIndices(int fromIndex, int toIndex, int[] k) {
        for (final int i : k) {
            checkIndex(fromIndex, toIndex, i);
        }
    }

    /**
     * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param index Index.
     * @throws IndexOutOfBoundsException if the index is out of bounds
     */
    static void checkIndex(int fromIndex, int toIndex, int index) {
        if (index < fromIndex || index >= toIndex) {
            throw new IndexOutOfBoundsException(
                msgIndexOutOfBounds(fromIndex, toIndex, index));
        }
    }

    // Message formatting moved to separate methods to assist inlining of the validation methods.

    /**
     * Format a message when range [from, to) is not entirely within the length.
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param length Upper-bound (exclusive) of the range.
     * @return the message
     */
    private static String msgRangeOutOfBounds(int fromIndex, int toIndex, int length) {
        return String.format("Range [%d, %d) out of bounds for length %d", fromIndex, toIndex, length);
    }

    /**
     * Format a message when index is not within range [from, to).
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param index Index.
     * @return the message
     */
    private static String msgIndexOutOfBounds(int fromIndex, int toIndex, int index) {
        return String.format("Index %d out of bounds for range [%d, %d)", index, fromIndex, toIndex);
    }
}