IndexSupport.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.numbers.arrays;

/**
 * Support for creating {@link UpdatingInterval} implementations and validating indices.
 *
 * @since 1.2
 */
final class IndexSupport {
    /** The upper threshold to use a modified insertion sort to find unique indices. */
    private static final int INSERTION_SORT_SIZE = 20;

    /** No instances. */
    private IndexSupport() {}

    /**
     * Returns an interval that covers the specified indices {@code k}.
     *
     * @param left Lower bound of data (inclusive).
     * @param right Upper bound of data (inclusive).
     * @param k Indices.
     * @param n Count of indices (must be strictly positive).
     * @throws IndexOutOfBoundsException if any index {@code k} is not within the
     * sub-range {@code [left, right]}
     * @return the interval
     */
    static UpdatingInterval createUpdatingInterval(int left, int right, int[] k, int n) {
        // Note: A typical use case is to have a few indices. Thus the heuristics
        // in this method should be very fast when n is small.
        // We have a choice between a KeyUpdatingInterval which requires
        // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
        // The purpose of the heuristics is to avoid a very bad choice of data structure,
        // rather than choosing the best data structure in all situations. As long as the
        // choice is reasonable the speed will not impact a partition algorithm.

        // Simple cases
        if (n == 2) {
            if (k[0] == k[1]) {
                return newUpdatingInterval(k, 1);
            }
            if (k[1] < k[0]) {
                final int v = k[0];
                k[0] = k[1];
                k[1] = v;
            }
            return newUpdatingInterval(k, 2);
        }

        // Strategy: Must be fast on already ascending data.
        // Note: The recommended way to generate a lot of partition indices is to
        // generate in sequence.

        // n <= small:
        //   Modified insertion sort (naturally finds ascending data)
        // n > small:
        //   Look for ascending sequence and compact
        // else:
        //   Remove duplicates using an order(1) data structure and sort

        if (n <= INSERTION_SORT_SIZE) {
            final int unique = Sorting.insertionSortIndices(k, n);
            return newUpdatingInterval(k, unique);
        }

        if (isAscending(k, n)) {
            // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
            // BitIndexUpdatingInterval depending on data length but not significantly
            // slower and the difference is lost in the time taken for partitioning.
            // So always use the keys.
            final int unique = compressDuplicates(k, n);
            return newUpdatingInterval(k, unique);
        }

        // At least 20 indices that are partially unordered.

        // Find min/max to understand the range.
        int min = k[n - 1];
        int max = min;
        for (int i = n - 1; --i >= 0;) {
            min = Math.min(min, k[i]);
            max = Math.max(max, k[i]);
        }

        // Here we use a simple test based on the number of comparisons required
        // to perform the expected next/previous look-ups after a split.
        // It is expected that we can cut n keys a maximum of n-1 times.
        // Each cut requires a scan next/previous to divide the interval into two intervals:
        //
        //            cut
        //             |
        //        k1--------k2---------k3---- ... ---------kn    initial interval
        //         <--| find previous
        //    find next |-->
        //        k1        k2---------k3---- ... ---------kn    divided intervals
        //
        // An BitSet will scan from the cut location and find a match in time proportional to
        // the index density. Average density is (size / n) and the scanning covers 64
        // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)

        // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
        // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)

        // Transition when n * n ~ size / 32
        // Benchmarking shows this is a reasonable approximation when size < 2^20.
        // The speed of the bit keys is approximately independent of n and proportional to size.
        // Large size observes degrading performance of the bit keys vs sorted keys.
        // We introduce a penalty for each 4x increase over size = 2^20.
        // n * n = size/32 * 2^log4(size / 2^20)
        // The transition point still favours the bit keys when sorted keys would be faster.
        // However the difference is held within 4x and the BitSet type structure is still fast
        // enough to be negligible against the speed of partitioning.

        // Transition point: n = sqrt(size/32)
        // size n
        // 2^10 5.66
        // 2^15 32.0
        // 2^20 181.0

        // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
        // size n
        // 2^22 512.0
        // 2^24 1448.2
        // 2^28 11585
        // 2^31 55108

        final int size = max - min + 1;

        // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
        // At 2^31 the shift reduces to 0.
        int shift = 5;
        if (size > (1 << 20)) {
            // log4(size/2^20) == (log2(size) - 20) / 2
            shift -= (ceilLog2(size) - 20) >>> 1;
        }

        if ((long) n * n > (size >> shift)) {
            final BitIndexUpdatingInterval interval = new BitIndexUpdatingInterval(min, max);
            for (int i = n; --i >= 0;) {
                interval.set(k[i]);
            }
            return interval;
        }

        // Sort with a hash set to filter indices
        final int unique = Sorting.sortIndices(k, n);
        return new KeyUpdatingInterval(k, unique);
    }

    /**
     * Test the data is in ascending order: {@code data[i] <= data[i+1]} for all {@code i}.
     * Data is assumed to be at least length 1.
     *
     * @param data Data.
     * @param n Length of data.
     * @return true if ascending
     */
    private static boolean isAscending(int[] data, int n) {
        for (int i = 0; ++i < n;) {
            if (data[i] < data[i - 1]) {
                // descending
                return false;
            }
        }
        return true;
    }

    /**
     * Compress duplicates in the ascending data.
     *
     * <p>Warning: Requires {@code n > 0}.
     *
     * @param data Indices.
     * @param n Number of indices.
     * @return the number of unique indices
     */
    private static int compressDuplicates(int[] data, int n) {
        // Compress to remove duplicates
        int last = 0;
        int top = data[0];
        for (int i = 0; ++i < n;) {
            final int v = data[i];
            if (v == top) {
                continue;
            }
            top = v;
            data[++last] = v;
        }
        return last + 1;
    }

    /**
     * Compute {@code ceil(log2(x))}. This is valid for all strictly positive {@code x}.
     *
     * <p>Returns -1 for {@code x = 0} in place of -infinity.
     *
     * @param x Value.
     * @return {@code ceil(log2(x))}
     */
    private static int ceilLog2(int x) {
        return 32 - Integer.numberOfLeadingZeros(x - 1);
    }

    /**
     * Returns an interval that covers the specified indices {@code k}.
     * The indices must be sorted.
     *
     * @param k Indices.
     * @param n Count of indices (must be strictly positive).
     * @throws IndexOutOfBoundsException if any index {@code k} is not within the
     * sub-range {@code [left, right]}
     * @return the interval
     */
    private static UpdatingInterval newUpdatingInterval(int[] k, int n) {
        return new KeyUpdatingInterval(k, n);
    }

    /**
     * Count the number of indices. Returns a negative value if the indices are sorted.
     *
     * @param keys Keys.
     * @param n Count of indices.
     * @return the count of (sorted) indices
     */
    static int countIndices(UpdatingInterval keys, int n) {
        if (keys instanceof KeyUpdatingInterval) {
            return -((KeyUpdatingInterval) keys).size();
        }
        return n;
    }

    /**
     * Checks if the sub-range from fromIndex (inclusive) to toIndex (exclusive) is
     * within the bounds of range from 0 (inclusive) to length (exclusive).
     *
     * <p>This function provides the functionality of
     * {@code java.utils.Objects.checkFromToIndex} introduced in JDK 9. The <a
     * href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Objects.html#checkFromToIndex(int,int,int)">Objects</a>
     * javadoc has been reproduced for reference. The return value has been changed
     * to void.
     *
     * <p>The sub-range is defined to be out of bounds if any of the following
     * inequalities is true:
     * <ul>
     * <li>{@code fromIndex < 0}
     * <li>{@code fromIndex > toIndex}
     * <li>{@code toIndex > length}
     * <li>{@code length < 0}, which is implied from the former inequalities
     * </ul>
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param length Upper-bound (exclusive) of the range.
     * @throws IndexOutOfBoundsException if the sub-range is out of bounds
     */
    static void checkFromToIndex(int fromIndex, int toIndex, int length) {
        // Checks as documented above
        if (fromIndex < 0 || fromIndex > toIndex || toIndex > length) {
            throw new IndexOutOfBoundsException(
                msgRangeOutOfBounds(fromIndex, toIndex, length));
        }
    }

    /**
     * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param k Indices.
     * @throws IndexOutOfBoundsException if any index is out of bounds
     */
    static void checkIndices(int fromIndex, int toIndex, int[] k) {
        for (final int i : k) {
            checkIndex(fromIndex, toIndex, i);
        }
    }

    /**
     * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param index Index.
     * @throws IndexOutOfBoundsException if the index is out of bounds
     */
    static void checkIndex(int fromIndex, int toIndex, int index) {
        if (index < fromIndex || index >= toIndex) {
            throw new IndexOutOfBoundsException(
                msgIndexOutOfBounds(fromIndex, toIndex, index));
        }
    }

    // Message formatting moved to separate methods to assist inlining of the validation methods.

    /**
     * Format a message when range [from, to) is not entirely within the length.
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param length Upper-bound (exclusive) of the range.
     * @return the message
     */
    private static String msgRangeOutOfBounds(int fromIndex, int toIndex, int length) {
        return String.format("Range [%d, %d) out of bounds for length %d", fromIndex, toIndex, length);
    }

    /**
     * Format a message when index is not within range [from, to).
     *
     * @param fromIndex Lower-bound (inclusive) of the sub-range.
     * @param toIndex Upper-bound (exclusive) of the sub-range.
     * @param index Index.
     * @return the message
     */
    private static String msgIndexOutOfBounds(int fromIndex, int toIndex, int index) {
        return String.format("Index %d out of bounds for range [%d, %d)", index, fromIndex, toIndex);
    }
}