IndexSupport.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.numbers.arrays;

  18. /**
  19.  * Support for creating {@link UpdatingInterval} implementations and validating indices.
  20.  *
  21.  * @since 1.2
  22.  */
  23. final class IndexSupport {
  24.     /** The upper threshold to use a modified insertion sort to find unique indices. */
  25.     private static final int INSERTION_SORT_SIZE = 20;

  26.     /** No instances. */
  27.     private IndexSupport() {}

  28.     /**
  29.      * Returns an interval that covers the specified indices {@code k}.
  30.      *
  31.      * @param left Lower bound of data (inclusive).
  32.      * @param right Upper bound of data (inclusive).
  33.      * @param k Indices.
  34.      * @param n Count of indices (must be strictly positive).
  35.      * @throws IndexOutOfBoundsException if any index {@code k} is not within the
  36.      * sub-range {@code [left, right]}
  37.      * @return the interval
  38.      */
  39.     static UpdatingInterval createUpdatingInterval(int left, int right, int[] k, int n) {
  40.         // Note: A typical use case is to have a few indices. Thus the heuristics
  41.         // in this method should be very fast when n is small.
  42.         // We have a choice between a KeyUpdatingInterval which requires
  43.         // sorted keys or a BitIndexUpdatingInterval which handles keys in any order.
  44.         // The purpose of the heuristics is to avoid a very bad choice of data structure,
  45.         // rather than choosing the best data structure in all situations. As long as the
  46.         // choice is reasonable the speed will not impact a partition algorithm.

  47.         // Simple cases
  48.         if (n == 2) {
  49.             if (k[0] == k[1]) {
  50.                 return newUpdatingInterval(k, 1);
  51.             }
  52.             if (k[1] < k[0]) {
  53.                 final int v = k[0];
  54.                 k[0] = k[1];
  55.                 k[1] = v;
  56.             }
  57.             return newUpdatingInterval(k, 2);
  58.         }

  59.         // Strategy: Must be fast on already ascending data.
  60.         // Note: The recommended way to generate a lot of partition indices is to
  61.         // generate in sequence.

  62.         // n <= small:
  63.         //   Modified insertion sort (naturally finds ascending data)
  64.         // n > small:
  65.         //   Look for ascending sequence and compact
  66.         // else:
  67.         //   Remove duplicates using an order(1) data structure and sort

  68.         if (n <= INSERTION_SORT_SIZE) {
  69.             final int unique = Sorting.insertionSortIndices(k, n);
  70.             return newUpdatingInterval(k, unique);
  71.         }

  72.         if (isAscending(k, n)) {
  73.             // For sorted keys the KeyUpdatingInterval is fast. It may be slower than the
  74.             // BitIndexUpdatingInterval depending on data length but not significantly
  75.             // slower and the difference is lost in the time taken for partitioning.
  76.             // So always use the keys.
  77.             final int unique = compressDuplicates(k, n);
  78.             return newUpdatingInterval(k, unique);
  79.         }

  80.         // At least 20 indices that are partially unordered.

  81.         // Find min/max to understand the range.
  82.         int min = k[n - 1];
  83.         int max = min;
  84.         for (int i = n - 1; --i >= 0;) {
  85.             min = Math.min(min, k[i]);
  86.             max = Math.max(max, k[i]);
  87.         }

  88.         // Here we use a simple test based on the number of comparisons required
  89.         // to perform the expected next/previous look-ups after a split.
  90.         // It is expected that we can cut n keys a maximum of n-1 times.
  91.         // Each cut requires a scan next/previous to divide the interval into two intervals:
  92.         //
  93.         //            cut
  94.         //             |
  95.         //        k1--------k2---------k3---- ... ---------kn    initial interval
  96.         //         <--| find previous
  97.         //    find next |-->
  98.         //        k1        k2---------k3---- ... ---------kn    divided intervals
  99.         //
  100.         // An BitSet will scan from the cut location and find a match in time proportional to
  101.         // the index density. Average density is (size / n) and the scanning covers 64
  102.         // indices together: Order(2 * n * (size / n) / 64) = Order(size / 32)

  103.         // Sorted keys: Sort time Order(n log(n)) : Splitting time Order(log(n)) (binary search approx)
  104.         // Bit keys   : Sort time Order(1)        : Splitting time Order(size / 32)

  105.         // Transition when n * n ~ size / 32
  106.         // Benchmarking shows this is a reasonable approximation when size < 2^20.
  107.         // The speed of the bit keys is approximately independent of n and proportional to size.
  108.         // Large size observes degrading performance of the bit keys vs sorted keys.
  109.         // We introduce a penalty for each 4x increase over size = 2^20.
  110.         // n * n = size/32 * 2^log4(size / 2^20)
  111.         // The transition point still favours the bit keys when sorted keys would be faster.
  112.         // However the difference is held within 4x and the BitSet type structure is still fast
  113.         // enough to be negligible against the speed of partitioning.

  114.         // Transition point: n = sqrt(size/32)
  115.         // size n
  116.         // 2^10 5.66
  117.         // 2^15 32.0
  118.         // 2^20 181.0

  119.         // Transition point: n = sqrt(size/32 * 2^(log4(size/2^20))))
  120.         // size n
  121.         // 2^22 512.0
  122.         // 2^24 1448.2
  123.         // 2^28 11585
  124.         // 2^31 55108

  125.         final int size = max - min + 1;

  126.         // Divide by 32 is a shift of 5. This is reduced for each 4-fold size above 2^20.
  127.         // At 2^31 the shift reduces to 0.
  128.         int shift = 5;
  129.         if (size > (1 << 20)) {
  130.             // log4(size/2^20) == (log2(size) - 20) / 2
  131.             shift -= (ceilLog2(size) - 20) >>> 1;
  132.         }

  133.         if ((long) n * n > (size >> shift)) {
  134.             final BitIndexUpdatingInterval interval = new BitIndexUpdatingInterval(min, max);
  135.             for (int i = n; --i >= 0;) {
  136.                 interval.set(k[i]);
  137.             }
  138.             return interval;
  139.         }

  140.         // Sort with a hash set to filter indices
  141.         final int unique = Sorting.sortIndices(k, n);
  142.         return new KeyUpdatingInterval(k, unique);
  143.     }

  144.     /**
  145.      * Test the data is in ascending order: {@code data[i] <= data[i+1]} for all {@code i}.
  146.      * Data is assumed to be at least length 1.
  147.      *
  148.      * @param data Data.
  149.      * @param n Length of data.
  150.      * @return true if ascending
  151.      */
  152.     private static boolean isAscending(int[] data, int n) {
  153.         for (int i = 0; ++i < n;) {
  154.             if (data[i] < data[i - 1]) {
  155.                 // descending
  156.                 return false;
  157.             }
  158.         }
  159.         return true;
  160.     }

  161.     /**
  162.      * Compress duplicates in the ascending data.
  163.      *
  164.      * <p>Warning: Requires {@code n > 0}.
  165.      *
  166.      * @param data Indices.
  167.      * @param n Number of indices.
  168.      * @return the number of unique indices
  169.      */
  170.     private static int compressDuplicates(int[] data, int n) {
  171.         // Compress to remove duplicates
  172.         int last = 0;
  173.         int top = data[0];
  174.         for (int i = 0; ++i < n;) {
  175.             final int v = data[i];
  176.             if (v == top) {
  177.                 continue;
  178.             }
  179.             top = v;
  180.             data[++last] = v;
  181.         }
  182.         return last + 1;
  183.     }

  184.     /**
  185.      * Compute {@code ceil(log2(x))}. This is valid for all strictly positive {@code x}.
  186.      *
  187.      * <p>Returns -1 for {@code x = 0} in place of -infinity.
  188.      *
  189.      * @param x Value.
  190.      * @return {@code ceil(log2(x))}
  191.      */
  192.     private static int ceilLog2(int x) {
  193.         return 32 - Integer.numberOfLeadingZeros(x - 1);
  194.     }

  195.     /**
  196.      * Returns an interval that covers the specified indices {@code k}.
  197.      * The indices must be sorted.
  198.      *
  199.      * @param k Indices.
  200.      * @param n Count of indices (must be strictly positive).
  201.      * @throws IndexOutOfBoundsException if any index {@code k} is not within the
  202.      * sub-range {@code [left, right]}
  203.      * @return the interval
  204.      */
  205.     private static UpdatingInterval newUpdatingInterval(int[] k, int n) {
  206.         return new KeyUpdatingInterval(k, n);
  207.     }

  208.     /**
  209.      * Count the number of indices. Returns a negative value if the indices are sorted.
  210.      *
  211.      * @param keys Keys.
  212.      * @param n Count of indices.
  213.      * @return the count of (sorted) indices
  214.      */
  215.     static int countIndices(UpdatingInterval keys, int n) {
  216.         if (keys instanceof KeyUpdatingInterval) {
  217.             return -((KeyUpdatingInterval) keys).size();
  218.         }
  219.         return n;
  220.     }

  221.     /**
  222.      * Checks if the sub-range from fromIndex (inclusive) to toIndex (exclusive) is
  223.      * within the bounds of range from 0 (inclusive) to length (exclusive).
  224.      *
  225.      * <p>This function provides the functionality of
  226.      * {@code java.utils.Objects.checkFromToIndex} introduced in JDK 9. The <a
  227.      * href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Objects.html#checkFromToIndex(int,int,int)">Objects</a>
  228.      * javadoc has been reproduced for reference. The return value has been changed
  229.      * to void.
  230.      *
  231.      * <p>The sub-range is defined to be out of bounds if any of the following
  232.      * inequalities is true:
  233.      * <ul>
  234.      * <li>{@code fromIndex < 0}
  235.      * <li>{@code fromIndex > toIndex}
  236.      * <li>{@code toIndex > length}
  237.      * <li>{@code length < 0}, which is implied from the former inequalities
  238.      * </ul>
  239.      *
  240.      * @param fromIndex Lower-bound (inclusive) of the sub-range.
  241.      * @param toIndex Upper-bound (exclusive) of the sub-range.
  242.      * @param length Upper-bound (exclusive) of the range.
  243.      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
  244.      */
  245.     static void checkFromToIndex(int fromIndex, int toIndex, int length) {
  246.         // Checks as documented above
  247.         if (fromIndex < 0 || fromIndex > toIndex || toIndex > length) {
  248.             throw new IndexOutOfBoundsException(
  249.                 msgRangeOutOfBounds(fromIndex, toIndex, length));
  250.         }
  251.     }

  252.     /**
  253.      * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
  254.      *
  255.      * @param fromIndex Lower-bound (inclusive) of the sub-range.
  256.      * @param toIndex Upper-bound (exclusive) of the sub-range.
  257.      * @param k Indices.
  258.      * @throws IndexOutOfBoundsException if any index is out of bounds
  259.      */
  260.     static void checkIndices(int fromIndex, int toIndex, int[] k) {
  261.         for (final int i : k) {
  262.             checkIndex(fromIndex, toIndex, i);
  263.         }
  264.     }

  265.     /**
  266.      * Checks if the {@code index} is within the half-open interval {@code [fromIndex, toIndex)}.
  267.      *
  268.      * @param fromIndex Lower-bound (inclusive) of the sub-range.
  269.      * @param toIndex Upper-bound (exclusive) of the sub-range.
  270.      * @param index Index.
  271.      * @throws IndexOutOfBoundsException if the index is out of bounds
  272.      */
  273.     static void checkIndex(int fromIndex, int toIndex, int index) {
  274.         if (index < fromIndex || index >= toIndex) {
  275.             throw new IndexOutOfBoundsException(
  276.                 msgIndexOutOfBounds(fromIndex, toIndex, index));
  277.         }
  278.     }

  279.     // Message formatting moved to separate methods to assist inlining of the validation methods.

  280.     /**
  281.      * Format a message when range [from, to) is not entirely within the length.
  282.      *
  283.      * @param fromIndex Lower-bound (inclusive) of the sub-range.
  284.      * @param toIndex Upper-bound (exclusive) of the sub-range.
  285.      * @param length Upper-bound (exclusive) of the range.
  286.      * @return the message
  287.      */
  288.     private static String msgRangeOutOfBounds(int fromIndex, int toIndex, int length) {
  289.         return String.format("Range [%d, %d) out of bounds for length %d", fromIndex, toIndex, length);
  290.     }

  291.     /**
  292.      * Format a message when index is not within range [from, to).
  293.      *
  294.      * @param fromIndex Lower-bound (inclusive) of the sub-range.
  295.      * @param toIndex Upper-bound (exclusive) of the sub-range.
  296.      * @param index Index.
  297.      * @return the message
  298.      */
  299.     private static String msgIndexOutOfBounds(int fromIndex, int toIndex, int index) {
  300.         return String.format("Index %d out of bounds for range [%d, %d)", index, fromIndex, toIndex);
  301.     }
  302. }