View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.numbers.examples.jmh.arrays;
19  
20  /**
21   * An index set backed by a open-addressed hash table using linear hashing. Table size is a power
22   * of 2 and has a maximum capacity of 2^29 with a fixed load factor of 0.5. If the functional
23   * capacity is exceeded then the set raises an {@link IllegalStateException}.
24   *
25   * <p>Values are stored using bit inversion. Any positive index will have a negative
26   * representation when stored. An empty slot is indicated by a zero.
27   *
28   * <p>This class has a minimal API. It can be used to ensure a collection of indices of
29   * a known size are unique:
30   *
31   * <pre>{@code
32   * int[] keys = ...
33   * HashIndexSet set = new HashIndexSet(keys.length);
34   * for (int k : keys) {
35   *   if (set.add(k)) {
36   *     // first occurrence of k in keys
37   *   }
38   * }
39   * }</pre>
40   *
41   * @see <a href="https://en.wikipedia.org/wiki/Open_addressing">Open addressing (Wikipedia)</a>
42   * @since 1.2
43   */
44  final class HashIndexSet {
45      /** Message for an invalid index. */
46      private static final String INVALID_INDEX = "Invalid index: ";
47      /** The maximum capacity of the set. */
48      private static final int MAX_CAPACITY = 1 << 29;
49      /** The minimum size of the backing array. */
50      private static final int MIN_SIZE = 16;
51      /**
52       * Unsigned 32-bit integer numerator of the golden ratio (0.618) with an assumed
53       * denominator of 2^32.
54       *
55       * <pre>
56       * 2654435769 = round(2^32 * (sqrt(5) - 1) / 2)
57       * Long.toHexString((long)(0x1p32 * (Math.sqrt(5.0) - 1) / 2))
58       * </pre>
59       */
60      private static final int PHI = 0x9e3779b9;
61  
62      /** The set. */
63      private final int[] set;
64      /** The size. */
65      private int size;
66  
67      /**
68       * Create an instance with size to store up to the specified {@code capacity}.
69       *
70       * <p>The functional capacity (number of indices that can be stored) is the next power
71       * of 2 above {@code capacity}; or a minimum size if the requested {@code capacity} is
72       * small.
73       *
74       * @param capacity Capacity (assumed to be positive).
75       */
76      HashIndexSet(int capacity) {
77          if (capacity > MAX_CAPACITY) {
78              throw new IllegalArgumentException("Unsupported capacity: " + capacity);
79          }
80          // This will generate a load factor at capacity in the range (0.25, 0.5]
81          // The use of Math.max will ignore zero/negative capacity requests.
82          set = new int[nextPow2(Math.max(MIN_SIZE, capacity * 2))];
83      }
84  
85      /**
86       * Return the memory footprint in bytes. This is always a power of 2.
87       *
88       * <p>This will return the size as if not limited to a capacity of 2<sup>29</sup>.
89       * In this case the size will exceed the maximum size of an {@code int[]} array.
90       *
91       * <p>This method is intended to provide information to choose if the data structure
92       * is memory efficient.
93       *
94       * @param capacity Capacity.
95       * @return the memory footprint
96       */
97      static long memoryFootprint(int capacity) {
98          if (capacity <= (MIN_SIZE >> 1)) {
99              // 4 bytes/int
100             return MIN_SIZE << 2;
101         }
102         // Double the next power of 2, then convert integer count to bytes (4 bytes/int)
103         // * 2 * 4 == * 2^3
104         return Integer.toUnsignedLong(nextPow2(capacity)) << 3;
105     }
106 
107     /**
108      * Returns the closest power-of-two number greater than or equal to {@code value}.
109      *
110      * <p>Warning: This will return {@link Integer#MIN_VALUE} for any {@code value} above
111      * {@code 1 << 30}. This is the next power of 2 as an unsigned integer.
112      *
113      * <p>See <a
114      * href="https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2">Bit
115      * Hacks: Rounding up to a power of 2</a>
116      *
117      * @param value Value.
118      * @return the closest power-of-two number greater than or equal to value
119      */
120     private static int nextPow2(int value) {
121         int result = value - 1;
122         result |= result >>> 1;
123         result |= result >>> 2;
124         result |= result >>> 4;
125         result |= result >>> 8;
126         return (result | (result >>> 16)) + 1;
127     }
128 
129     /**
130      * Adds the {@code index} to the set.
131      *
132      * @param index Index.
133      * @return true if the set was modified by the operation
134      * @throws IndexOutOfBoundsException if the index is negative
135      */
136     boolean add(int index) {
137         if (index < 0) {
138             throw new IndexOutOfBoundsException(INVALID_INDEX + index);
139         }
140         final int[] keys = set;
141         final int key = ~index;
142         final int mask = keys.length - 1;
143         int pos = mix(index) & mask;
144         int curr = keys[pos];
145         if (curr < 0) {
146             if (curr == key) {
147                 // Already present
148                 return false;
149             }
150             // Probe
151             while ((curr = keys[pos = (pos + 1) & mask]) < 0) {
152                 if (curr == key) {
153                     // Already present
154                     return false;
155                 }
156             }
157         }
158         // Insert
159         keys[pos] = key;
160         // Here the load factor is 0.5: Test if size > keys.length * 0.5
161         if (++size > (mask + 1) >>> 1) {
162             // This is where we should grow the size of the set and re-insert
163             // all current keys into the new key storage. Here we are using a
164             // fixed capacity so raise an exception.
165             throw new IllegalStateException("Functional capacity exceeded: " + (keys.length >>> 1));
166         }
167         return true;
168     }
169 
170     /**
171      * Test if the {@code index} is in the set.
172      *
173      * <p>This method is present for testing. It is not required when filtering a collection
174      * of indices with duplicates to a unique set of indices.
175      *
176      * @param index Index.
177      * @return true if the set contains the index
178      * @throws IndexOutOfBoundsException if the index is negative
179      */
180     boolean contains(int index) {
181         if (index < 0) {
182             throw new IndexOutOfBoundsException(INVALID_INDEX + index);
183         }
184         final int[] keys = set;
185         final int mask = keys.length - 1;
186         int pos = mix(index) & mask;
187         int curr = keys[pos];
188         if (curr == 0) {
189             return false;
190         }
191         final int key = ~index;
192         if (curr == key) {
193             return true;
194         }
195         // Probe
196         while (true) {
197             pos = (pos + 1) & mask;
198             curr = keys[pos];
199             if (curr == 0) {
200                 // No more entries
201                 return false;
202             }
203             if (curr == key) {
204                 return true;
205             }
206         }
207     }
208 
209     /**
210      * Mix the bits of an integer.
211      *
212      * <p>This is the fast hash function used in the linear hash implementation in the <a
213      * href="https://github.com/leventov/Koloboke">Koloboke Collections</a>.
214      *
215      * @param x Bits.
216      * @return the mixed bits
217      */
218     private static int mix(int x) {
219         final int h = x * PHI;
220         return h ^ (h >>> 16);
221     }
222 
223     /**
224      * Returns the number of distinct indices in the set.
225      *
226      * @return the size
227      */
228     int size() {
229         return size;
230     }
231 
232     /**
233      * Write each index in the set into the provided array. Returns the number of indices.
234      *
235      * <p>The caller must ensure the output array has sufficient capacity.
236      *
237      * <p>Warning: The indices are not ordered.
238      *
239      * <p>This method is present for testing. It is not required when filtering a collection
240      * of indices with duplicates to a unique set of indices. It can be used to write
241      * out the unique indices, but they are not in the encounter order of indices
242      * {@link #add(int) added} to the set.
243      *
244      * @param a Output array.
245      * @return count of indices
246      * @see #size()
247      */
248     int toArray(int[] a) {
249         final int[] keys = set;
250         int c = 0;
251         for (final int key : keys) {
252             if (key < 0) {
253                 a[c++] = ~key;
254             }
255         }
256         // assert c == size
257         return c;
258     }
259 }