View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.numbers.examples.jmh.arrays;
18  
19  /**
20   * A fixed size set of indices within an inclusive range {@code [left, right]}.
21   *
22   * <p>This is a specialised class to implement a data structure similar to a
23   * {@link java.util.BitSet}. It supports a fixed size and contains the methods required to
24   * store and look-up intervals of indices.
25   *
26   * <p>An offset is supported to allow the fixed size to cover a range of indices starting
27   * above 0 with the most efficient usage of storage.
28   *
29   * <p>In contrast to a {@link java.util.BitSet}, the data structure does not store all
30   * indices in the range. Indices are compressed by a power of 2. The structure can return
31   * with 100% accuracy if a query index is not within the range. It cannot return with 100%
32   * accuracy if a query index is contained within the range. The presence of a query index
33   * is a probabilistic statement that there is an index within a range of the query index.
34   * The range is defined by the compression level {@code c}.
35   *
36   * <p>Indices are stored offset from {@code left} and compressed. A compressed index
37   * represents 2<sup>c</sup> real indices:
38   *
39   * <pre>
40   * Interval:         012345678
41   * Compressed (c=1): 0-1-2-3-4
42   * Compressed (c=2): 0---1---2
43   * Compressed (c=2): 0-------1
44   * </pre>
45   *
46   * <p>When scanning for the next index the identified compressed index is decompressed and
47   * the lower bound of the range represented by the index is returned.
48   *
49   * <p>When scanning for the previous index the identified compressed index is decompressed
50   * and the upper bound of the range represented by the index is returned.
51   *
52   * <p>When scanning in either direction, if the search index is inside a compressed index
53   * the search index is returned.
54   *
55   * <p>Note: Search for the {@link SearchableInterval} interface outside the supported bounds
56   * {@code [left, right]} is not supported and will result in an {@link IndexOutOfBoundsException}.
57   *
58   * <p>See the BloomFilter code in Commons Collections for use of long[] data to store
59   * bits.
60   *
61   * @since 1.2
62   */
63  final class CompressedIndexSet implements SearchableInterval, SearchableInterval2 {
64      /** All 64-bits bits set. */
65      private static final long LONG_MASK = -1L;
66      /** A bit shift to apply to an integer to divided by 64 (2^6). */
67      private static final int DIVIDE_BY_64 = 6;
68  
69      /** Bit indexes. */
70      private final long[] data;
71  
72      /** Left bound of the support. */
73      private final int left;
74      /** Right bound of the support. */
75      private final int right;
76      /** Compression level. */
77      private final int compression;
78  
79      /**
80       * Create an instance to store indices within the range {@code [left, right]}.
81       *
82       * @param compression Compression level (in {@code [1, 31])}
83       * @param l Lower bound (inclusive).
84       * @param r Upper bound (inclusive).
85       */
86      private CompressedIndexSet(int compression, int l, int r) {
87          this.compression = compression;
88          this.left = l;
89          // Note: The functional upper bound may be higher but the next/previous functionality
90          // support scanning in the original [left, right] bound.
91          this.right = r;
92          // Note: This may allow directly writing to index > right if there
93          // is extra capacity.
94          data = new long[getLongIndex((r - l) >>> 1) + 1];
95      }
96  
97      /**
98       * Create an instance to store indices within the range {@code [left, right]}.
99       * The instance is initially empty.
100      *
101      * <p>Warning: To use this object as an {@link SearchableInterval} the left and right
102      * indices should be added to the set.
103      *
104      * @param compression Compression level (in {@code [1, 31])}
105      * @param left Lower bound (inclusive).
106      * @param right Upper bound (inclusive).
107      * @return the index set
108      * @throws IllegalArgumentException if {@code compression} is not in {@code [1, 31]};
109      * or if {@code right < left}; or if {@code left < 0}
110      */
111     static CompressedIndexSet ofRange(int compression, int left, int right) {
112         checkCompression(compression);
113         checkLeft(left);
114         checkRange(left, right);
115         return new CompressedIndexSet(compression, left, right);
116     }
117 
118     /**
119      * Initialise an instance with the {@code indices}. The capacity is defined by the
120      * range required to store the minimum and maximum index at the specified
121      * {@code compression} level.
122      *
123      * <p>This object can be used as an {@link SearchableInterval} as the left and right
124      * indices will be set.
125      *
126      * @param compression Compression level (in {@code [1, 31])}
127      * @param indices Indices.
128      * @return the index set
129      * @throws IllegalArgumentException if {@code compression} is not in {@code [1, 31]};
130      * or if {@code indices.length == 0}; or if {@code left < 0}
131      */
132     static CompressedIndexSet of(int compression, int[] indices) {
133         return of(compression, indices, indices.length);
134     }
135 
136     /**
137      * Initialise an instance with the {@code indices}. The capacity is defined by the
138      * range required to store the minimum and maximum index at the specified
139      * {@code compression} level.
140      *
141      * <p>This object can be used as an {@link SearchableInterval} as the left and right
142      * indices will be set.
143      *
144      * @param compression Compression level (in {@code [1, 31])}
145      * @param indices Indices.
146      * @param n Number of indices.
147      * @return the index set
148      * @throws IllegalArgumentException if {@code compression} is not in {@code [1, 31]};
149      * or if {@code n == 0}; or if {@code left < 0}
150      */
151     static CompressedIndexSet of(int compression, int[] indices, int n) {
152         if (n <= 0) {
153             throw new IllegalArgumentException("No indices to define the range");
154         }
155         checkCompression(compression);
156         int min = indices[0];
157         int max = min;
158         for (int i = 0; ++i < n;) {
159             min = Math.min(min, indices[i]);
160             max = Math.max(max, indices[i]);
161         }
162         checkLeft(min);
163         final CompressedIndexSet set = new CompressedIndexSet(compression, min, max);
164         for (int i = -1; ++i < n;) {
165             set.set(indices[i]);
166         }
167         return set;
168     }
169 
170     /**
171      * Create an {@link IndexIterator} with the {@code indices}.
172      *
173      * @param compression Compression level (in {@code [1, 31])}
174      * @param indices Indices.
175      * @param n Number of indices.
176      * @return the index set
177      * @throws IllegalArgumentException if {@code compression} is not in {@code [1, 31]};
178      * or if {@code n == 0}; or if {@code left < 0}
179      */
180     static IndexIterator iterator(int compression, int[] indices, int n) {
181         return of(compression, indices, n).new Iterator();
182     }
183 
184     /**
185      * Gets the compressed index for this instance using the left bound and the
186      * compression level.
187      *
188      * @param index Index.
189      * @return the compressed index
190      */
191     private int compressIndex(int index) {
192         return (index - left) >>> compression;
193     }
194 
195     /**
196      * Gets the filter index for the specified bit index assuming the filter is using
197      * 64-bit longs to store bits starting at index 0.
198      *
199      * <p>The index is assumed to be positive. For a positive index the result will match
200      * {@code bitIndex / 64}.</p>
201      *
202      * <p><em>The divide is performed using bit shifts. If the input is negative the
203      * behavior is not defined.</em></p>
204      *
205      * @param bitIndex the bit index (assumed to be positive)
206      * @return the index of the bit map in an array of bit maps.
207      */
208     private static int getLongIndex(final int bitIndex) {
209         // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is
210         // positive.
211         // We do not explicitly check for a negative here. Instead we use a
212         // signed shift. Any negative index will produce a negative value
213         // by sign-extension and if used as an index into an array it will throw an
214         // exception.
215         return bitIndex >> DIVIDE_BY_64;
216     }
217 
218     /**
219      * Gets the filter bit mask for the specified bit index assuming the filter is using
220      * 64-bit longs to store bits starting at index 0. The returned value is a
221      * {@code long} with only 1 bit set.
222      *
223      * <p>The index is assumed to be positive. For a positive index the result will match
224      * {@code 1L << (bitIndex % 64)}.</p>
225      *
226      * <p><em>If the input is negative the behavior is not defined.</em></p>
227      *
228      * @param bitIndex the bit index (assumed to be positive)
229      * @return the filter bit
230      */
231     private static long getLongBit(final int bitIndex) {
232         // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this
233         // using 0x3f (63) or compute bitIndex % 64.
234         // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and
235         // this will identify an incorrect bit.
236         return 1L << bitIndex;
237     }
238 
239     /**
240      * Returns the value of the bit with the specified index.
241      *
242      * <p>Warning: This has no range checks.
243      *
244      * @param bitIndex the bit index (assumed to be positive)
245      * @return the value of the bit with the specified index
246      */
247     boolean get(int bitIndex) {
248         // WARNING: No range checks !!!
249         final int index = compressIndex(bitIndex);
250         final int i = getLongIndex(index);
251         final long m = getLongBit(index);
252         return (data[i] & m) != 0;
253     }
254 
255     /**
256      * Sets the bit at the specified index to {@code true}.
257      *
258      * <p>Warning: This has no range checks.
259      *
260      * @param bitIndex the bit index (assumed to be positive)
261      */
262     void set(int bitIndex) {
263         // WARNING: No range checks !!!
264         final int index = compressIndex(bitIndex);
265         final int i = getLongIndex(index);
266         final long m = getLongBit(index);
267         data[i] |= m;
268     }
269 
270 
271     @Override
272     public int left() {
273         return left;
274     }
275 
276     @Override
277     public int right() {
278         return right;
279     }
280 
281     /**
282      * Returns the nearest index that occurs on or before the specified starting
283      * index, or {@code left - 1} if no such index exists.
284      *
285      * <p>This method exists for comparative testing to {@link #previousIndex(int)}.
286      *
287      * @param k Index to start checking from (inclusive).
288      * @return the previous index, or {@code left - 1}
289      */
290     int previousIndexOrLeftMinus1(int k) {
291         if (k < left) {
292             // index is in an unknown range
293             return left - 1;
294         }
295         // Support searching backward through the known range
296         final int index = compressIndex(k > right ? right : k);
297 
298         int i = getLongIndex(index);
299         long bits = data[i];
300 
301         // Check if this is within a compressed index. If so return the exact result.
302         if ((bits & getLongBit(index)) != 0) {
303             return Math.min(k, right);
304         }
305 
306         // Mask bits before the bit index
307         // mask = 00011111 = -1L >>> (64 - ((index + 1) % 64))
308         bits &= LONG_MASK >>> -(index + 1);
309         for (;;) {
310             if (bits != 0) {
311                 //(i+1)       i
312                 // |   c      |
313                 // |   |      |
314                 // 0  001010000
315                 final int c = (i + 1) * Long.SIZE - Long.numberOfLeadingZeros(bits);
316                 // Decompress the prior unset bit to an index. When inflated this is the
317                 // next index above the upper bound of the compressed range so subtract 1.
318                 return (c << compression) - 1 + left;
319             }
320             if (i == 0) {
321                 return left - 1;
322             }
323             bits = data[--i];
324         }
325     }
326 
327     /**
328      * Returns the nearest index that occurs on or after the specified starting
329      * index, or {@code right + 1} if no such index exists.
330      *
331      * <p>This method exists for comparative testing to {@link #nextIndex(int)}.
332      *
333      * @param k Index to start checking from (inclusive).
334      * @return the next index, or {@code right + 1}
335      */
336     int nextIndexOrRightPlus1(int k) {
337         if (k > right) {
338             // index is in an unknown range
339             return right + 1;
340         }
341         // Support searching forward through the known range
342         final int index = compressIndex(k < left ? left : k);
343 
344         int i = getLongIndex(index);
345         long bits = data[i];
346 
347         // Check if this is within a compressed index. If so return the exact result.
348         if ((bits & getLongBit(index)) != 0) {
349             return Math.max(k, left);
350         }
351 
352         // Mask bits after the bit index
353         // mask = 11111000 = -1L << (index % 64)
354         bits &= LONG_MASK << index;
355         for (;;) {
356             if (bits != 0) {
357                 //(i+1)       i
358                 // |      c   |
359                 // |      |   |
360                 // 0  001010000
361                 final int c = i * Long.SIZE + Long.numberOfTrailingZeros(bits);
362                 // Decompress the set bit to an index. When inflated this is the lower bound of
363                 // the compressed range and is OK for next scanning.
364                 return (c << compression) + left;
365             }
366             if (++i == data.length) {
367                 return right + 1;
368             }
369             bits = data[i];
370         }
371     }
372 
373     @Override
374     public int previousIndex(int k) {
375         // WARNING: No range checks !!!
376         // Assume left <= k <= right and that left and right are set bits acting as sentinals.
377         final int index = compressIndex(k);
378 
379         int i = getLongIndex(index);
380         long bits = data[i];
381 
382         // Check if this is within a compressed index. If so return the exact result.
383         if ((bits & getLongBit(index)) != 0) {
384             return k;
385         }
386 
387         // Mask bits before the bit index
388         // mask = 00011111 = -1L >>> (64 - ((index + 1) % 64))
389         bits &= LONG_MASK >>> -(index + 1);
390         for (;;) {
391             if (bits != 0) {
392                 //(i+1)       i
393                 // |   c      |
394                 // |   |      |
395                 // 0  001010000
396                 final int c = (i + 1) * Long.SIZE - Long.numberOfLeadingZeros(bits);
397                 // Decompress the prior unset bit to an index. When inflated this is the
398                 // next index above the upper bound of the compressed range so subtract 1.
399                 return (c << compression) - 1 + left;
400             }
401             // Unsupported: the interval should contain k
402             //if (i == 0) {
403             //    return left - 1;
404             //}
405             bits = data[--i];
406         }
407     }
408 
409     @Override
410     public int nextIndex(int k) {
411         // WARNING: No range checks !!!
412         // Assume left <= k <= right and that left and right are set bits acting as sentinals.
413         final int index = compressIndex(k);
414 
415         int i = getLongIndex(index);
416         long bits = data[i];
417 
418         // Check if this is within a compressed index. If so return the exact result.
419         if ((bits & getLongBit(index)) != 0) {
420             return k;
421         }
422 
423         // Mask bits after the bit index
424         // mask = 11111000 = -1L << (index % 64)
425         bits &= LONG_MASK << index;
426         for (;;) {
427             if (bits != 0) {
428                 //(i+1)       i
429                 // |      c   |
430                 // |      |   |
431                 // 0  001010000
432                 final int c = i * Long.SIZE + Long.numberOfTrailingZeros(bits);
433                 // Decompress the set bit to an index. When inflated this is the lower bound of
434                 // the compressed range and is OK for next scanning.
435                 return (c << compression) + left;
436             }
437             // Unsupported: the interval should contain k
438             //if (++i == data.length) {
439             //    return right + 1;
440             //}
441             bits = data[++i];
442         }
443     }
444 
445     // SearchableInterval2
446     // This is exactly the same as SearchableInterval as the pointers i are the same as the keys k
447 
448     @Override
449     public int start() {
450         return left();
451     }
452 
453     @Override
454     public int end() {
455         return right();
456     }
457 
458     @Override
459     public int index(int i) {
460         return i;
461     }
462 
463     @Override
464     public int previous(int i, int k) {
465         return previousIndex(k);
466     }
467 
468     @Override
469     public int next(int i, int k) {
470         return nextIndex(k);
471     }
472 
473     /**
474      * Returns the index of the first bit that is set to {@code false} that occurs on or
475      * before the specified starting index <em>within the supported range</em>. If no such
476      * bit exists then {@code left - 1} is returned.
477      *
478      * <p>Assumes {@code k} is within an enabled compressed index.
479      *
480      * @param k Index to start checking from (inclusive).
481      * @return the index of the previous unset bit, or {@code left - 1} if there is no such bit
482      */
483     int previousClearBit(int k) {
484         // WARNING: No range checks !!!
485         // Assume left <= k <= right and that left and right are set bits acting as sentinals.
486         final int index = compressIndex(k);
487 
488         int i = getLongIndex(index);
489 
490         // Note: This method is conceptually the same as previousIndex with the exception
491         // that: all the data is bit-flipped; a check is made when the scan reaches the end;
492         // and no check is made for k within an unset compressed index.
493 
494         // Mask bits before the bit index
495         // mask = 00011111 = -1L >>> (64 - ((index + 1) % 64))
496         long bits = ~data[i] & (LONG_MASK >>> -(index + 1));
497         for (;;) {
498             if (bits != 0) {
499                 final int c = (i + 1) * Long.SIZE - Long.numberOfLeadingZeros(bits);
500                 return (c << compression) - 1 + left;
501             }
502             if (i == 0) {
503                 return left - 1;
504             }
505             bits = ~data[--i];
506         }
507     }
508 
509     /**
510      * Returns the index of the first bit that is set to {@code false} that occurs on or
511      * after the specified starting index <em>within the supported range</em>. If no such
512      * bit exists then the {@code capacity} is returned where {@code capacity = index + 1}
513      * with {@code index} the largest index that can be added to the set without an error.
514      *
515      * <p>Assumes {@code k} is within an enabled compressed index.
516      *
517      * @param k Index to start checking from (inclusive).
518      * @return the index of the next unset bit, or the {@code capacity} if there is no such bit
519      */
520     int nextClearBit(int k) {
521         // WARNING: No range checks !!!
522         // Assume left <= k <= right
523         final int index = compressIndex(k);
524 
525         int i = getLongIndex(index);
526 
527         // Note: This method is conceptually the same as nextIndex with the exception
528         // that: all the data is bit-flipped; a check is made for the capacity when the
529         // scan reaches the end; and no check is made for k within an unset compressed index.
530 
531         // Mask bits after the bit index
532         // mask = 11111000 = -1L << (fromIndex % 64)
533         long bits = ~data[i] & (LONG_MASK << index);
534         for (;;) {
535             if (bits != 0) {
536                 final int c = i * Long.SIZE + Long.numberOfTrailingZeros(bits);
537                 return (c << compression) + left;
538             }
539             if (++i == data.length) {
540                 // Capacity
541                 return right + 1;
542             }
543             bits = ~data[i];
544         }
545     }
546 
547     /**
548      * Check the compression is valid.
549      *
550      * @param compression Compression level.
551      * @throws IllegalArgumentException if {@code compression} is not in {@code [1, 31]}
552      */
553     private static void checkCompression(int compression) {
554         if (!(compression > 0 && compression <= 31)) {
555             throw new IllegalArgumentException("Invalid compression: " + compression);
556         }
557     }
558 
559     /**
560      * Check the lower bound to the range is valid.
561      *
562      * @param left Lower bound (inclusive).
563      * @throws IllegalArgumentException if {@code left < 0}
564      */
565     private static void checkLeft(int left) {
566         if (left < 0) {
567             throw new IllegalArgumentException("Invalid lower index: " + left);
568         }
569     }
570 
571     /**
572      * Check the range is valid.
573      *
574      * @param left Lower bound (inclusive).
575      * @param right Upper bound (inclusive).
576      * @throws IllegalArgumentException if {@code right < left}
577      */
578     private static void checkRange(int left, int right) {
579         if (right < left) {
580             throw new IllegalArgumentException(
581                 String.format("Invalid range: [%d, %d]", left, right));
582         }
583     }
584 
585     /**
586      * {@link IndexIterator} implementation.
587      *
588      * <p>This iterator can efficiently iterate over high-density indices
589      * if the compression level is set to create spacing equal to or above the expected
590      * separation between indices.
591      */
592     private class Iterator implements IndexIterator {
593         /** Iterator left. l is a compressed index. */
594         private int l;
595         /** Iterator right. (r+1) is a clear bit. */
596         private int r;
597         /** Next iterator left. Cached for look ahead functionality. */
598         private int nextL;
599 
600         /**
601          * Create an instance.
602          */
603         Iterator() {
604             l = CompressedIndexSet.this.left();
605             r = nextClearBit(l) - 1;
606             if (r < end()) {
607                 nextL = nextIndex(r + 1);
608             } else {
609                 // Entire range is saturated
610                 r = end();
611             }
612         }
613 
614         @Override
615         public int left() {
616             return l;
617         }
618 
619         @Override
620         public int right() {
621             return r;
622         }
623 
624         @Override
625         public int end() {
626             return CompressedIndexSet.this.right();
627         }
628 
629         @Override
630         public boolean next() {
631             if (r < end()) {
632                 // Reuse the cached next left and advance
633                 l = nextL;
634                 r = nextClearBit(l) - 1;
635                 if (r < end()) {
636                     nextL = nextIndex(r + 1);
637                 } else {
638                     r = end();
639                 }
640                 return true;
641             }
642             return false;
643         }
644 
645         @Override
646         public boolean positionAfter(int index) {
647             // Even though this can provide random access we only allow advancing
648             if (r > index) {
649                 return true;
650             }
651             if (index < end()) {
652                 // Note: Uses 3 scans as it maintains the next left.
653                 // For low density indices scanning for next left will be expensive
654                 // and it would be more efficient to only compute next left on demand.
655                 // For high density indices the next left will be close to
656                 // the new right and the cost is low.
657                 // This iterator favours use on high density indices. A variant
658                 // iterator could be created for comparison purposes.
659 
660                 if (get(index + 1)) {
661                     // (index+1) is set.
662                     // Find [left <= index+1 <= right]
663                     r = nextClearBit(index + 1) - 1;
664                     if (r < end()) {
665                         nextL = nextIndex(r + 1);
666                     } else {
667                         r = end();
668                     }
669                     l = index + 1;
670                     //l = previousClearBit(index) + 1;
671                 } else {
672                     // (index+1) is clear.
673                     // Advance to the next [left, right] pair
674                     l = nextIndex(index + 1);
675                     r = nextClearBit(l) - 1;
676                     if (r < end()) {
677                         nextL = nextIndex(r + 1);
678                     } else {
679                         r = end();
680                     }
681                 }
682                 return true;
683             }
684             // Advance to end. No next left. Not positioned after the target index.
685             l = r = end();
686             return false;
687         }
688 
689         @Override
690         public boolean nextAfter(int index) {
691             if (r < end()) {
692                 return nextL > index;
693             }
694             // no more indices
695             return true;
696         }
697     }
698 }