EnhancedDoubleHasher.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. import java.util.Objects;
  19. import java.util.function.IntPredicate;

  20. /**
  21.  * A Hasher that implements combinatorial hashing as described by
  22.  * <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch and Mitzenmacher</a> using the enhanced double hashing technique
  23.  * described in the wikipedia article  <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>.
  24.  * <p>
  25.  * Common use for this hasher is to generate bit indices from a byte array output of a hashing
  26.  * or MessageDigest algorithm.</p>
  27.  *
  28.  * <h2>Thoughts on the hasher input</h2>
  29.  *
  30.  * <p>Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than
  31.  * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be
  32.  * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits
  33.  * changes (but is still larger than the {@code increment}). In a worse case scenario with small {@code initial} and {@code increment} for
  34.  * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial}
  35.  * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the
  36.  * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275, this covers 4.25% of the filter. This also
  37.  * ignores the negative wrapping but the behavior is the same, some bits cannot be reached.
  38.  * </p><p>
  39.  * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger
  40.  * than the number of bits then the modulus will create a 'random' position and increment within the size.
  41.  * </p>
  42.  *
  43.  * @since 4.5.0-M1
  44.  */
  45. public class EnhancedDoubleHasher implements Hasher {

  46.     /**
  47.      * Convert bytes to big-endian long filling with zero bytes as necessary.
  48.      *
  49.      * @param byteArray the byte array to extract the values from.
  50.      * @param offset the offset to start extraction from.
  51.      * @param len the length of the extraction, may be longer than 8.
  52.      * @return
  53.      */
  54.     private static long toLong(final byte[] byteArray, final int offset, final int len) {
  55.         long val = 0;
  56.         int shift = Long.SIZE;
  57.         final int end = offset + Math.min(len, Long.BYTES);
  58.         for (int i = offset; i < end; i++) {
  59.             shift -= Byte.SIZE;
  60.             val |= (long) (byteArray[i] & 0xFF) << shift;
  61.         }
  62.         return val;
  63.     }

  64.     /**
  65.      * The initial hash value.
  66.      */
  67.     private final long initial;

  68.     /**
  69.      * The value to increment the hash value by.
  70.      */
  71.     private final long increment;

  72.     /**
  73.      * Constructs the EnhancedDoubleHasher from a byte array.
  74.      * <p>
  75.      * This method simplifies the conversion from a Digest or hasher algorithm output
  76.      * to the two values used by the EnhancedDoubleHasher.</p>
  77.      * <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value.
  78.      * Excess bytes are ignored.
  79.      * If there are fewer than 16 bytes the following conversions are made.
  80.      * </p>
  81.      * <ol>
  82.      * <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li>
  83.      * <li>The bytes allotted are read in big-endian order any byte not populated is set to zero.</li>
  84.      * </ol>
  85.      * <p>
  86.      * This ensures that small arrays generate the largest possible increment and initial values.
  87.      * </p>
  88.      *
  89.      * @param buffer the buffer to extract the longs from.
  90.      * @throws IllegalArgumentException is buffer length is zero.
  91.      */
  92.     public EnhancedDoubleHasher(final byte[] buffer) {
  93.         if (buffer.length == 0) {
  94.             throw new IllegalArgumentException("buffer length must be greater than 0");
  95.         }
  96.         // divide by 2
  97.         final int segment = buffer.length / 2;
  98.         this.initial = toLong(buffer, 0, segment);
  99.         this.increment = toLong(buffer, segment, buffer.length - segment);
  100.     }

  101.     /**
  102.      * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values.
  103.      *
  104.      * @param initial The initial value for the hasher.
  105.      * @param increment The value to increment the hash by on each iteration.
  106.      */
  107.     public EnhancedDoubleHasher(final long initial, final long increment) {
  108.         this.initial = initial;
  109.         this.increment = increment;
  110.     }

  111.     /**
  112.      * Gets the increment value for the hash calculation.
  113.      *
  114.      * @return the increment value for the hash calculation.
  115.      */
  116.     long getIncrement() {
  117.         return increment;
  118.     }

  119.     /**
  120.      * Gets the initial value for the hash calculation.
  121.      *
  122.      * @return the initial value for the hash calculation.
  123.      */
  124.     long getInitial() {
  125.         return initial;
  126.     }

  127.     @Override
  128.     public IndexExtractor indices(final Shape shape) {
  129.         Objects.requireNonNull(shape, "shape");

  130.         return new IndexExtractor() {

  131.             @Override
  132.             public int[] asIndexArray() {
  133.                 final int[] result = new int[shape.getNumberOfHashFunctions()];
  134.                 final int[] idx = new int[1];

  135.                 // This method needs to return duplicate indices

  136.                 processIndices(i -> {
  137.                     result[idx[0]++] = i;
  138.                     return true;
  139.                 });
  140.                 return result;
  141.             }

  142.             @Override
  143.             public boolean processIndices(final IntPredicate consumer) {
  144.                 Objects.requireNonNull(consumer, "consumer");
  145.                 final int bits = shape.getNumberOfBits();
  146.                 // Enhanced double hashing:
  147.                 // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits
  148.                 // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing
  149.                 //
  150.                 // Essentially this is computing a wrapped modulus from a start point and an
  151.                 // increment and an additional term as a tetrahedral number.
  152.                 // You only need two modulus operations before the loop. Within the loop
  153.                 // the modulus is handled using the sign bit to detect wrapping to ensure:
  154.                 // 0 <= index < bits
  155.                 // 0 <= inc < bits
  156.                 // The final hash is:
  157.                 // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits)

  158.                 int index = BitMaps.mod(initial, bits);
  159.                 if (!consumer.test(index)) {
  160.                     return false;
  161.                 }
  162.                 int inc = BitMaps.mod(increment, bits);

  163.                 final int k = shape.getNumberOfHashFunctions();

  164.                 if (k >= bits) {
  165.                     // the tetraheadral incrementer.  We need to ensure that this
  166.                     // number does not exceed bits-1 or we may end up with an index > bits.
  167.                     int tet = 1;
  168.                     for (int i = 1; i < k; i++) {
  169.                         // Update index and handle wrapping
  170.                         index -= inc;
  171.                         index = index < 0 ? index + bits : index;
  172.                         if (!consumer.test(index)) {
  173.                             return false;
  174.                         }

  175.                         // Incorporate the counter into the increment to create a
  176.                         // tetrahedral number additional term, and handle wrapping.
  177.                         inc -= tet;
  178.                         inc = inc < 0 ? inc + bits : inc;
  179.                         if (++tet == bits) {
  180.                             tet = 0;
  181.                         }
  182.                     }
  183.                 } else {
  184.                     for (int i = 1; i < k; i++) {
  185.                         // Update index and handle wrapping
  186.                         index -= inc;
  187.                         index = index < 0 ? index + bits : index;
  188.                         if (!consumer.test(index)) {
  189.                             return false;
  190.                         }

  191.                         // Incorporate the counter into the increment to create a
  192.                         // tetrahedral number additional term, and handle wrapping.
  193.                         inc -= i;
  194.                         inc = inc < 0 ? inc + bits : inc;
  195.                     }

  196.                 }
  197.                 return true;
  198.             }
  199.         };
  200.     }
  201. }