View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Objects;
20  import java.util.function.IntPredicate;
21  
22  /**
23   * A Hasher that implements combinatorial hashing as described by
24   * <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch and Mitzenmacher</a> using the enhanced double hashing technique
25   * described in the wikipedia article  <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>.
26   * <p>
27   * Common use for this hasher is to generate bit indices from a byte array output of a hashing
28   * or MessageDigest algorithm.</p>
29   *
30   * <h2>Thoughts on the hasher input</h2>
31   *
32   *<p>Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than
33   * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be
34   * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits
35   * changes (but is still larger than the {@code increment}). In a worse case scenario with small {@code initial} and {@code increment} for
36   * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial}
37   * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the
38   * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275, this covers 4.25% of the filter. This also
39   * ignores the negative wrapping but the behavior is the same, some bits cannot be reached.
40   * </p><p>
41   * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger
42   * than the number of bits then the modulus will create a 'random' position and increment within the size.
43   * </p>
44   *
45   * @since 4.5
46   */
47  public class EnhancedDoubleHasher implements Hasher {
48  
49      /**
50       * Convert bytes to big-endian long filling with zero bytes as necessary..
51       * @param byteArray the byte array to extract the values from.
52       * @param offset the offset to start extraction from.
53       * @param len the length of the extraction, may be longer than 8.
54       * @return
55       */
56      private static long toLong(final byte[] byteArray, final int offset, final int len) {
57          long val = 0;
58          int shift = Long.SIZE;
59          final int end = offset + Math.min(len, Long.BYTES);
60          for (int i = offset; i < end; i++) {
61              shift -= Byte.SIZE;
62              val |= (long) (byteArray[i] & 0xFF) << shift;
63          }
64          return val;
65      }
66  
67      /**
68       * The initial hash value.
69       */
70      private final long initial;
71  
72      /**
73       * The value to increment the hash value by.
74       */
75      private final long increment;
76  
77      /**
78       * Constructs the EnhancedDoubleHasher from a byte array.
79       * <p>
80       * This method simplifies the conversion from a Digest or hasher algorithm output
81       * to the two values used by the EnhancedDoubleHasher.</p>
82       * <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value.
83       * Excess bytes are ignored.
84       * If there are fewer than 16 bytes the following conversions are made.
85       *</p>
86       * <ol>
87       * <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li>
88       * <li>The bytes alloted are read in big-endian order any byte not populated is set to zero.</li>
89       * </ol>
90       * <p>
91       * This ensures that small arrays generate the largest possible increment and initial values.
92       * </p>
93       * @param buffer the buffer to extract the longs from.
94       * @throws IllegalArgumentException is buffer length is zero.
95       */
96      public EnhancedDoubleHasher(final byte[] buffer) {
97          if (buffer.length == 0) {
98              throw new IllegalArgumentException("buffer length must be greater than 0");
99          }
100         // divide by 2
101         final int segment = buffer.length / 2;
102         this.initial = toLong(buffer, 0, segment);
103         this.increment = toLong(buffer, segment, buffer.length - segment);
104     }
105 
106     /**
107      * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values.
108      * @param initial The initial value for the hasher.
109      * @param increment The value to increment the hash by on each iteration.
110      */
111     public EnhancedDoubleHasher(final long initial, final long increment) {
112         this.initial = initial;
113         this.increment = increment;
114     }
115 
116     /**
117      * Gets the increment value for the hash calculation.
118      * @return the increment value for the hash calculation.
119      */
120     long getIncrement() {
121         return increment;
122     }
123 
124     /**
125      * Gets the initial value for the hash calculation.
126      * @return the initial value for the hash calculation.
127      */
128     long getInitial() {
129         return initial;
130     }
131 
132     @Override
133     public IndexProducer indices(final Shape shape) {
134         Objects.requireNonNull(shape, "shape");
135 
136         return new IndexProducer() {
137 
138             @Override
139             public int[] asIndexArray() {
140                 final int[] result = new int[shape.getNumberOfHashFunctions()];
141                 final int[] idx = new int[1];
142 
143                 // This method needs to return duplicate indices
144 
145                 forEachIndex(i -> {
146                     result[idx[0]++] = i;
147                     return true;
148                 });
149                 return result;
150             }
151 
152             @Override
153             public boolean forEachIndex(final IntPredicate consumer) {
154                 Objects.requireNonNull(consumer, "consumer");
155                 final int bits = shape.getNumberOfBits();
156                 // Enhanced double hashing:
157                 // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits
158                 // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing
159                 //
160                 // Essentially this is computing a wrapped modulus from a start point and an
161                 // increment and an additional term as a tetrahedral number.
162                 // You only need two modulus operations before the loop. Within the loop
163                 // the modulus is handled using the sign bit to detect wrapping to ensure:
164                 // 0 <= index < bits
165                 // 0 <= inc < bits
166                 // The final hash is:
167                 // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits)
168 
169                 int index = BitMap.mod(initial, bits);
170                 int inc = BitMap.mod(increment, bits);
171 
172                 final int k = shape.getNumberOfHashFunctions();
173                 if (k > bits) {
174                     for (int j = k; j > 0;) {
175                         // handle k > bits
176                         final int block = Math.min(j, bits);
177                         j -= block;
178                         for (int i = 0; i < block; i++) {
179                             if (!consumer.test(index)) {
180                                 return false;
181                             }
182                             // Update index and handle wrapping
183                             index -= inc;
184                             index = index < 0 ? index + bits : index;
185 
186                             // Incorporate the counter into the increment to create a
187                             // tetrahedral number additional term, and handle wrapping.
188                             inc -= i;
189                             inc = inc < 0 ? inc + bits : inc;
190                         }
191                     }
192                 } else {
193                     for (int i = 0; i < k; i++) {
194                         if (!consumer.test(index)) {
195                             return false;
196                         }
197                         // Update index and handle wrapping
198                         index -= inc;
199                         index = index < 0 ? index + bits : index;
200 
201                         // Incorporate the counter into the increment to create a
202                         // tetrahedral number additional term, and handle wrapping.
203                         inc -= i;
204                         inc = inc < 0 ? inc + bits : inc;
205                     }
206                 }
207                 return true;
208             }
209         };
210     }
211 }