1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.collections4.bloomfilter;
18
19 import java.util.Objects;
20 import java.util.function.IntPredicate;
21
22 /**
23 * A Hasher that implements combinatorial hashing as described by
24 * <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch and Mitzenmacher</a> using the enhanced double hashing technique
25 * described in the wikipedia article <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>.
26 * <p>
27 * Common use for this hasher is to generate bit indices from a byte array output of a hashing
28 * or MessageDigest algorithm.</p>
29 *
30 * <h2>Thoughts on the hasher input</h2>
31 *
32 * <p>Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than
33 * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be
34 * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits
35 * changes (but is still larger than the {@code increment}). In a worse case scenario with small {@code initial} and {@code increment} for
36 * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial}
37 * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the
38 * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275, this covers 4.25% of the filter. This also
39 * ignores the negative wrapping but the behavior is the same, some bits cannot be reached.
40 * </p><p>
41 * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger
42 * than the number of bits then the modulus will create a 'random' position and increment within the size.
43 * </p>
44 *
45 * @since 4.5.0-M1
46 */
47 public class EnhancedDoubleHasher implements Hasher {
48
49 /**
50 * Convert bytes to big-endian long filling with zero bytes as necessary.
51 *
52 * @param byteArray the byte array to extract the values from.
53 * @param offset the offset to start extraction from.
54 * @param len the length of the extraction, may be longer than 8.
55 * @return
56 */
57 private static long toLong(final byte[] byteArray, final int offset, final int len) {
58 long val = 0;
59 int shift = Long.SIZE;
60 final int end = offset + Math.min(len, Long.BYTES);
61 for (int i = offset; i < end; i++) {
62 shift -= Byte.SIZE;
63 val |= (long) (byteArray[i] & 0xFF) << shift;
64 }
65 return val;
66 }
67
68 /**
69 * The initial hash value.
70 */
71 private final long initial;
72
73 /**
74 * The value to increment the hash value by.
75 */
76 private final long increment;
77
78 /**
79 * Constructs the EnhancedDoubleHasher from a byte array.
80 * <p>
81 * This method simplifies the conversion from a Digest or hasher algorithm output
82 * to the two values used by the EnhancedDoubleHasher.</p>
83 * <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value.
84 * Excess bytes are ignored.
85 * If there are fewer than 16 bytes the following conversions are made.
86 * </p>
87 * <ol>
88 * <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li>
89 * <li>The bytes allotted are read in big-endian order any byte not populated is set to zero.</li>
90 * </ol>
91 * <p>
92 * This ensures that small arrays generate the largest possible increment and initial values.
93 * </p>
94 *
95 * @param buffer the buffer to extract the longs from.
96 * @throws IllegalArgumentException is buffer length is zero.
97 */
98 public EnhancedDoubleHasher(final byte[] buffer) {
99 if (buffer.length == 0) {
100 throw new IllegalArgumentException("buffer length must be greater than 0");
101 }
102 // divide by 2
103 final int segment = buffer.length / 2;
104 this.initial = toLong(buffer, 0, segment);
105 this.increment = toLong(buffer, segment, buffer.length - segment);
106 }
107
108 /**
109 * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values.
110 *
111 * @param initial The initial value for the hasher.
112 * @param increment The value to increment the hash by on each iteration.
113 */
114 public EnhancedDoubleHasher(final long initial, final long increment) {
115 this.initial = initial;
116 this.increment = increment;
117 }
118
119 /**
120 * Gets the increment value for the hash calculation.
121 *
122 * @return the increment value for the hash calculation.
123 */
124 long getIncrement() {
125 return increment;
126 }
127
128 /**
129 * Gets the initial value for the hash calculation.
130 *
131 * @return the initial value for the hash calculation.
132 */
133 long getInitial() {
134 return initial;
135 }
136
137 @Override
138 public IndexExtractor indices(final Shape shape) {
139 Objects.requireNonNull(shape, "shape");
140
141 return new IndexExtractor() {
142
143 @Override
144 public int[] asIndexArray() {
145 final int[] result = new int[shape.getNumberOfHashFunctions()];
146 final int[] idx = new int[1];
147
148 // This method needs to return duplicate indices
149
150 processIndices(i -> {
151 result[idx[0]++] = i;
152 return true;
153 });
154 return result;
155 }
156
157 @Override
158 public boolean processIndices(final IntPredicate consumer) {
159 Objects.requireNonNull(consumer, "consumer");
160 final int bits = shape.getNumberOfBits();
161 // Enhanced double hashing:
162 // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits
163 // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing
164 //
165 // Essentially this is computing a wrapped modulus from a start point and an
166 // increment and an additional term as a tetrahedral number.
167 // You only need two modulus operations before the loop. Within the loop
168 // the modulus is handled using the sign bit to detect wrapping to ensure:
169 // 0 <= index < bits
170 // 0 <= inc < bits
171 // The final hash is:
172 // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits)
173
174 int index = BitMaps.mod(initial, bits);
175 if (!consumer.test(index)) {
176 return false;
177 }
178 int inc = BitMaps.mod(increment, bits);
179
180 final int k = shape.getNumberOfHashFunctions();
181
182 if (k >= bits) {
183 // the tetraheadral incrementer. We need to ensure that this
184 // number does not exceed bits-1 or we may end up with an index > bits.
185 int tet = 1;
186 for (int i = 1; i < k; i++) {
187 // Update index and handle wrapping
188 index -= inc;
189 index = index < 0 ? index + bits : index;
190 if (!consumer.test(index)) {
191 return false;
192 }
193
194 // Incorporate the counter into the increment to create a
195 // tetrahedral number additional term, and handle wrapping.
196 inc -= tet;
197 inc = inc < 0 ? inc + bits : inc;
198 if (++tet == bits) {
199 tet = 0;
200 }
201 }
202 } else {
203 for (int i = 1; i < k; i++) {
204 // Update index and handle wrapping
205 index -= inc;
206 index = index < 0 ? index + bits : index;
207 if (!consumer.test(index)) {
208 return false;
209 }
210
211 // Incorporate the counter into the increment to create a
212 // tetrahedral number additional term, and handle wrapping.
213 inc -= i;
214 inc = inc < 0 ? inc + bits : inc;
215 }
216
217 }
218 return true;
219 }
220 };
221 }
222 }