1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.collections4.bloomfilter;
18
19 /**
20 * The definition of a Bloom filter shape.
21 *
22 * <p>This class contains the values for the filter configuration and is used to
23 * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are
24 * compatible. (i.e. can be compared or merged)</p>
25 *
26 * <h2>Interrelatedness of values</h2>
27 *
28 * <dl>
29 * <dt>Number of Items ({@code n})</dt>
30 * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd>
31 * <dt>Probability of False Positives ({@code p})</dt>
32 * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd>
33 * <dt>Number of Bits ({@code m})</dt>
34 * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd>
35 * <dt>Number of Functions ({@code k})</dt>
36 * <dd>{@code k = round((m / n) * ln(2))}</dd>
37 * </dl>
38 *
39 * <h2>Estimations from cardinality based on shape</h2>
40 *
41 * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p>
42 *
43 * <p>In the calculation below the following values are used:</p>
44 * <ul>
45 * <li>double c = the cardinality of the Bloom filter.</li>
46 * <li>double m = numberOfBits as specified in the shape.</li>
47 * <li>double k = numberOfHashFunctions as specified in the shape.</li>
48 * </ul>
49 *
50 * <h3>Estimate N - n()</h3>
51 *
52 * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}. This is the calculation
53 * performed by the {@code Shape.estimateN(cardinality)} method below. This estimate is roughly equivalent to the
54 * number of hashers that have been merged into a filter to create the cardinality specified.</p>
55 *
56 * <p><em>Note:</em></p>
57 * <ul>
58 * <li>if cardinality == numberOfBits, then result is infinity.</li>
59 * <li>if cardinality > numberOfBits, then result is NaN.</li>
60 * </ul>
61 *
62 * <h3>Estimate N of Union - n(A ∪ B)</h3>
63 *
64 * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and
65 * calculate the estimated N from the result.</p>
66 *
67 * <h3>Estimate N of the Intersection - n(A ∩ B)</h3>
68 *
69 * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is:
70 * n(A) + n(b) - n(A ∪ B).</p>
71 *
72 * <p>Care must be taken when any of the n(x) returns infinity. In general the following assumptions are true:
73 *
74 * <ul>
75 * <li>If n(A) = ∞ and n(B) < ∞ then n(A ∩ B) = n(B)</li>
76 * <li>If n(A) < ∞ and n(B) = ∞ then n(A ∩ B) = n(A)</li>
77 * <li>If n(A) = ∞ and n(B) = ∞ then n(A ∩ B) = ∞</li>
78 * <li>If n(A) < ∞ and n(B) < ∞ and n(A ∪ B) = ∞ then n(A ∩ B) is undefined.</li>
79 * </ul>
80 *
81 * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a>
82 * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter
83 * [Wikipedia]</a>
84 * @since 4.5.0-M1
85 */
86 public final class Shape {
87
88 /**
89 * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945.
90 */
91 private static final double LN_2 = Math.log(2.0);
92
93 /**
94 * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201.
95 *
96 * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2)</p>
97 */
98 private static final double DENOMINATOR = -LN_2 * LN_2;
99
100 /**
101 * Calculates the number of hash functions given numberOfItems and numberOfBits.
102 * This is a method so that the calculation is consistent across all constructors.
103 *
104 * @param numberOfItems the number of items in the filter.
105 * @param numberOfBits the number of bits in the filter.
106 * @return the optimal number of hash functions.
107 * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1}
108 */
109 private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) {
110 // k = round((m / n) * ln(2)) We change order so that we use real math rather
111 // than integer math.
112 final long k = Math.round(LN_2 * numberOfBits / numberOfItems);
113 if (k < 1) {
114 throw new IllegalArgumentException(String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k));
115 }
116 // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but
117 // since numberOfBits is at most Integer.MAX_VALUE the numerator of
118 // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the
119 // value of k cannot be above Integer.MAX_VALUE.
120 return (int) k;
121 }
122
123 /**
124 * Checks the calculated probability is {@code < 1.0}.
125 *
126 * <p>
127 * This function is used to verify that the dynamically calculated probability for the Shape is in the valid range 0 to 1 exclusive. This need only be
128 * performed once upon construction.
129 * </p>
130 *
131 * @param probability the probability
132 * @throws IllegalArgumentException if the probability is {@code >= 1.0}.
133 */
134 private static void checkCalculatedProbability(final double probability) {
135 // We do not need to check for p <= 0.0 since we only allow positive values for
136 // parameters and the closest we can come to exp(-kn/m) == 1 is
137 // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow(x, y) will
138 // always be 0<x<1 and y>0
139 if (probability >= 1.0) {
140 throw new IllegalArgumentException("Calculated probability is greater than or equal to 1: " + probability);
141 }
142 }
143
144 /**
145 * Checks number of bits is strictly positive.
146 *
147 * @param numberOfBits the number of bits
148 * @return the number of bits
149 * @throws IllegalArgumentException if the number of bits is {@code < 1}.
150 */
151 private static int checkNumberOfBits(final int numberOfBits) {
152 if (numberOfBits < 1) {
153 throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits);
154 }
155 return numberOfBits;
156 }
157
158 /**
159 * Checks number of hash functions is strictly positive.
160 *
161 * @param numberOfHashFunctions the number of hash functions
162 * @return the number of hash functions
163 * @throws IllegalArgumentException if the number of hash functions is {@code < 1}.
164 */
165 private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) {
166 if (numberOfHashFunctions < 1) {
167 throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions);
168 }
169 return numberOfHashFunctions;
170 }
171
172 /**
173 * Checks number of items is strictly positive.
174 *
175 * @param numberOfItems the number of items
176 * @return the number of items
177 * @throws IllegalArgumentException if the number of items is {@code < 1}.
178 */
179 private static int checkNumberOfItems(final int numberOfItems) {
180 if (numberOfItems < 1) {
181 throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems);
182 }
183 return numberOfItems;
184 }
185
186 /**
187 * Checks the probability is in the range 0.0, exclusive, to 1.0, exclusive.
188 *
189 * @param probability the probability
190 * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)}
191 */
192 private static void checkProbability(final double probability) {
193 // Using the negation of within the desired range will catch NaN
194 if (!(probability > 0.0 && probability < 1.0)) {
195 throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability);
196 }
197 }
198
199 /**
200 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
201 * bits ({@code m}).
202 *
203 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
204 * @param numberOfBits The number of bits in the filter
205 * @return a valid Shape.
206 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
207 */
208 public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) {
209 return new Shape(numberOfHashFunctions, numberOfBits);
210 }
211
212 /**
213 * Constructs a filter configuration with the specified number of items ({@code n}) and
214 * bits ({@code m}).
215 *
216 * <p>The optimal number of hash functions ({@code k}) is computed.
217 * <pre>k = round((m / n) * ln(2))</pre>
218 *
219 * <p>The false-positive probability is computed using the number of items, bits and hash
220 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
221 * shape is invalid for use as a Bloom filter).
222 *
223 * @param numberOfItems Number of items to be placed in the filter
224 * @param numberOfBits The number of bits in the filter
225 * @return a valid Shape.
226 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
227 * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0}
228 */
229 public static Shape fromNM(final int numberOfItems, final int numberOfBits) {
230 checkNumberOfItems(numberOfItems);
231 checkNumberOfBits(numberOfBits);
232 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
233 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
234 // check that probability is within range
235 checkCalculatedProbability(shape.getProbability(numberOfItems));
236 return shape;
237 }
238
239 /**
240 * Constructs a filter configuration with the specified number of items, bits
241 * and hash functions.
242 *
243 * <p>The false-positive probability is computed using the number of items, bits and hash
244 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
245 * shape is invalid for use as a Bloom filter).
246 *
247 * @param numberOfItems Number of items to be placed in the filter
248 * @param numberOfBits The number of bits in the filter.
249 * @param numberOfHashFunctions The number of hash functions in the filter
250 * @return a valid Shape.
251 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
252 * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}.
253 */
254 public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) {
255 checkNumberOfItems(numberOfItems);
256 checkNumberOfBits(numberOfBits);
257 checkNumberOfHashFunctions(numberOfHashFunctions);
258 // check that probability is within range
259 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
260 // check that probability is within range
261 checkCalculatedProbability(shape.getProbability(numberOfItems));
262 return shape;
263 }
264
265 /**
266 * Constructs a filter configuration with the specified number of items ({@code n}) and
267 * desired false-positive probability ({@code p}).
268 *
269 * <p>The number of bits ({@code m}) for the filter is computed.
270 * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre>
271 *
272 * <p>The optimal number of hash functions ({@code k}) is computed.
273 * <pre>k = round((m / n) * ln(2))</pre>
274 *
275 * <p>The actual probability will be approximately equal to the
276 * desired probability but will be dependent upon the calculated number of bits and hash
277 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
278 * shape is invalid for use as a Bloom filter).
279 *
280 * @param numberOfItems Number of items to be placed in the filter
281 * @param probability The desired false-positive probability in the range {@code (0, 1)}
282 * @return a valid Shape
283 * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability
284 * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}.
285 */
286 public static Shape fromNP(final int numberOfItems, final double probability) {
287 checkNumberOfItems(numberOfItems);
288 checkProbability(probability);
289
290 // Number of bits (m)
291 final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR);
292 if (m > Integer.MAX_VALUE) {
293 throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m);
294 }
295 final int numberOfBits = (int) m;
296
297 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
298 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
299 // check that probability is within range
300 checkCalculatedProbability(shape.getProbability(numberOfItems));
301 return shape;
302 }
303
304 /**
305 * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the
306 * specified number of bits ({@code m}) and hash functions ({@code k}).
307 *
308 * <p>The number of items ({@code n}) to be stored in the filter is computed.
309 * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre>
310 *
311 * <p>The actual probability will be approximately equal to the
312 * desired probability but will be dependent upon the calculated Bloom filter capacity
313 * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the
314 * shape is invalid for use as a Bloom filter).
315 *
316 * @param probability The desired false-positive probability in the range {@code (0, 1)}
317 * @param numberOfBits The number of bits in the filter
318 * @param numberOfHashFunctions The number of hash functions in the filter
319 * @return a valid Shape.
320 * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)},
321 * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual
322 * probability is {@code >= 1.0}
323 */
324 public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) {
325 checkProbability(probability);
326 checkNumberOfBits(numberOfBits);
327 checkNumberOfHashFunctions(numberOfHashFunctions);
328
329 // Number of items (n):
330 // n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
331 final double n = Math.ceil(numberOfBits / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions))));
332
333 // log of probability is always < 0
334 // number of hash functions is >= 1
335 // e^x where x < 0 = [0,1)
336 // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53
337 // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0
338 // ceil( >0 ) >= 1
339 // so we cannot produce a negative value thus we don't check for it.
340 //
341 // similarly we cannot produce a number greater than numberOfBits so we
342 // do not have to check for Integer.MAX_VALUE either.
343
344 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
345 // check that probability is within range
346 checkCalculatedProbability(shape.getProbability((int) n));
347 return shape;
348 }
349
350 /**
351 * Number of hash functions to create a filter ({@code k}).
352 */
353 private final int numberOfHashFunctions;
354
355 /**
356 * Number of bits in the filter ({@code m}).
357 */
358 private final int numberOfBits;
359
360 /**
361 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
362 * bits ({@code m}).
363 *
364 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
365 * @param numberOfBits The number of bits in the filter
366 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
367 */
368 private Shape(final int numberOfHashFunctions, final int numberOfBits) {
369 this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions);
370 this.numberOfBits = checkNumberOfBits(numberOfBits);
371 }
372
373 @Override
374 public boolean equals(final Object obj) {
375 // Shape is final so no check for the same class as inheritance is not possible
376 if (obj instanceof Shape) {
377 final Shape other = (Shape) obj;
378 return numberOfBits == other.numberOfBits && numberOfHashFunctions == other.numberOfHashFunctions;
379 }
380 return false;
381 }
382
383 /**
384 * Estimates the maximum number of elements that can be merged into a filter of
385 * this shape before the false positive rate exceeds the desired rate. <p> The
386 * formula for deriving {@code k} when {@code m} and {@code n} are known is:
387 *
388 * <p>{@code k = ln2 * m / n}</p>
389 *
390 * <p>Solving for {@code n} yields:</p>
391 *
392 * <p>{@code n = ln2 * m / k}</p>
393 *
394 * @return An estimate of max N.
395 */
396 public double estimateMaxN() {
397 return numberOfBits * LN_2 / numberOfHashFunctions;
398 }
399
400 /**
401 * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled.
402 *
403 * <p><em>Note:</em></p>
404 * <ul>
405 * <li> if cardinality == numberOfBits, then result is infinity.</li>
406 * <li> if cardinality > numberOfBits, then result is NaN.</li>
407 * </ul>
408 *
409 * @param cardinality the number of enabled bits also known as the hamming value.
410 * @return An estimate of the number of items in the Bloom filter.
411 */
412 public double estimateN(final int cardinality) {
413 final double c = cardinality;
414 final double m = numberOfBits;
415 final double k = numberOfHashFunctions;
416 return -(m / k) * Math.log1p(-c / m);
417 }
418
419 /**
420 * Gets the number of bits in the Bloom filter.
421 * This is also known as {@code m}.
422 *
423 * @return the number of bits in the Bloom filter ({@code m}).
424 */
425 public int getNumberOfBits() {
426 return numberOfBits;
427 }
428
429 /**
430 * Gets the number of hash functions used to construct the filter.
431 * This is also known as {@code k}.
432 *
433 * @return the number of hash functions used to construct the filter ({@code k}).
434 */
435 public int getNumberOfHashFunctions() {
436 return numberOfHashFunctions;
437 }
438
439 /**
440 * Calculates the probability of false positives ({@code p}) given
441 * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}).
442 * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre>
443 *
444 * <p>This is the probability that a Bloom filter will return true for the presence of an item
445 * when it does not contain the item.</p>
446 *
447 * <p>The probability assumes that the Bloom filter is filled with the expected number of
448 * items. If the filter contains fewer items then the actual probability will be lower.
449 * Thus, this returns the worst-case false positive probability for a filter that has not
450 * exceeded its expected number of items.</p>
451 *
452 * @param numberOfItems the number of items hashed into the Bloom filter.
453 * @return the probability of false positives.
454 */
455 public double getProbability(final int numberOfItems) {
456 if (numberOfItems < 0) {
457 throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems);
458 }
459 if (numberOfItems == 0) {
460 return 0;
461 }
462 return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), numberOfHashFunctions);
463 }
464
465 @Override
466 public int hashCode() {
467 // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions})
468 return (31 + numberOfBits) * 31 + numberOfHashFunctions;
469 }
470
471 /**
472 * Determines if a cardinality is sparse based on the shape.
473 * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory
474 * necessary to store the cardinality as indexes is less than the estimated memory for bit maps,
475 * the cardinality is determined to be {@code sparse}.</p>
476 *
477 * @param cardinality the cardinality to check.
478 * @return true if the cardinality is sparse within the shape.
479 */
480 public boolean isSparse(final int cardinality) {
481
482 /*
483 * Since the size of a bit map is a long and the size of an index is an int,
484 * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly
485 * distributed across the range of possible values, Thus if the cardinality
486 * (number of indexes) is less than or equal to 2*number of bit maps the
487 * cardinality is sparse within the shape.
488 */
489 return cardinality <= BitMaps.numberOfBitMaps(this) * 2;
490 }
491
492 @Override
493 public String toString() {
494 return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits);
495 }
496 }