View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  /**
20   * The definition of a Bloom filter shape.
21   *
22   * <p> This class contains the values for the filter configuration and is used to
23   * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are
24   * compatible. (i.e. can be compared or merged)</p>
25   *
26   * <h2>Interrelatedness of values</h2>
27   *
28   * <dl>
29   * <dt>Number of Items ({@code n})</dt>
30   * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd>
31   * <dt>Probability of False Positives ({@code p})</dt>
32   * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd>
33   * <dt>Number of Bits ({@code m})</dt>
34   * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd>
35   * <dt>Number of Functions ({@code k})</dt>
36   * <dd>{@code k = round((m / n) * ln(2))}</dd>
37   * </dl>
38   *
39   * <h2>Estimations from cardinality based on shape</h2>
40   *
41   * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p>
42   *
43   * <p>In the calculation below the following values are used:</p>
44   * <ul>
45   * <li>double c = the cardinality of the Bloom filter.</li>
46   * <li>double m = numberOfBits as specified in the shape.</li>
47   * <li>double k = numberOfHashFunctions as specified in the shape.</li>
48   * </ul>
49   *
50   * <h3>Estimate N - n()</h3>
51   *
52   * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}.  This is the calculation
53   * performed by the {@code Shape.estimateN(cardinality)} method below.  This estimate is roughly equivalent to the
54   * number of hashers that have been merged into a filter to create the cardinality specified.</p>
55   *
56   * <p><em>Note:</em></p>
57   * <ul>
58   * <li>if cardinality == numberOfBits, then result is infinity.</li>
59   * <li>if cardinality &gt; numberOfBits, then result is NaN.</li>
60   * </ul>
61   *
62   * <h3>Estimate N of Union - n(A &cup; B)</h3>
63   *
64   * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and
65   * calculate the estimated N from the result.</p>
66   *
67   * <h3>Estimate N of the Intersection - n(A &cap; B)</h3>
68   *
69   * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is:
70   * n(A) + n(b) - n(A &cup; B).</p>
71   *
72   * <p>Care must be taken when any of the n(x) returns infinity.  In general the following assumptions are true:
73   *
74   * <ul>
75   * <li>If n(A) = &infin; and n(B) &lt; &infin; then n(A &cap; B) = n(B)</li>
76   * <li>If n(A) &lt; &infin; and n(B) = &infin; then n(A &cap; B) = n(A)</li>
77   * <li>If n(A) = &infin; and n(B) = &infin; then n(A &cap; B) = &infin;</li>
78   * <li>If n(A) &lt; &infin; and n(B) &lt; &infin; and n(A &cup; B) = &infin; then n(A &cap; B) is undefined.</li>
79   * </ul>
80   *
81   * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a>
82   * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter
83   * [Wikipedia]</a>
84   * @since 4.5
85   */
86  public final class Shape {
87  
88      /**
89       * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945.
90       */
91      private static final double LN_2 = Math.log(2.0);
92  
93      /**
94       * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201.
95       *
96       * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2)
97       */
98      private static final double DENOMINATOR = -LN_2 * LN_2;
99  
100     /**
101      * Calculates the number of hash functions given numberOfItems and numberOfBits.
102      * This is a method so that the calculation is consistent across all constructors.
103      *
104      * @param numberOfItems the number of items in the filter.
105      * @param numberOfBits the number of bits in the filter.
106      * @return the optimal number of hash functions.
107      * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1}
108      */
109     private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) {
110         // k = round((m / n) * ln(2)) We change order so that we use real math rather
111         // than integer math.
112         final long k = Math.round(LN_2 * numberOfBits / numberOfItems);
113         if (k < 1) {
114             throw new IllegalArgumentException(
115                     String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k));
116         }
117         // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but
118         // since numberOfBits is at most Integer.MAX_VALUE the numerator of
119         // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the
120         // value of k can not be above Integer.MAX_VALUE.
121         return (int) k;
122     }
123 
124     /**
125      * Check the calculated probability is {@code < 1.0}.
126      *
127      * <p>This function is used to verify that the dynamically calculated probability for the
128      * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon
129      * construction.
130      *
131      * @param probability the probability
132      * @throws IllegalArgumentException if the probability is {@code >= 1.0}.
133      */
134     private static void checkCalculatedProbability(final double probability) {
135         // We do not need to check for p <= 0.0 since we only allow positive values for
136         // parameters and the closest we can come to exp(-kn/m) == 1 is
137         // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will
138         // always be 0<x<1 and y>0
139         if (probability >= 1.0) {
140             throw new IllegalArgumentException(
141                     String.format("Calculated probability is greater than or equal to 1: " + probability));
142         }
143     }
144 
145     /**
146      * Check number of bits is strictly positive.
147      *
148      * @param numberOfBits the number of bits
149      * @return the number of bits
150      * @throws IllegalArgumentException if the number of bits is {@code < 1}.
151      */
152     private static int checkNumberOfBits(final int numberOfBits) {
153         if (numberOfBits < 1) {
154             throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits);
155         }
156         return numberOfBits;
157     }
158 
159     /**
160      * Check number of hash functions is strictly positive.
161      *
162      * @param numberOfHashFunctions the number of hash functions
163      * @return the number of hash functions
164      * @throws IllegalArgumentException if the number of hash functions is {@code < 1}.
165      */
166     private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) {
167         if (numberOfHashFunctions < 1) {
168             throw new IllegalArgumentException(
169                     "Number of hash functions must be greater than 0: " + numberOfHashFunctions);
170         }
171         return numberOfHashFunctions;
172     }
173 
174     /**
175      * Check number of items is strictly positive.
176      *
177      * @param numberOfItems the number of items
178      * @return the number of items
179      * @throws IllegalArgumentException if the number of items is {@code < 1}.
180      */
181     private static int checkNumberOfItems(final int numberOfItems) {
182         if (numberOfItems < 1) {
183             throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems);
184         }
185         return numberOfItems;
186     }
187 
188     /**
189      * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive.
190      *
191      * @param probability the probability
192      * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)}
193      */
194     private static void checkProbability(final double probability) {
195         // Using the negation of within the desired range will catch NaN
196         if (!(probability > 0.0 && probability < 1.0)) {
197             throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability);
198         }
199     }
200 
201     /**
202      * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
203      * bits ({@code m}).
204      *
205      * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
206      * @param numberOfBits The number of bits in the filter
207      * @return a valid Shape.
208      * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
209      */
210     public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) {
211         return new Shape(numberOfHashFunctions, numberOfBits);
212     }
213 
214     /**
215      * Constructs a filter configuration with the specified number of items ({@code n}) and
216      * bits ({@code m}).
217      *
218      * <p>The optimal number of hash functions ({@code k}) is computed.
219      * <pre>k = round((m / n) * ln(2))</pre>
220      *
221      * <p>The false-positive probability is computed using the number of items, bits and hash
222      * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
223      * shape is invalid for use as a Bloom filter).
224      *
225      * @param numberOfItems Number of items to be placed in the filter
226      * @param numberOfBits The number of bits in the filter
227      * @return a valid Shape.
228      * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
229      * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0}
230      */
231     public static Shape fromNM(final int numberOfItems, final int numberOfBits) {
232         checkNumberOfItems(numberOfItems);
233         checkNumberOfBits(numberOfBits);
234         final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
235         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
236         // check that probability is within range
237         checkCalculatedProbability(shape.getProbability(numberOfItems));
238         return shape;
239     }
240 
241     /**
242      * Constructs a filter configuration with the specified number of items, bits
243      * and hash functions.
244      *
245      * <p>The false-positive probability is computed using the number of items, bits and hash
246      * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
247      * shape is invalid for use as a Bloom filter).
248      *
249      * @param numberOfItems Number of items to be placed in the filter
250      * @param numberOfBits The number of bits in the filter.
251      * @param numberOfHashFunctions The number of hash functions in the filter
252      * @return a valid Shape.
253      * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
254      * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}.
255      */
256     public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) {
257         checkNumberOfItems(numberOfItems);
258         checkNumberOfBits(numberOfBits);
259         checkNumberOfHashFunctions(numberOfHashFunctions);
260         // check that probability is within range
261         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
262         // check that probability is within range
263         checkCalculatedProbability(shape.getProbability(numberOfItems));
264         return shape;
265     }
266 
267     /**
268      * Constructs a filter configuration with the specified number of items ({@code n}) and
269      * desired false-positive probability ({@code p}).
270      *
271      * <p>The number of bits ({@code m}) for the filter is computed.
272      * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre>
273      *
274      * <p>The optimal number of hash functions ({@code k}) is computed.
275      * <pre>k = round((m / n) * ln(2))</pre>
276      *
277      * <p>The actual probability will be approximately equal to the
278      * desired probability but will be dependent upon the calculated number of bits and hash
279      * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
280      * shape is invalid for use as a Bloom filter).
281      *
282      * @param numberOfItems Number of items to be placed in the filter
283      * @param probability The desired false-positive probability in the range {@code (0, 1)}
284      * @return a valid Shape
285      * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability
286      * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}.
287      */
288     public static Shape fromNP(final int numberOfItems, final double probability) {
289         checkNumberOfItems(numberOfItems);
290         checkProbability(probability);
291 
292         // Number of bits (m)
293         final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR);
294         if (m > Integer.MAX_VALUE) {
295             throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m);
296         }
297         final int numberOfBits = (int) m;
298 
299         final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
300         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
301         // check that probability is within range
302         checkCalculatedProbability(shape.getProbability(numberOfItems));
303         return shape;
304     }
305 
306     /**
307      * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the
308      * specified number of bits ({@code m}) and hash functions ({@code k}).
309      *
310      * <p>The number of items ({@code n}) to be stored in the filter is computed.
311      * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre>
312      *
313      * <p>The actual probability will be approximately equal to the
314      * desired probability but will be dependent upon the calculated Bloom filter capacity
315      * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the
316      * shape is invalid for use as a Bloom filter).
317      *
318      * @param probability The desired false-positive probability in the range {@code (0, 1)}
319      * @param numberOfBits The number of bits in the filter
320      * @param numberOfHashFunctions The number of hash functions in the filter
321      * @return a valid Shape.
322      * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)},
323      * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual
324      * probability is {@code >= 1.0}
325      */
326     public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) {
327         checkProbability(probability);
328         checkNumberOfBits(numberOfBits);
329         checkNumberOfHashFunctions(numberOfHashFunctions);
330 
331         // Number of items (n):
332         // n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
333         final double n = Math.ceil(numberOfBits
334                 / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions))));
335 
336         // log of probability is always < 0
337         // number of hash functions is >= 1
338         // e^x where x < 0 = [0,1)
339         // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53
340         // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0
341         // ceil( >0 ) >= 1
342         // so we can not produce a negative value thus we don't check for it.
343         //
344         // similarly we can not produce a number greater than numberOfBits so we
345         // do not have to check for Integer.MAX_VALUE either.
346 
347         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
348         // check that probability is within range
349         checkCalculatedProbability(shape.getProbability((int) n));
350         return shape;
351     }
352 
353     /**
354      * Number of hash functions to create a filter ({@code k}).
355      */
356     private final int numberOfHashFunctions;
357 
358     /**
359      * Number of bits in the filter ({@code m}).
360      */
361     private final int numberOfBits;
362 
363     /**
364      * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
365      * bits ({@code m}).
366      *
367      * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
368      * @param numberOfBits The number of bits in the filter
369      * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
370      */
371     private Shape(final int numberOfHashFunctions, final int numberOfBits) {
372         this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions);
373         this.numberOfBits = checkNumberOfBits(numberOfBits);
374     }
375 
376     @Override
377     public boolean equals(final Object obj) {
378         // Shape is final so no check for the same class as inheritance is not possible
379         if (obj instanceof Shape) {
380             final Shape other = (Shape) obj;
381             return numberOfBits == other.numberOfBits &&
382                    numberOfHashFunctions == other.numberOfHashFunctions;
383         }
384         return false;
385     }
386 
387     /**
388      * Estimates the maximum number of elements that can be merged into a filter of
389      * this shape before the false positive rate exceeds the desired rate. <p> The
390      * formula for deriving {@code k} when {@code m} and {@code n} are known is:
391      *
392      * <p>{@code k = ln2 * m / n}</p>
393      *
394      * <p>Solving for {@code n} yields:</p>
395      *
396      * <p>{@code n = ln2 * m / k}</p>
397      *
398      * @return An estimate of max N.
399      */
400     public double estimateMaxN() {
401         return numberOfBits * LN_2 / numberOfHashFunctions;
402     }
403 
404     /**
405      * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled.
406      *
407      * <p><em>Note:</em></p>
408      * <ul>
409      * <li> if cardinality == numberOfBits, then result is infinity.</li>
410      * <li> if cardinality &gt; numberOfBits, then result is NaN.</li>
411      * </ul>
412      *
413      * @param cardinality the number of enabled  bits also known as the hamming value.
414      * @return An estimate of the number of items in the Bloom filter.
415      */
416     public double estimateN(final int cardinality) {
417         final double c = cardinality;
418         final double m = numberOfBits;
419         final double k = numberOfHashFunctions;
420         return -(m / k) * Math.log1p(-c / m);
421     }
422 
423     /**
424      * Gets the number of bits in the Bloom filter.
425      * This is also known as {@code m}.
426      *
427      * @return the number of bits in the Bloom filter ({@code m}).
428      */
429     public int getNumberOfBits() {
430         return numberOfBits;
431     }
432 
433     /**
434      * Gets the number of hash functions used to construct the filter.
435      * This is also known as {@code k}.
436      *
437      * @return the number of hash functions used to construct the filter ({@code k}).
438      */
439     public int getNumberOfHashFunctions() {
440         return numberOfHashFunctions;
441     }
442 
443     /**
444      * Calculates the probability of false positives ({@code p}) given
445      * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}).
446      * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre>
447      *
448      * <p>This is the probability that a Bloom filter will return true for the presence of an item
449      * when it does not contain the item.</p>
450      *
451      * <p>The probability assumes that the Bloom filter is filled with the expected number of
452      * items. If the filter contains fewer items then the actual probability will be lower.
453      * Thus, this returns the worst-case false positive probability for a filter that has not
454      * exceeded its expected number of items.</p>
455      *
456      * @param numberOfItems the number of items hashed into the Bloom filter.
457      * @return the probability of false positives.
458      */
459     public double getProbability(final int numberOfItems) {
460         if (numberOfItems < 0) {
461             throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems);
462         }
463         if (numberOfItems == 0) {
464             return 0;
465         }
466         return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits),
467                 numberOfHashFunctions);
468     }
469 
470     @Override
471     public int hashCode() {
472         // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions})
473         return (31 + numberOfBits) * 31 + numberOfHashFunctions;
474     }
475 
476     /**
477      * Determines if a cardinality is sparse based on the shape.
478      * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory
479      * necessary to store the cardinality as indexes is less than the estimated memory for bit maps,
480      * the cardinality is determined to be {@code sparse}.</p>
481      * @param cardinality the cardinality to check.
482      * @return true if the cardinality is sparse within the shape.
483      */
484     public boolean isSparse(final int cardinality) {
485         /*
486          * Since the size of a bit map is a long and the size of an index is an int,
487          * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly
488          * distributed across the range of possible values, Thus if the cardinality
489          * (number of indexes) is less than or equal to 2*number of bit maps the
490          * cardinality is sparse within the shape.
491          */
492         return cardinality <= BitMap.numberOfBitMaps(getNumberOfBits()) * 2;
493     }
494 
495     @Override
496     public String toString() {
497         return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits);
498     }
499 }