001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.collections4.bloomfilter;
018
019/**
020 * The definition of a Bloom filter shape.
021 *
022 * <p> This class contains the values for the filter configuration and is used to
023 * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are
024 * compatible. (i.e. can be compared or merged)</p>
025 *
026 * <h2>Interrelatedness of values</h2>
027 *
028 * <dl>
029 * <dt>Number of Items ({@code n})</dt>
030 * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd>
031 * <dt>Probability of False Positives ({@code p})</dt>
032 * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd>
033 * <dt>Number of Bits ({@code m})</dt>
034 * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd>
035 * <dt>Number of Functions ({@code k})</dt>
036 * <dd>{@code k = round((m / n) * ln(2))}</dd>
037 * </dl>
038 *
039 * <h2>Estimations from cardinality based on shape</h2>
040 *
041 * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p>
042 *
043 * <p>In the calculation below the following values are used:</p>
044 * <ul>
045 * <li>double c = the cardinality of the Bloom filter.</li>
046 * <li>double m = numberOfBits as specified in the shape.</li>
047 * <li>double k = numberOfHashFunctions as specified in the shape.</li>
048 * </ul>
049 *
050 * <h3>Estimate N - n()</h3>
051 *
052 * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}.  This is the calculation
053 * performed by the {@code Shape.estimateN(cardinality)} method below.  This estimate is roughly equivalent to the
054 * number of hashers that have been merged into a filter to create the cardinality specified.</p>
055 *
056 * <p><em>Note:</em></p>
057 * <ul>
058 * <li>if cardinality == numberOfBits, then result is infinity.</li>
059 * <li>if cardinality &gt; numberOfBits, then result is NaN.</li>
060 * </ul>
061 *
062 * <h3>Estimate N of Union - n(A &cup; B)</h3>
063 *
064 * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and
065 * calculate the estimated N from the result.</p>
066 *
067 * <h3>Estimate N of the Intersection - n(A &cap; B)</h3>
068 *
069 * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is:
070 * n(A) + n(b) - n(A &cup; B).</p>
071 *
072 * <p>Care must be taken when any of the n(x) returns infinity.  In general the following assumptions are true:
073 *
074 * <ul>
075 * <li>If n(A) = &infin; and n(B) &lt; &infin; then n(A &cap; B) = n(B)</li>
076 * <li>If n(A) &lt; &infin; and n(B) = &infin; then n(A &cap; B) = n(A)</li>
077 * <li>If n(A) = &infin; and n(B) = &infin; then n(A &cap; B) = &infin;</li>
078 * <li>If n(A) &lt; &infin; and n(B) &lt; &infin; and n(A &cup; B) = &infin; then n(A &cap; B) is undefined.</li>
079 * </ul>
080 *
081 * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a>
082 * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter
083 * [Wikipedia]</a>
084 * @since 4.5
085 */
086public final class Shape {
087
088    /**
089     * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945.
090     */
091    private static final double LN_2 = Math.log(2.0);
092
093    /**
094     * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201.
095     *
096     * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2)
097     */
098    private static final double DENOMINATOR = -LN_2 * LN_2;
099
100    /**
101     * Calculates the number of hash functions given numberOfItems and numberOfBits.
102     * This is a method so that the calculation is consistent across all constructors.
103     *
104     * @param numberOfItems the number of items in the filter.
105     * @param numberOfBits the number of bits in the filter.
106     * @return the optimal number of hash functions.
107     * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1}
108     */
109    private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) {
110        // k = round((m / n) * ln(2)) We change order so that we use real math rather
111        // than integer math.
112        final long k = Math.round(LN_2 * numberOfBits / numberOfItems);
113        if (k < 1) {
114            throw new IllegalArgumentException(
115                    String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k));
116        }
117        // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but
118        // since numberOfBits is at most Integer.MAX_VALUE the numerator of
119        // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the
120        // value of k can not be above Integer.MAX_VALUE.
121        return (int) k;
122    }
123
124    /**
125     * Check the calculated probability is {@code < 1.0}.
126     *
127     * <p>This function is used to verify that the dynamically calculated probability for the
128     * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon
129     * construction.
130     *
131     * @param probability the probability
132     * @throws IllegalArgumentException if the probability is {@code >= 1.0}.
133     */
134    private static void checkCalculatedProbability(final double probability) {
135        // We do not need to check for p <= 0.0 since we only allow positive values for
136        // parameters and the closest we can come to exp(-kn/m) == 1 is
137        // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will
138        // always be 0<x<1 and y>0
139        if (probability >= 1.0) {
140            throw new IllegalArgumentException(
141                    String.format("Calculated probability is greater than or equal to 1: " + probability));
142        }
143    }
144
145    /**
146     * Check number of bits is strictly positive.
147     *
148     * @param numberOfBits the number of bits
149     * @return the number of bits
150     * @throws IllegalArgumentException if the number of bits is {@code < 1}.
151     */
152    private static int checkNumberOfBits(final int numberOfBits) {
153        if (numberOfBits < 1) {
154            throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits);
155        }
156        return numberOfBits;
157    }
158
159    /**
160     * Check number of hash functions is strictly positive.
161     *
162     * @param numberOfHashFunctions the number of hash functions
163     * @return the number of hash functions
164     * @throws IllegalArgumentException if the number of hash functions is {@code < 1}.
165     */
166    private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) {
167        if (numberOfHashFunctions < 1) {
168            throw new IllegalArgumentException(
169                    "Number of hash functions must be greater than 0: " + numberOfHashFunctions);
170        }
171        return numberOfHashFunctions;
172    }
173
174    /**
175     * Check number of items is strictly positive.
176     *
177     * @param numberOfItems the number of items
178     * @return the number of items
179     * @throws IllegalArgumentException if the number of items is {@code < 1}.
180     */
181    private static int checkNumberOfItems(final int numberOfItems) {
182        if (numberOfItems < 1) {
183            throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems);
184        }
185        return numberOfItems;
186    }
187
188    /**
189     * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive.
190     *
191     * @param probability the probability
192     * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)}
193     */
194    private static void checkProbability(final double probability) {
195        // Using the negation of within the desired range will catch NaN
196        if (!(probability > 0.0 && probability < 1.0)) {
197            throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability);
198        }
199    }
200
201    /**
202     * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
203     * bits ({@code m}).
204     *
205     * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
206     * @param numberOfBits The number of bits in the filter
207     * @return a valid Shape.
208     * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
209     */
210    public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) {
211        return new Shape(numberOfHashFunctions, numberOfBits);
212    }
213
214    /**
215     * Constructs a filter configuration with the specified number of items ({@code n}) and
216     * bits ({@code m}).
217     *
218     * <p>The optimal number of hash functions ({@code k}) is computed.
219     * <pre>k = round((m / n) * ln(2))</pre>
220     *
221     * <p>The false-positive probability is computed using the number of items, bits and hash
222     * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
223     * shape is invalid for use as a Bloom filter).
224     *
225     * @param numberOfItems Number of items to be placed in the filter
226     * @param numberOfBits The number of bits in the filter
227     * @return a valid Shape.
228     * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
229     * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0}
230     */
231    public static Shape fromNM(final int numberOfItems, final int numberOfBits) {
232        checkNumberOfItems(numberOfItems);
233        checkNumberOfBits(numberOfBits);
234        final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
235        final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
236        // check that probability is within range
237        checkCalculatedProbability(shape.getProbability(numberOfItems));
238        return shape;
239    }
240
241    /**
242     * Constructs a filter configuration with the specified number of items, bits
243     * and hash functions.
244     *
245     * <p>The false-positive probability is computed using the number of items, bits and hash
246     * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
247     * shape is invalid for use as a Bloom filter).
248     *
249     * @param numberOfItems Number of items to be placed in the filter
250     * @param numberOfBits The number of bits in the filter.
251     * @param numberOfHashFunctions The number of hash functions in the filter
252     * @return a valid Shape.
253     * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
254     * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}.
255     */
256    public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) {
257        checkNumberOfItems(numberOfItems);
258        checkNumberOfBits(numberOfBits);
259        checkNumberOfHashFunctions(numberOfHashFunctions);
260        // check that probability is within range
261        final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
262        // check that probability is within range
263        checkCalculatedProbability(shape.getProbability(numberOfItems));
264        return shape;
265    }
266
267    /**
268     * Constructs a filter configuration with the specified number of items ({@code n}) and
269     * desired false-positive probability ({@code p}).
270     *
271     * <p>The number of bits ({@code m}) for the filter is computed.
272     * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre>
273     *
274     * <p>The optimal number of hash functions ({@code k}) is computed.
275     * <pre>k = round((m / n) * ln(2))</pre>
276     *
277     * <p>The actual probability will be approximately equal to the
278     * desired probability but will be dependent upon the calculated number of bits and hash
279     * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
280     * shape is invalid for use as a Bloom filter).
281     *
282     * @param numberOfItems Number of items to be placed in the filter
283     * @param probability The desired false-positive probability in the range {@code (0, 1)}
284     * @return a valid Shape
285     * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability
286     * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}.
287     */
288    public static Shape fromNP(final int numberOfItems, final double probability) {
289        checkNumberOfItems(numberOfItems);
290        checkProbability(probability);
291
292        // Number of bits (m)
293        final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR);
294        if (m > Integer.MAX_VALUE) {
295            throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m);
296        }
297        final int numberOfBits = (int) m;
298
299        final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
300        final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
301        // check that probability is within range
302        checkCalculatedProbability(shape.getProbability(numberOfItems));
303        return shape;
304    }
305
306    /**
307     * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the
308     * specified number of bits ({@code m}) and hash functions ({@code k}).
309     *
310     * <p>The number of items ({@code n}) to be stored in the filter is computed.
311     * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre>
312     *
313     * <p>The actual probability will be approximately equal to the
314     * desired probability but will be dependent upon the calculated Bloom filter capacity
315     * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the
316     * shape is invalid for use as a Bloom filter).
317     *
318     * @param probability The desired false-positive probability in the range {@code (0, 1)}
319     * @param numberOfBits The number of bits in the filter
320     * @param numberOfHashFunctions The number of hash functions in the filter
321     * @return a valid Shape.
322     * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)},
323     * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual
324     * probability is {@code >= 1.0}
325     */
326    public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) {
327        checkProbability(probability);
328        checkNumberOfBits(numberOfBits);
329        checkNumberOfHashFunctions(numberOfHashFunctions);
330
331        // Number of items (n):
332        // n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
333        final double n = Math.ceil(numberOfBits
334                / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions))));
335
336        // log of probability is always < 0
337        // number of hash functions is >= 1
338        // e^x where x < 0 = [0,1)
339        // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53
340        // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0
341        // ceil( >0 ) >= 1
342        // so we can not produce a negative value thus we don't check for it.
343        //
344        // similarly we can not produce a number greater than numberOfBits so we
345        // do not have to check for Integer.MAX_VALUE either.
346
347        final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
348        // check that probability is within range
349        checkCalculatedProbability(shape.getProbability((int) n));
350        return shape;
351    }
352
353    /**
354     * Number of hash functions to create a filter ({@code k}).
355     */
356    private final int numberOfHashFunctions;
357
358    /**
359     * Number of bits in the filter ({@code m}).
360     */
361    private final int numberOfBits;
362
363    /**
364     * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
365     * bits ({@code m}).
366     *
367     * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
368     * @param numberOfBits The number of bits in the filter
369     * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
370     */
371    private Shape(final int numberOfHashFunctions, final int numberOfBits) {
372        this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions);
373        this.numberOfBits = checkNumberOfBits(numberOfBits);
374    }
375
376    @Override
377    public boolean equals(final Object obj) {
378        // Shape is final so no check for the same class as inheritance is not possible
379        if (obj instanceof Shape) {
380            final Shape other = (Shape) obj;
381            return numberOfBits == other.numberOfBits &&
382                   numberOfHashFunctions == other.numberOfHashFunctions;
383        }
384        return false;
385    }
386
387    /**
388     * Estimates the maximum number of elements that can be merged into a filter of
389     * this shape before the false positive rate exceeds the desired rate. <p> The
390     * formula for deriving {@code k} when {@code m} and {@code n} are known is:
391     *
392     * <p>{@code k = ln2 * m / n}</p>
393     *
394     * <p>Solving for {@code n} yields:</p>
395     *
396     * <p>{@code n = ln2 * m / k}</p>
397     *
398     * @return An estimate of max N.
399     */
400    public double estimateMaxN() {
401        return numberOfBits * LN_2 / numberOfHashFunctions;
402    }
403
404    /**
405     * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled.
406     *
407     * <p><em>Note:</em></p>
408     * <ul>
409     * <li> if cardinality == numberOfBits, then result is infinity.</li>
410     * <li> if cardinality &gt; numberOfBits, then result is NaN.</li>
411     * </ul>
412     *
413     * @param cardinality the number of enabled  bits also known as the hamming value.
414     * @return An estimate of the number of items in the Bloom filter.
415     */
416    public double estimateN(final int cardinality) {
417        final double c = cardinality;
418        final double m = numberOfBits;
419        final double k = numberOfHashFunctions;
420        return -(m / k) * Math.log1p(-c / m);
421    }
422
423    /**
424     * Gets the number of bits in the Bloom filter.
425     * This is also known as {@code m}.
426     *
427     * @return the number of bits in the Bloom filter ({@code m}).
428     */
429    public int getNumberOfBits() {
430        return numberOfBits;
431    }
432
433    /**
434     * Gets the number of hash functions used to construct the filter.
435     * This is also known as {@code k}.
436     *
437     * @return the number of hash functions used to construct the filter ({@code k}).
438     */
439    public int getNumberOfHashFunctions() {
440        return numberOfHashFunctions;
441    }
442
443    /**
444     * Calculates the probability of false positives ({@code p}) given
445     * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}).
446     * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre>
447     *
448     * <p>This is the probability that a Bloom filter will return true for the presence of an item
449     * when it does not contain the item.</p>
450     *
451     * <p>The probability assumes that the Bloom filter is filled with the expected number of
452     * items. If the filter contains fewer items then the actual probability will be lower.
453     * Thus, this returns the worst-case false positive probability for a filter that has not
454     * exceeded its expected number of items.</p>
455     *
456     * @param numberOfItems the number of items hashed into the Bloom filter.
457     * @return the probability of false positives.
458     */
459    public double getProbability(final int numberOfItems) {
460        if (numberOfItems < 0) {
461            throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems);
462        }
463        if (numberOfItems == 0) {
464            return 0;
465        }
466        return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits),
467                numberOfHashFunctions);
468    }
469
470    @Override
471    public int hashCode() {
472        // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions})
473        return (31 + numberOfBits) * 31 + numberOfHashFunctions;
474    }
475
476    /**
477     * Determines if a cardinality is sparse based on the shape.
478     * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory
479     * necessary to store the cardinality as indexes is less than the estimated memory for bit maps,
480     * the cardinality is determined to be {@code sparse}.</p>
481     * @param cardinality the cardinality to check.
482     * @return true if the cardinality is sparse within the shape.
483     */
484    public boolean isSparse(final int cardinality) {
485        /*
486         * Since the size of a bit map is a long and the size of an index is an int,
487         * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly
488         * distributed across the range of possible values, Thus if the cardinality
489         * (number of indexes) is less than or equal to 2*number of bit maps the
490         * cardinality is sparse within the shape.
491         */
492        return cardinality <= BitMap.numberOfBitMaps(getNumberOfBits()) * 2;
493    }
494
495    @Override
496    public String toString() {
497        return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits);
498    }
499}