1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.collections4.bloomfilter; 18 19 /** 20 * The definition of a Bloom filter shape. 21 * 22 * <p> This class contains the values for the filter configuration and is used to 23 * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are 24 * compatible. (i.e. can be compared or merged)</p> 25 * 26 * <h2>Interrelatedness of values</h2> 27 * 28 * <dl> 29 * <dt>Number of Items ({@code n})</dt> 30 * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd> 31 * <dt>Probability of False Positives ({@code p})</dt> 32 * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd> 33 * <dt>Number of Bits ({@code m})</dt> 34 * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd> 35 * <dt>Number of Functions ({@code k})</dt> 36 * <dd>{@code k = round((m / n) * ln(2))}</dd> 37 * </dl> 38 * 39 * <h2>Estimations from cardinality based on shape</h2> 40 * 41 * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p> 42 * 43 * <p>In the calculation below the following values are used:</p> 44 * <ul> 45 * <li>double c = the cardinality of the Bloom filter.</li> 46 * <li>double m = numberOfBits as specified in the shape.</li> 47 * <li>double k = numberOfHashFunctions as specified in the shape.</li> 48 * </ul> 49 * 50 * <h3>Estimate N - n()</h3> 51 * 52 * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}. This is the calculation 53 * performed by the {@code Shape.estimateN(cardinality)} method below. This estimate is roughly equivalent to the 54 * number of hashers that have been merged into a filter to create the cardinality specified.</p> 55 * 56 * <p><em>Note:</em></p> 57 * <ul> 58 * <li>if cardinality == numberOfBits, then result is infinity.</li> 59 * <li>if cardinality > numberOfBits, then result is NaN.</li> 60 * </ul> 61 * 62 * <h3>Estimate N of Union - n(A ∪ B)</h3> 63 * 64 * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and 65 * calculate the estimated N from the result.</p> 66 * 67 * <h3>Estimate N of the Intersection - n(A ∩ B)</h3> 68 * 69 * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is: 70 * n(A) + n(b) - n(A ∪ B).</p> 71 * 72 * <p>Care must be taken when any of the n(x) returns infinity. In general the following assumptions are true: 73 * 74 * <ul> 75 * <li>If n(A) = ∞ and n(B) < ∞ then n(A ∩ B) = n(B)</li> 76 * <li>If n(A) < ∞ and n(B) = ∞ then n(A ∩ B) = n(A)</li> 77 * <li>If n(A) = ∞ and n(B) = ∞ then n(A ∩ B) = ∞</li> 78 * <li>If n(A) < ∞ and n(B) < ∞ and n(A ∪ B) = ∞ then n(A ∩ B) is undefined.</li> 79 * </ul> 80 * 81 * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a> 82 * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter 83 * [Wikipedia]</a> 84 * @since 4.5 85 */ 86 public final class Shape { 87 88 /** 89 * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. 90 */ 91 private static final double LN_2 = Math.log(2.0); 92 93 /** 94 * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. 95 * 96 * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) 97 */ 98 private static final double DENOMINATOR = -LN_2 * LN_2; 99 100 /** 101 * Calculates the number of hash functions given numberOfItems and numberOfBits. 102 * This is a method so that the calculation is consistent across all constructors. 103 * 104 * @param numberOfItems the number of items in the filter. 105 * @param numberOfBits the number of bits in the filter. 106 * @return the optimal number of hash functions. 107 * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} 108 */ 109 private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { 110 // k = round((m / n) * ln(2)) We change order so that we use real math rather 111 // than integer math. 112 final long k = Math.round(LN_2 * numberOfBits / numberOfItems); 113 if (k < 1) { 114 throw new IllegalArgumentException( 115 String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); 116 } 117 // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but 118 // since numberOfBits is at most Integer.MAX_VALUE the numerator of 119 // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the 120 // value of k can not be above Integer.MAX_VALUE. 121 return (int) k; 122 } 123 124 /** 125 * Check the calculated probability is {@code < 1.0}. 126 * 127 * <p>This function is used to verify that the dynamically calculated probability for the 128 * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon 129 * construction. 130 * 131 * @param probability the probability 132 * @throws IllegalArgumentException if the probability is {@code >= 1.0}. 133 */ 134 private static void checkCalculatedProbability(final double probability) { 135 // We do not need to check for p <= 0.0 since we only allow positive values for 136 // parameters and the closest we can come to exp(-kn/m) == 1 is 137 // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will 138 // always be 0<x<1 and y>0 139 if (probability >= 1.0) { 140 throw new IllegalArgumentException( 141 String.format("Calculated probability is greater than or equal to 1: " + probability)); 142 } 143 } 144 145 /** 146 * Check number of bits is strictly positive. 147 * 148 * @param numberOfBits the number of bits 149 * @return the number of bits 150 * @throws IllegalArgumentException if the number of bits is {@code < 1}. 151 */ 152 private static int checkNumberOfBits(final int numberOfBits) { 153 if (numberOfBits < 1) { 154 throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); 155 } 156 return numberOfBits; 157 } 158 159 /** 160 * Check number of hash functions is strictly positive. 161 * 162 * @param numberOfHashFunctions the number of hash functions 163 * @return the number of hash functions 164 * @throws IllegalArgumentException if the number of hash functions is {@code < 1}. 165 */ 166 private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { 167 if (numberOfHashFunctions < 1) { 168 throw new IllegalArgumentException( 169 "Number of hash functions must be greater than 0: " + numberOfHashFunctions); 170 } 171 return numberOfHashFunctions; 172 } 173 174 /** 175 * Check number of items is strictly positive. 176 * 177 * @param numberOfItems the number of items 178 * @return the number of items 179 * @throws IllegalArgumentException if the number of items is {@code < 1}. 180 */ 181 private static int checkNumberOfItems(final int numberOfItems) { 182 if (numberOfItems < 1) { 183 throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); 184 } 185 return numberOfItems; 186 } 187 188 /** 189 * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. 190 * 191 * @param probability the probability 192 * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} 193 */ 194 private static void checkProbability(final double probability) { 195 // Using the negation of within the desired range will catch NaN 196 if (!(probability > 0.0 && probability < 1.0)) { 197 throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); 198 } 199 } 200 201 /** 202 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and 203 * bits ({@code m}). 204 * 205 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. 206 * @param numberOfBits The number of bits in the filter 207 * @return a valid Shape. 208 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} 209 */ 210 public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) { 211 return new Shape(numberOfHashFunctions, numberOfBits); 212 } 213 214 /** 215 * Constructs a filter configuration with the specified number of items ({@code n}) and 216 * bits ({@code m}). 217 * 218 * <p>The optimal number of hash functions ({@code k}) is computed. 219 * <pre>k = round((m / n) * ln(2))</pre> 220 * 221 * <p>The false-positive probability is computed using the number of items, bits and hash 222 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 223 * shape is invalid for use as a Bloom filter). 224 * 225 * @param numberOfItems Number of items to be placed in the filter 226 * @param numberOfBits The number of bits in the filter 227 * @return a valid Shape. 228 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, 229 * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} 230 */ 231 public static Shape fromNM(final int numberOfItems, final int numberOfBits) { 232 checkNumberOfItems(numberOfItems); 233 checkNumberOfBits(numberOfBits); 234 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); 235 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 236 // check that probability is within range 237 checkCalculatedProbability(shape.getProbability(numberOfItems)); 238 return shape; 239 } 240 241 /** 242 * Constructs a filter configuration with the specified number of items, bits 243 * and hash functions. 244 * 245 * <p>The false-positive probability is computed using the number of items, bits and hash 246 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 247 * shape is invalid for use as a Bloom filter). 248 * 249 * @param numberOfItems Number of items to be placed in the filter 250 * @param numberOfBits The number of bits in the filter. 251 * @param numberOfHashFunctions The number of hash functions in the filter 252 * @return a valid Shape. 253 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, 254 * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. 255 */ 256 public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { 257 checkNumberOfItems(numberOfItems); 258 checkNumberOfBits(numberOfBits); 259 checkNumberOfHashFunctions(numberOfHashFunctions); 260 // check that probability is within range 261 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 262 // check that probability is within range 263 checkCalculatedProbability(shape.getProbability(numberOfItems)); 264 return shape; 265 } 266 267 /** 268 * Constructs a filter configuration with the specified number of items ({@code n}) and 269 * desired false-positive probability ({@code p}). 270 * 271 * <p>The number of bits ({@code m}) for the filter is computed. 272 * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre> 273 * 274 * <p>The optimal number of hash functions ({@code k}) is computed. 275 * <pre>k = round((m / n) * ln(2))</pre> 276 * 277 * <p>The actual probability will be approximately equal to the 278 * desired probability but will be dependent upon the calculated number of bits and hash 279 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 280 * shape is invalid for use as a Bloom filter). 281 * 282 * @param numberOfItems Number of items to be placed in the filter 283 * @param probability The desired false-positive probability in the range {@code (0, 1)} 284 * @return a valid Shape 285 * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability 286 * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. 287 */ 288 public static Shape fromNP(final int numberOfItems, final double probability) { 289 checkNumberOfItems(numberOfItems); 290 checkProbability(probability); 291 292 // Number of bits (m) 293 final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); 294 if (m > Integer.MAX_VALUE) { 295 throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); 296 } 297 final int numberOfBits = (int) m; 298 299 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); 300 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 301 // check that probability is within range 302 checkCalculatedProbability(shape.getProbability(numberOfItems)); 303 return shape; 304 } 305 306 /** 307 * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the 308 * specified number of bits ({@code m}) and hash functions ({@code k}). 309 * 310 * <p>The number of items ({@code n}) to be stored in the filter is computed. 311 * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre> 312 * 313 * <p>The actual probability will be approximately equal to the 314 * desired probability but will be dependent upon the calculated Bloom filter capacity 315 * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the 316 * shape is invalid for use as a Bloom filter). 317 * 318 * @param probability The desired false-positive probability in the range {@code (0, 1)} 319 * @param numberOfBits The number of bits in the filter 320 * @param numberOfHashFunctions The number of hash functions in the filter 321 * @return a valid Shape. 322 * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, 323 * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual 324 * probability is {@code >= 1.0} 325 */ 326 public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { 327 checkProbability(probability); 328 checkNumberOfBits(numberOfBits); 329 checkNumberOfHashFunctions(numberOfHashFunctions); 330 331 // Number of items (n): 332 // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) 333 final double n = Math.ceil(numberOfBits 334 / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions)))); 335 336 // log of probability is always < 0 337 // number of hash functions is >= 1 338 // e^x where x < 0 = [0,1) 339 // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 340 // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 341 // ceil( >0 ) >= 1 342 // so we can not produce a negative value thus we don't check for it. 343 // 344 // similarly we can not produce a number greater than numberOfBits so we 345 // do not have to check for Integer.MAX_VALUE either. 346 347 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 348 // check that probability is within range 349 checkCalculatedProbability(shape.getProbability((int) n)); 350 return shape; 351 } 352 353 /** 354 * Number of hash functions to create a filter ({@code k}). 355 */ 356 private final int numberOfHashFunctions; 357 358 /** 359 * Number of bits in the filter ({@code m}). 360 */ 361 private final int numberOfBits; 362 363 /** 364 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and 365 * bits ({@code m}). 366 * 367 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. 368 * @param numberOfBits The number of bits in the filter 369 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} 370 */ 371 private Shape(final int numberOfHashFunctions, final int numberOfBits) { 372 this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); 373 this.numberOfBits = checkNumberOfBits(numberOfBits); 374 } 375 376 @Override 377 public boolean equals(final Object obj) { 378 // Shape is final so no check for the same class as inheritance is not possible 379 if (obj instanceof Shape) { 380 final Shape other = (Shape) obj; 381 return numberOfBits == other.numberOfBits && 382 numberOfHashFunctions == other.numberOfHashFunctions; 383 } 384 return false; 385 } 386 387 /** 388 * Estimates the maximum number of elements that can be merged into a filter of 389 * this shape before the false positive rate exceeds the desired rate. <p> The 390 * formula for deriving {@code k} when {@code m} and {@code n} are known is: 391 * 392 * <p>{@code k = ln2 * m / n}</p> 393 * 394 * <p>Solving for {@code n} yields:</p> 395 * 396 * <p>{@code n = ln2 * m / k}</p> 397 * 398 * @return An estimate of max N. 399 */ 400 public double estimateMaxN() { 401 return numberOfBits * LN_2 / numberOfHashFunctions; 402 } 403 404 /** 405 * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. 406 * 407 * <p><em>Note:</em></p> 408 * <ul> 409 * <li> if cardinality == numberOfBits, then result is infinity.</li> 410 * <li> if cardinality > numberOfBits, then result is NaN.</li> 411 * </ul> 412 * 413 * @param cardinality the number of enabled bits also known as the hamming value. 414 * @return An estimate of the number of items in the Bloom filter. 415 */ 416 public double estimateN(final int cardinality) { 417 final double c = cardinality; 418 final double m = numberOfBits; 419 final double k = numberOfHashFunctions; 420 return -(m / k) * Math.log1p(-c / m); 421 } 422 423 /** 424 * Gets the number of bits in the Bloom filter. 425 * This is also known as {@code m}. 426 * 427 * @return the number of bits in the Bloom filter ({@code m}). 428 */ 429 public int getNumberOfBits() { 430 return numberOfBits; 431 } 432 433 /** 434 * Gets the number of hash functions used to construct the filter. 435 * This is also known as {@code k}. 436 * 437 * @return the number of hash functions used to construct the filter ({@code k}). 438 */ 439 public int getNumberOfHashFunctions() { 440 return numberOfHashFunctions; 441 } 442 443 /** 444 * Calculates the probability of false positives ({@code p}) given 445 * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). 446 * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre> 447 * 448 * <p>This is the probability that a Bloom filter will return true for the presence of an item 449 * when it does not contain the item.</p> 450 * 451 * <p>The probability assumes that the Bloom filter is filled with the expected number of 452 * items. If the filter contains fewer items then the actual probability will be lower. 453 * Thus, this returns the worst-case false positive probability for a filter that has not 454 * exceeded its expected number of items.</p> 455 * 456 * @param numberOfItems the number of items hashed into the Bloom filter. 457 * @return the probability of false positives. 458 */ 459 public double getProbability(final int numberOfItems) { 460 if (numberOfItems < 0) { 461 throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); 462 } 463 if (numberOfItems == 0) { 464 return 0; 465 } 466 return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), 467 numberOfHashFunctions); 468 } 469 470 @Override 471 public int hashCode() { 472 // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions}) 473 return (31 + numberOfBits) * 31 + numberOfHashFunctions; 474 } 475 476 /** 477 * Determines if a cardinality is sparse based on the shape. 478 * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory 479 * necessary to store the cardinality as indexes is less than the estimated memory for bit maps, 480 * the cardinality is determined to be {@code sparse}.</p> 481 * @param cardinality the cardinality to check. 482 * @return true if the cardinality is sparse within the shape. 483 */ 484 public boolean isSparse(final int cardinality) { 485 /* 486 * Since the size of a bit map is a long and the size of an index is an int, 487 * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly 488 * distributed across the range of possible values, Thus if the cardinality 489 * (number of indexes) is less than or equal to 2*number of bit maps the 490 * cardinality is sparse within the shape. 491 */ 492 return cardinality <= BitMap.numberOfBitMaps(getNumberOfBits()) * 2; 493 } 494 495 @Override 496 public String toString() { 497 return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits); 498 } 499 }