1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.collections4.bloomfilter; 18 19 /** 20 * The definition of a Bloom filter shape. 21 * 22 * <p>This class contains the values for the filter configuration and is used to 23 * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are 24 * compatible. (i.e. can be compared or merged)</p> 25 * 26 * <h2>Interrelatedness of values</h2> 27 * 28 * <dl> 29 * <dt>Number of Items ({@code n})</dt> 30 * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd> 31 * <dt>Probability of False Positives ({@code p})</dt> 32 * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd> 33 * <dt>Number of Bits ({@code m})</dt> 34 * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd> 35 * <dt>Number of Functions ({@code k})</dt> 36 * <dd>{@code k = round((m / n) * ln(2))}</dd> 37 * </dl> 38 * 39 * <h2>Estimations from cardinality based on shape</h2> 40 * 41 * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p> 42 * 43 * <p>In the calculation below the following values are used:</p> 44 * <ul> 45 * <li>double c = the cardinality of the Bloom filter.</li> 46 * <li>double m = numberOfBits as specified in the shape.</li> 47 * <li>double k = numberOfHashFunctions as specified in the shape.</li> 48 * </ul> 49 * 50 * <h3>Estimate N - n()</h3> 51 * 52 * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}. This is the calculation 53 * performed by the {@code Shape.estimateN(cardinality)} method below. This estimate is roughly equivalent to the 54 * number of hashers that have been merged into a filter to create the cardinality specified.</p> 55 * 56 * <p><em>Note:</em></p> 57 * <ul> 58 * <li>if cardinality == numberOfBits, then result is infinity.</li> 59 * <li>if cardinality > numberOfBits, then result is NaN.</li> 60 * </ul> 61 * 62 * <h3>Estimate N of Union - n(A ∪ B)</h3> 63 * 64 * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and 65 * calculate the estimated N from the result.</p> 66 * 67 * <h3>Estimate N of the Intersection - n(A ∩ B)</h3> 68 * 69 * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is: 70 * n(A) + n(b) - n(A ∪ B).</p> 71 * 72 * <p>Care must be taken when any of the n(x) returns infinity. In general the following assumptions are true: 73 * 74 * <ul> 75 * <li>If n(A) = ∞ and n(B) < ∞ then n(A ∩ B) = n(B)</li> 76 * <li>If n(A) < ∞ and n(B) = ∞ then n(A ∩ B) = n(A)</li> 77 * <li>If n(A) = ∞ and n(B) = ∞ then n(A ∩ B) = ∞</li> 78 * <li>If n(A) < ∞ and n(B) < ∞ and n(A ∪ B) = ∞ then n(A ∩ B) is undefined.</li> 79 * </ul> 80 * 81 * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a> 82 * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter 83 * [Wikipedia]</a> 84 * @since 4.5.0-M1 85 */ 86 public final class Shape { 87 88 /** 89 * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. 90 */ 91 private static final double LN_2 = Math.log(2.0); 92 93 /** 94 * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. 95 * 96 * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2)</p> 97 */ 98 private static final double DENOMINATOR = -LN_2 * LN_2; 99 100 /** 101 * Calculates the number of hash functions given numberOfItems and numberOfBits. 102 * This is a method so that the calculation is consistent across all constructors. 103 * 104 * @param numberOfItems the number of items in the filter. 105 * @param numberOfBits the number of bits in the filter. 106 * @return the optimal number of hash functions. 107 * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} 108 */ 109 private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { 110 // k = round((m / n) * ln(2)) We change order so that we use real math rather 111 // than integer math. 112 final long k = Math.round(LN_2 * numberOfBits / numberOfItems); 113 if (k < 1) { 114 throw new IllegalArgumentException(String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); 115 } 116 // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but 117 // since numberOfBits is at most Integer.MAX_VALUE the numerator of 118 // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the 119 // value of k cannot be above Integer.MAX_VALUE. 120 return (int) k; 121 } 122 123 /** 124 * Checks the calculated probability is {@code < 1.0}. 125 * 126 * <p> 127 * This function is used to verify that the dynamically calculated probability for the Shape is in the valid range 0 to 1 exclusive. This need only be 128 * performed once upon construction. 129 * </p> 130 * 131 * @param probability the probability 132 * @throws IllegalArgumentException if the probability is {@code >= 1.0}. 133 */ 134 private static void checkCalculatedProbability(final double probability) { 135 // We do not need to check for p <= 0.0 since we only allow positive values for 136 // parameters and the closest we can come to exp(-kn/m) == 1 is 137 // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow(x, y) will 138 // always be 0<x<1 and y>0 139 if (probability >= 1.0) { 140 throw new IllegalArgumentException("Calculated probability is greater than or equal to 1: " + probability); 141 } 142 } 143 144 /** 145 * Checks number of bits is strictly positive. 146 * 147 * @param numberOfBits the number of bits 148 * @return the number of bits 149 * @throws IllegalArgumentException if the number of bits is {@code < 1}. 150 */ 151 private static int checkNumberOfBits(final int numberOfBits) { 152 if (numberOfBits < 1) { 153 throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); 154 } 155 return numberOfBits; 156 } 157 158 /** 159 * Checks number of hash functions is strictly positive. 160 * 161 * @param numberOfHashFunctions the number of hash functions 162 * @return the number of hash functions 163 * @throws IllegalArgumentException if the number of hash functions is {@code < 1}. 164 */ 165 private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { 166 if (numberOfHashFunctions < 1) { 167 throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); 168 } 169 return numberOfHashFunctions; 170 } 171 172 /** 173 * Checks number of items is strictly positive. 174 * 175 * @param numberOfItems the number of items 176 * @return the number of items 177 * @throws IllegalArgumentException if the number of items is {@code < 1}. 178 */ 179 private static int checkNumberOfItems(final int numberOfItems) { 180 if (numberOfItems < 1) { 181 throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); 182 } 183 return numberOfItems; 184 } 185 186 /** 187 * Checks the probability is in the range 0.0, exclusive, to 1.0, exclusive. 188 * 189 * @param probability the probability 190 * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} 191 */ 192 private static void checkProbability(final double probability) { 193 // Using the negation of within the desired range will catch NaN 194 if (!(probability > 0.0 && probability < 1.0)) { 195 throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); 196 } 197 } 198 199 /** 200 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and 201 * bits ({@code m}). 202 * 203 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. 204 * @param numberOfBits The number of bits in the filter 205 * @return a valid Shape. 206 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} 207 */ 208 public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) { 209 return new Shape(numberOfHashFunctions, numberOfBits); 210 } 211 212 /** 213 * Constructs a filter configuration with the specified number of items ({@code n}) and 214 * bits ({@code m}). 215 * 216 * <p>The optimal number of hash functions ({@code k}) is computed. 217 * <pre>k = round((m / n) * ln(2))</pre> 218 * 219 * <p>The false-positive probability is computed using the number of items, bits and hash 220 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 221 * shape is invalid for use as a Bloom filter). 222 * 223 * @param numberOfItems Number of items to be placed in the filter 224 * @param numberOfBits The number of bits in the filter 225 * @return a valid Shape. 226 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, 227 * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} 228 */ 229 public static Shape fromNM(final int numberOfItems, final int numberOfBits) { 230 checkNumberOfItems(numberOfItems); 231 checkNumberOfBits(numberOfBits); 232 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); 233 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 234 // check that probability is within range 235 checkCalculatedProbability(shape.getProbability(numberOfItems)); 236 return shape; 237 } 238 239 /** 240 * Constructs a filter configuration with the specified number of items, bits 241 * and hash functions. 242 * 243 * <p>The false-positive probability is computed using the number of items, bits and hash 244 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 245 * shape is invalid for use as a Bloom filter). 246 * 247 * @param numberOfItems Number of items to be placed in the filter 248 * @param numberOfBits The number of bits in the filter. 249 * @param numberOfHashFunctions The number of hash functions in the filter 250 * @return a valid Shape. 251 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, 252 * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. 253 */ 254 public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { 255 checkNumberOfItems(numberOfItems); 256 checkNumberOfBits(numberOfBits); 257 checkNumberOfHashFunctions(numberOfHashFunctions); 258 // check that probability is within range 259 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 260 // check that probability is within range 261 checkCalculatedProbability(shape.getProbability(numberOfItems)); 262 return shape; 263 } 264 265 /** 266 * Constructs a filter configuration with the specified number of items ({@code n}) and 267 * desired false-positive probability ({@code p}). 268 * 269 * <p>The number of bits ({@code m}) for the filter is computed. 270 * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre> 271 * 272 * <p>The optimal number of hash functions ({@code k}) is computed. 273 * <pre>k = round((m / n) * ln(2))</pre> 274 * 275 * <p>The actual probability will be approximately equal to the 276 * desired probability but will be dependent upon the calculated number of bits and hash 277 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 278 * shape is invalid for use as a Bloom filter). 279 * 280 * @param numberOfItems Number of items to be placed in the filter 281 * @param probability The desired false-positive probability in the range {@code (0, 1)} 282 * @return a valid Shape 283 * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability 284 * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. 285 */ 286 public static Shape fromNP(final int numberOfItems, final double probability) { 287 checkNumberOfItems(numberOfItems); 288 checkProbability(probability); 289 290 // Number of bits (m) 291 final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); 292 if (m > Integer.MAX_VALUE) { 293 throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); 294 } 295 final int numberOfBits = (int) m; 296 297 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); 298 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 299 // check that probability is within range 300 checkCalculatedProbability(shape.getProbability(numberOfItems)); 301 return shape; 302 } 303 304 /** 305 * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the 306 * specified number of bits ({@code m}) and hash functions ({@code k}). 307 * 308 * <p>The number of items ({@code n}) to be stored in the filter is computed. 309 * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre> 310 * 311 * <p>The actual probability will be approximately equal to the 312 * desired probability but will be dependent upon the calculated Bloom filter capacity 313 * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the 314 * shape is invalid for use as a Bloom filter). 315 * 316 * @param probability The desired false-positive probability in the range {@code (0, 1)} 317 * @param numberOfBits The number of bits in the filter 318 * @param numberOfHashFunctions The number of hash functions in the filter 319 * @return a valid Shape. 320 * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, 321 * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual 322 * probability is {@code >= 1.0} 323 */ 324 public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { 325 checkProbability(probability); 326 checkNumberOfBits(numberOfBits); 327 checkNumberOfHashFunctions(numberOfHashFunctions); 328 329 // Number of items (n): 330 // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) 331 final double n = Math.ceil(numberOfBits / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions)))); 332 333 // log of probability is always < 0 334 // number of hash functions is >= 1 335 // e^x where x < 0 = [0,1) 336 // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 337 // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 338 // ceil( >0 ) >= 1 339 // so we cannot produce a negative value thus we don't check for it. 340 // 341 // similarly we cannot produce a number greater than numberOfBits so we 342 // do not have to check for Integer.MAX_VALUE either. 343 344 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 345 // check that probability is within range 346 checkCalculatedProbability(shape.getProbability((int) n)); 347 return shape; 348 } 349 350 /** 351 * Number of hash functions to create a filter ({@code k}). 352 */ 353 private final int numberOfHashFunctions; 354 355 /** 356 * Number of bits in the filter ({@code m}). 357 */ 358 private final int numberOfBits; 359 360 /** 361 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and 362 * bits ({@code m}). 363 * 364 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. 365 * @param numberOfBits The number of bits in the filter 366 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} 367 */ 368 private Shape(final int numberOfHashFunctions, final int numberOfBits) { 369 this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); 370 this.numberOfBits = checkNumberOfBits(numberOfBits); 371 } 372 373 @Override 374 public boolean equals(final Object obj) { 375 // Shape is final so no check for the same class as inheritance is not possible 376 if (obj instanceof Shape) { 377 final Shape other = (Shape) obj; 378 return numberOfBits == other.numberOfBits && numberOfHashFunctions == other.numberOfHashFunctions; 379 } 380 return false; 381 } 382 383 /** 384 * Estimates the maximum number of elements that can be merged into a filter of 385 * this shape before the false positive rate exceeds the desired rate. <p> The 386 * formula for deriving {@code k} when {@code m} and {@code n} are known is: 387 * 388 * <p>{@code k = ln2 * m / n}</p> 389 * 390 * <p>Solving for {@code n} yields:</p> 391 * 392 * <p>{@code n = ln2 * m / k}</p> 393 * 394 * @return An estimate of max N. 395 */ 396 public double estimateMaxN() { 397 return numberOfBits * LN_2 / numberOfHashFunctions; 398 } 399 400 /** 401 * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. 402 * 403 * <p><em>Note:</em></p> 404 * <ul> 405 * <li> if cardinality == numberOfBits, then result is infinity.</li> 406 * <li> if cardinality > numberOfBits, then result is NaN.</li> 407 * </ul> 408 * 409 * @param cardinality the number of enabled bits also known as the hamming value. 410 * @return An estimate of the number of items in the Bloom filter. 411 */ 412 public double estimateN(final int cardinality) { 413 final double c = cardinality; 414 final double m = numberOfBits; 415 final double k = numberOfHashFunctions; 416 return -(m / k) * Math.log1p(-c / m); 417 } 418 419 /** 420 * Gets the number of bits in the Bloom filter. 421 * This is also known as {@code m}. 422 * 423 * @return the number of bits in the Bloom filter ({@code m}). 424 */ 425 public int getNumberOfBits() { 426 return numberOfBits; 427 } 428 429 /** 430 * Gets the number of hash functions used to construct the filter. 431 * This is also known as {@code k}. 432 * 433 * @return the number of hash functions used to construct the filter ({@code k}). 434 */ 435 public int getNumberOfHashFunctions() { 436 return numberOfHashFunctions; 437 } 438 439 /** 440 * Calculates the probability of false positives ({@code p}) given 441 * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). 442 * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre> 443 * 444 * <p>This is the probability that a Bloom filter will return true for the presence of an item 445 * when it does not contain the item.</p> 446 * 447 * <p>The probability assumes that the Bloom filter is filled with the expected number of 448 * items. If the filter contains fewer items then the actual probability will be lower. 449 * Thus, this returns the worst-case false positive probability for a filter that has not 450 * exceeded its expected number of items.</p> 451 * 452 * @param numberOfItems the number of items hashed into the Bloom filter. 453 * @return the probability of false positives. 454 */ 455 public double getProbability(final int numberOfItems) { 456 if (numberOfItems < 0) { 457 throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); 458 } 459 if (numberOfItems == 0) { 460 return 0; 461 } 462 return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), numberOfHashFunctions); 463 } 464 465 @Override 466 public int hashCode() { 467 // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions}) 468 return (31 + numberOfBits) * 31 + numberOfHashFunctions; 469 } 470 471 /** 472 * Determines if a cardinality is sparse based on the shape. 473 * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory 474 * necessary to store the cardinality as indexes is less than the estimated memory for bit maps, 475 * the cardinality is determined to be {@code sparse}.</p> 476 * 477 * @param cardinality the cardinality to check. 478 * @return true if the cardinality is sparse within the shape. 479 */ 480 public boolean isSparse(final int cardinality) { 481 482 /* 483 * Since the size of a bit map is a long and the size of an index is an int, 484 * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly 485 * distributed across the range of possible values, Thus if the cardinality 486 * (number of indexes) is less than or equal to 2*number of bit maps the 487 * cardinality is sparse within the shape. 488 */ 489 return cardinality <= BitMaps.numberOfBitMaps(this) * 2; 490 } 491 492 @Override 493 public String toString() { 494 return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits); 495 } 496 }