Shape.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. /**
  19.  * The definition of a Bloom filter shape.
  20.  *
  21.  * <p>This class contains the values for the filter configuration and is used to
  22.  * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are
  23.  * compatible. (i.e. can be compared or merged)</p>
  24.  *
  25.  * <h2>Interrelatedness of values</h2>
  26.  *
  27.  * <dl>
  28.  * <dt>Number of Items ({@code n})</dt>
  29.  * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd>
  30.  * <dt>Probability of False Positives ({@code p})</dt>
  31.  * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd>
  32.  * <dt>Number of Bits ({@code m})</dt>
  33.  * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd>
  34.  * <dt>Number of Functions ({@code k})</dt>
  35.  * <dd>{@code k = round((m / n) * ln(2))}</dd>
  36.  * </dl>
  37.  *
  38.  * <h2>Estimations from cardinality based on shape</h2>
  39.  *
  40.  * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p>
  41.  *
  42.  * <p>In the calculation below the following values are used:</p>
  43.  * <ul>
  44.  * <li>double c = the cardinality of the Bloom filter.</li>
  45.  * <li>double m = numberOfBits as specified in the shape.</li>
  46.  * <li>double k = numberOfHashFunctions as specified in the shape.</li>
  47.  * </ul>
  48.  *
  49.  * <h3>Estimate N - n()</h3>
  50.  *
  51.  * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}.  This is the calculation
  52.  * performed by the {@code Shape.estimateN(cardinality)} method below.  This estimate is roughly equivalent to the
  53.  * number of hashers that have been merged into a filter to create the cardinality specified.</p>
  54.  *
  55.  * <p><em>Note:</em></p>
  56.  * <ul>
  57.  * <li>if cardinality == numberOfBits, then result is infinity.</li>
  58.  * <li>if cardinality &gt; numberOfBits, then result is NaN.</li>
  59.  * </ul>
  60.  *
  61.  * <h3>Estimate N of Union - n(A &cup; B)</h3>
  62.  *
  63.  * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and
  64.  * calculate the estimated N from the result.</p>
  65.  *
  66.  * <h3>Estimate N of the Intersection - n(A &cap; B)</h3>
  67.  *
  68.  * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is:
  69.  * n(A) + n(b) - n(A &cup; B).</p>
  70.  *
  71.  * <p>Care must be taken when any of the n(x) returns infinity.  In general the following assumptions are true:
  72.  *
  73.  * <ul>
  74.  * <li>If n(A) = &infin; and n(B) &lt; &infin; then n(A &cap; B) = n(B)</li>
  75.  * <li>If n(A) &lt; &infin; and n(B) = &infin; then n(A &cap; B) = n(A)</li>
  76.  * <li>If n(A) = &infin; and n(B) = &infin; then n(A &cap; B) = &infin;</li>
  77.  * <li>If n(A) &lt; &infin; and n(B) &lt; &infin; and n(A &cup; B) = &infin; then n(A &cap; B) is undefined.</li>
  78.  * </ul>
  79.  *
  80.  * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a>
  81.  * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter
  82.  * [Wikipedia]</a>
  83.  * @since 4.5.0-M1
  84.  */
  85. public final class Shape {

  86.     /**
  87.      * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945.
  88.      */
  89.     private static final double LN_2 = Math.log(2.0);

  90.     /**
  91.      * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201.
  92.      *
  93.      * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2)</p>
  94.      */
  95.     private static final double DENOMINATOR = -LN_2 * LN_2;

  96.     /**
  97.      * Calculates the number of hash functions given numberOfItems and numberOfBits.
  98.      * This is a method so that the calculation is consistent across all constructors.
  99.      *
  100.      * @param numberOfItems the number of items in the filter.
  101.      * @param numberOfBits the number of bits in the filter.
  102.      * @return the optimal number of hash functions.
  103.      * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1}
  104.      */
  105.     private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) {
  106.         // k = round((m / n) * ln(2)) We change order so that we use real math rather
  107.         // than integer math.
  108.         final long k = Math.round(LN_2 * numberOfBits / numberOfItems);
  109.         if (k < 1) {
  110.             throw new IllegalArgumentException(String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k));
  111.         }
  112.         // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but
  113.         // since numberOfBits is at most Integer.MAX_VALUE the numerator of
  114.         // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the
  115.         // value of k cannot be above Integer.MAX_VALUE.
  116.         return (int) k;
  117.     }

  118.     /**
  119.      * Checks the calculated probability is {@code < 1.0}.
  120.      *
  121.      * <p>
  122.      * This function is used to verify that the dynamically calculated probability for the Shape is in the valid range 0 to 1 exclusive. This need only be
  123.      * performed once upon construction.
  124.      * </p>
  125.      *
  126.      * @param probability the probability
  127.      * @throws IllegalArgumentException if the probability is {@code >= 1.0}.
  128.      */
  129.     private static void checkCalculatedProbability(final double probability) {
  130.         // We do not need to check for p <= 0.0 since we only allow positive values for
  131.         // parameters and the closest we can come to exp(-kn/m) == 1 is
  132.         // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow(x, y) will
  133.         // always be 0<x<1 and y>0
  134.         if (probability >= 1.0) {
  135.             throw new IllegalArgumentException("Calculated probability is greater than or equal to 1: " + probability);
  136.         }
  137.     }

  138.     /**
  139.      * Checks number of bits is strictly positive.
  140.      *
  141.      * @param numberOfBits the number of bits
  142.      * @return the number of bits
  143.      * @throws IllegalArgumentException if the number of bits is {@code < 1}.
  144.      */
  145.     private static int checkNumberOfBits(final int numberOfBits) {
  146.         if (numberOfBits < 1) {
  147.             throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits);
  148.         }
  149.         return numberOfBits;
  150.     }

  151.     /**
  152.      * Checks number of hash functions is strictly positive.
  153.      *
  154.      * @param numberOfHashFunctions the number of hash functions
  155.      * @return the number of hash functions
  156.      * @throws IllegalArgumentException if the number of hash functions is {@code < 1}.
  157.      */
  158.     private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) {
  159.         if (numberOfHashFunctions < 1) {
  160.             throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions);
  161.         }
  162.         return numberOfHashFunctions;
  163.     }

  164.     /**
  165.      * Checks number of items is strictly positive.
  166.      *
  167.      * @param numberOfItems the number of items
  168.      * @return the number of items
  169.      * @throws IllegalArgumentException if the number of items is {@code < 1}.
  170.      */
  171.     private static int checkNumberOfItems(final int numberOfItems) {
  172.         if (numberOfItems < 1) {
  173.             throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems);
  174.         }
  175.         return numberOfItems;
  176.     }

  177.     /**
  178.      * Checks the probability is in the range 0.0, exclusive, to 1.0, exclusive.
  179.      *
  180.      * @param probability the probability
  181.      * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)}
  182.      */
  183.     private static void checkProbability(final double probability) {
  184.         // Using the negation of within the desired range will catch NaN
  185.         if (!(probability > 0.0 && probability < 1.0)) {
  186.             throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability);
  187.         }
  188.     }

  189.     /**
  190.      * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
  191.      * bits ({@code m}).
  192.      *
  193.      * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
  194.      * @param numberOfBits The number of bits in the filter
  195.      * @return a valid Shape.
  196.      * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
  197.      */
  198.     public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) {
  199.         return new Shape(numberOfHashFunctions, numberOfBits);
  200.     }

  201.     /**
  202.      * Constructs a filter configuration with the specified number of items ({@code n}) and
  203.      * bits ({@code m}).
  204.      *
  205.      * <p>The optimal number of hash functions ({@code k}) is computed.
  206.      * <pre>k = round((m / n) * ln(2))</pre>
  207.      *
  208.      * <p>The false-positive probability is computed using the number of items, bits and hash
  209.      * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
  210.      * shape is invalid for use as a Bloom filter).
  211.      *
  212.      * @param numberOfItems Number of items to be placed in the filter
  213.      * @param numberOfBits The number of bits in the filter
  214.      * @return a valid Shape.
  215.      * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
  216.      * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0}
  217.      */
  218.     public static Shape fromNM(final int numberOfItems, final int numberOfBits) {
  219.         checkNumberOfItems(numberOfItems);
  220.         checkNumberOfBits(numberOfBits);
  221.         final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
  222.         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
  223.         // check that probability is within range
  224.         checkCalculatedProbability(shape.getProbability(numberOfItems));
  225.         return shape;
  226.     }

  227.     /**
  228.      * Constructs a filter configuration with the specified number of items, bits
  229.      * and hash functions.
  230.      *
  231.      * <p>The false-positive probability is computed using the number of items, bits and hash
  232.      * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
  233.      * shape is invalid for use as a Bloom filter).
  234.      *
  235.      * @param numberOfItems Number of items to be placed in the filter
  236.      * @param numberOfBits The number of bits in the filter.
  237.      * @param numberOfHashFunctions The number of hash functions in the filter
  238.      * @return a valid Shape.
  239.      * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1},
  240.      * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}.
  241.      */
  242.     public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) {
  243.         checkNumberOfItems(numberOfItems);
  244.         checkNumberOfBits(numberOfBits);
  245.         checkNumberOfHashFunctions(numberOfHashFunctions);
  246.         // check that probability is within range
  247.         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
  248.         // check that probability is within range
  249.         checkCalculatedProbability(shape.getProbability(numberOfItems));
  250.         return shape;
  251.     }

  252.     /**
  253.      * Constructs a filter configuration with the specified number of items ({@code n}) and
  254.      * desired false-positive probability ({@code p}).
  255.      *
  256.      * <p>The number of bits ({@code m}) for the filter is computed.
  257.      * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre>
  258.      *
  259.      * <p>The optimal number of hash functions ({@code k}) is computed.
  260.      * <pre>k = round((m / n) * ln(2))</pre>
  261.      *
  262.      * <p>The actual probability will be approximately equal to the
  263.      * desired probability but will be dependent upon the calculated number of bits and hash
  264.      * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
  265.      * shape is invalid for use as a Bloom filter).
  266.      *
  267.      * @param numberOfItems Number of items to be placed in the filter
  268.      * @param probability The desired false-positive probability in the range {@code (0, 1)}
  269.      * @return a valid Shape
  270.      * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability
  271.      * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}.
  272.      */
  273.     public static Shape fromNP(final int numberOfItems, final double probability) {
  274.         checkNumberOfItems(numberOfItems);
  275.         checkProbability(probability);

  276.         // Number of bits (m)
  277.         final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR);
  278.         if (m > Integer.MAX_VALUE) {
  279.             throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m);
  280.         }
  281.         final int numberOfBits = (int) m;

  282.         final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
  283.         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
  284.         // check that probability is within range
  285.         checkCalculatedProbability(shape.getProbability(numberOfItems));
  286.         return shape;
  287.     }

  288.     /**
  289.      * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the
  290.      * specified number of bits ({@code m}) and hash functions ({@code k}).
  291.      *
  292.      * <p>The number of items ({@code n}) to be stored in the filter is computed.
  293.      * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre>
  294.      *
  295.      * <p>The actual probability will be approximately equal to the
  296.      * desired probability but will be dependent upon the calculated Bloom filter capacity
  297.      * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the
  298.      * shape is invalid for use as a Bloom filter).
  299.      *
  300.      * @param probability The desired false-positive probability in the range {@code (0, 1)}
  301.      * @param numberOfBits The number of bits in the filter
  302.      * @param numberOfHashFunctions The number of hash functions in the filter
  303.      * @return a valid Shape.
  304.      * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)},
  305.      * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual
  306.      * probability is {@code >= 1.0}
  307.      */
  308.     public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) {
  309.         checkProbability(probability);
  310.         checkNumberOfBits(numberOfBits);
  311.         checkNumberOfHashFunctions(numberOfHashFunctions);

  312.         // Number of items (n):
  313.         // n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
  314.         final double n = Math.ceil(numberOfBits / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions))));

  315.         // log of probability is always < 0
  316.         // number of hash functions is >= 1
  317.         // e^x where x < 0 = [0,1)
  318.         // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53
  319.         // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0
  320.         // ceil( >0 ) >= 1
  321.         // so we cannot produce a negative value thus we don't check for it.
  322.         //
  323.         // similarly we cannot produce a number greater than numberOfBits so we
  324.         // do not have to check for Integer.MAX_VALUE either.

  325.         final Shape shape = new Shape(numberOfHashFunctions, numberOfBits);
  326.         // check that probability is within range
  327.         checkCalculatedProbability(shape.getProbability((int) n));
  328.         return shape;
  329.     }

  330.     /**
  331.      * Number of hash functions to create a filter ({@code k}).
  332.      */
  333.     private final int numberOfHashFunctions;

  334.     /**
  335.      * Number of bits in the filter ({@code m}).
  336.      */
  337.     private final int numberOfBits;

  338.     /**
  339.      * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and
  340.      * bits ({@code m}).
  341.      *
  342.      * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter.
  343.      * @param numberOfBits The number of bits in the filter
  344.      * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1}
  345.      */
  346.     private Shape(final int numberOfHashFunctions, final int numberOfBits) {
  347.         this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions);
  348.         this.numberOfBits = checkNumberOfBits(numberOfBits);
  349.     }

  350.     @Override
  351.     public boolean equals(final Object obj) {
  352.         // Shape is final so no check for the same class as inheritance is not possible
  353.         if (obj instanceof Shape) {
  354.             final Shape other = (Shape) obj;
  355.             return numberOfBits == other.numberOfBits && numberOfHashFunctions == other.numberOfHashFunctions;
  356.         }
  357.         return false;
  358.     }

  359.     /**
  360.      * Estimates the maximum number of elements that can be merged into a filter of
  361.      * this shape before the false positive rate exceeds the desired rate. <p> The
  362.      * formula for deriving {@code k} when {@code m} and {@code n} are known is:
  363.      *
  364.      * <p>{@code k = ln2 * m / n}</p>
  365.      *
  366.      * <p>Solving for {@code n} yields:</p>
  367.      *
  368.      * <p>{@code n = ln2 * m / k}</p>
  369.      *
  370.      * @return An estimate of max N.
  371.      */
  372.     public double estimateMaxN() {
  373.         return numberOfBits * LN_2 / numberOfHashFunctions;
  374.     }

  375.     /**
  376.      * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled.
  377.      *
  378.      * <p><em>Note:</em></p>
  379.      * <ul>
  380.      * <li> if cardinality == numberOfBits, then result is infinity.</li>
  381.      * <li> if cardinality &gt; numberOfBits, then result is NaN.</li>
  382.      * </ul>
  383.      *
  384.      * @param cardinality the number of enabled  bits also known as the hamming value.
  385.      * @return An estimate of the number of items in the Bloom filter.
  386.      */
  387.     public double estimateN(final int cardinality) {
  388.         final double c = cardinality;
  389.         final double m = numberOfBits;
  390.         final double k = numberOfHashFunctions;
  391.         return -(m / k) * Math.log1p(-c / m);
  392.     }

  393.     /**
  394.      * Gets the number of bits in the Bloom filter.
  395.      * This is also known as {@code m}.
  396.      *
  397.      * @return the number of bits in the Bloom filter ({@code m}).
  398.      */
  399.     public int getNumberOfBits() {
  400.         return numberOfBits;
  401.     }

  402.     /**
  403.      * Gets the number of hash functions used to construct the filter.
  404.      * This is also known as {@code k}.
  405.      *
  406.      * @return the number of hash functions used to construct the filter ({@code k}).
  407.      */
  408.     public int getNumberOfHashFunctions() {
  409.         return numberOfHashFunctions;
  410.     }

  411.     /**
  412.      * Calculates the probability of false positives ({@code p}) given
  413.      * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}).
  414.      * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre>
  415.      *
  416.      * <p>This is the probability that a Bloom filter will return true for the presence of an item
  417.      * when it does not contain the item.</p>
  418.      *
  419.      * <p>The probability assumes that the Bloom filter is filled with the expected number of
  420.      * items. If the filter contains fewer items then the actual probability will be lower.
  421.      * Thus, this returns the worst-case false positive probability for a filter that has not
  422.      * exceeded its expected number of items.</p>
  423.      *
  424.      * @param numberOfItems the number of items hashed into the Bloom filter.
  425.      * @return the probability of false positives.
  426.      */
  427.     public double getProbability(final int numberOfItems) {
  428.         if (numberOfItems < 0) {
  429.             throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems);
  430.         }
  431.         if (numberOfItems == 0) {
  432.             return 0;
  433.         }
  434.         return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), numberOfHashFunctions);
  435.     }

  436.     @Override
  437.     public int hashCode() {
  438.         // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions})
  439.         return (31 + numberOfBits) * 31 + numberOfHashFunctions;
  440.     }

  441.     /**
  442.      * Determines if a cardinality is sparse based on the shape.
  443.      * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory
  444.      * necessary to store the cardinality as indexes is less than the estimated memory for bit maps,
  445.      * the cardinality is determined to be {@code sparse}.</p>
  446.      *
  447.      * @param cardinality the cardinality to check.
  448.      * @return true if the cardinality is sparse within the shape.
  449.      */
  450.     public boolean isSparse(final int cardinality) {

  451.         /*
  452.          * Since the size of a bit map is a long and the size of an index is an int,
  453.          * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly
  454.          * distributed across the range of possible values, Thus if the cardinality
  455.          * (number of indexes) is less than or equal to 2*number of bit maps the
  456.          * cardinality is sparse within the shape.
  457.          */
  458.         return cardinality <= BitMaps.numberOfBitMaps(this) * 2;
  459.     }

  460.     @Override
  461.     public String toString() {
  462.         return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits);
  463.     }
  464. }