BloomFilter.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. import java.util.Objects;

  19. /**
  20.  * The interface that describes a Bloom filter.
  21.  * <p>
  22.  * <em>See implementation notes for {@link BitMapExtractor} and {@link IndexExtractor}.</em>
  23.  * </p>
  24.  *
  25.  * @param <T> The BloomFilter type.
  26.  * @see BitMapExtractor
  27.  * @see IndexExtractor
  28.  * @since 4.5.0-M1
  29.  */
  30. public interface BloomFilter<T extends BloomFilter<T>> extends IndexExtractor, BitMapExtractor {

  31.     /**
  32.      * The sparse characteristic used to determine the best method for matching: {@value}.
  33.      * <p>
  34.      * For `sparse` implementations the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations the
  35.      * {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier.
  36.      * </p>
  37.      */
  38.     int SPARSE = 0x1;

  39.     /**
  40.      * Gets the cardinality (number of enabled bits) of this Bloom filter.
  41.      *
  42.      * <p>This is also known as the Hamming value or Hamming number.</p>
  43.      *
  44.      * @return the cardinality of this filter
  45.      */
  46.     int cardinality();

  47.     // Query Operations

  48.     /**
  49.      * Gets the characteristics of the filter.
  50.      * <p>
  51.      * Characteristics are defined as bits within the characteristics integer.
  52.      * </p>
  53.      *
  54.      * @return the characteristics for this bloom filter.
  55.      */
  56.     int characteristics();

  57.     /**
  58.      * Clears the filter to by resetting it to its initial, unpopulated state.
  59.      */
  60.     void clear();

  61.     /**
  62.      * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the
  63.      * bitMapExtractor.
  64.      *
  65.      * @param bitMapExtractor the {@code BitMapExtractor} to provide the bit maps.
  66.      * @return {@code true} if this filter is enabled for all bits specified by the bit maps
  67.      */
  68.     default boolean contains(final BitMapExtractor bitMapExtractor) {
  69.         return processBitMapPairs(bitMapExtractor, (x, y) -> (x & y) == y);
  70.     }

  71.     /**
  72.      * Returns {@code true} if this filter contains the specified filter.
  73.      *
  74.      * <p>Specifically this
  75.      * returns {@code true} if this filter is enabled for all bits that are enabled in the
  76.      * {@code other} filter. Using the bit representations this is
  77.      * effectively {@code (this AND other) == other}.</p>
  78.      *
  79.      * @param other the other Bloom filter
  80.      * @return true if all enabled bits in the other filter are enabled in this filter.
  81.      */
  82.     default boolean contains(final BloomFilter<?> other) {
  83.         Objects.requireNonNull(other, "other");
  84.         return (characteristics() & SPARSE) != 0 ? contains((IndexExtractor) other) : contains((BitMapExtractor) other);
  85.     }

  86.     /**
  87.      * Returns {@code true} if this filter contains the bits specified in the hasher.
  88.      *
  89.      * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
  90.      * identified by the {@code hasher}. Using the bit map representations this is
  91.      * effectively {@code (this AND hasher) == hasher}.</p>
  92.      *
  93.      * @param hasher the hasher to provide the indexes
  94.      * @return true if this filter is enabled for all bits specified by the hasher
  95.      */
  96.     default boolean contains(final Hasher hasher) {
  97.         Objects.requireNonNull(hasher, "Hasher");
  98.         final Shape shape = getShape();
  99.         return contains(hasher.indices(shape));
  100.     }

  101.     /**
  102.      * Returns {@code true} if this filter contains the indices specified IndexExtractor.
  103.      *
  104.      * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
  105.      * identified by the {@code IndexExtractor}.</p>
  106.      *
  107.      * @param indexExtractor the IndexExtractor to provide the indexes
  108.      * @return {@code true} if this filter is enabled for all bits specified by the IndexExtractor
  109.      */
  110.     boolean contains(IndexExtractor indexExtractor);

  111.     /**
  112.      * Creates a new instance of this {@link BloomFilter} with the same properties as the current one.
  113.      *
  114.      * @return a copy of this {@link BloomFilter}.
  115.      */
  116.     T copy();

  117.     // update operations

  118.     /**
  119.      * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
  120.      *
  121.      * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
  122.      * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
  123.      *
  124.      * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape.  If called on Bloom
  125.      * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
  126.      * may be thrown.</em></p>
  127.      *
  128.      * @param other The other Bloom filter
  129.      * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
  130.      * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
  131.      * @see #estimateN()
  132.      * @see Shape
  133.      */
  134.     default int estimateIntersection(final BloomFilter<?> other) {
  135.         Objects.requireNonNull(other, "other");
  136.         final double eThis = getShape().estimateN(cardinality());
  137.         final double eOther = getShape().estimateN(other.cardinality());
  138.         if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
  139.             // if both are infinite the union is infinite and we return Integer.MAX_VALUE
  140.             return Integer.MAX_VALUE;
  141.         }
  142.         long estimate;
  143.         // if one is infinite the intersection is the other.
  144.         if (Double.isInfinite(eThis)) {
  145.             estimate = Math.round(eOther);
  146.         } else if (Double.isInfinite(eOther)) {
  147.             estimate = Math.round(eThis);
  148.         } else {
  149.             final T union = this.copy();
  150.             union.merge(other);
  151.             final double eUnion = getShape().estimateN(union.cardinality());
  152.             if (Double.isInfinite(eUnion)) {
  153.                 throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
  154.             }
  155.             // maximum estimate value using integer values is: 46144189292 thus
  156.             // eThis + eOther cannot overflow the long value.
  157.             estimate = Math.round(eThis + eOther - eUnion);
  158.             estimate = estimate < 0 ? 0 : estimate;
  159.         }
  160.         return estimate > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) estimate;
  161.     }

  162.     /**
  163.      * Estimates the number of items in the Bloom filter.
  164.      *
  165.      * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
  166.      * shape and cardinality of this filter.</p>
  167.      *
  168.      * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
  169.      * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
  170.      *
  171.      * <p><em>Note:</em></p>
  172.      * <ul>
  173.      * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
  174.      * <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
  175.      * </ul>
  176.      *
  177.      * @return an estimate of the number of items in the bloom filter.  Will return Integer.MAX_VALUE if the
  178.      * estimate is larger than Integer.MAX_VALUE.
  179.      * @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
  180.      * @see Shape#estimateN(int)
  181.      * @see Shape
  182.      */
  183.     default int estimateN() {
  184.         final double d = getShape().estimateN(cardinality());
  185.         if (Double.isInfinite(d)) {
  186.             return Integer.MAX_VALUE;
  187.         }
  188.         if (Double.isNaN(d)) {
  189.             throw new IllegalArgumentException("Cardinality too large: " + cardinality());
  190.         }
  191.         final long l = Math.round(d);
  192.         return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
  193.     }

  194.     /**
  195.      * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
  196.      *
  197.      * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
  198.      * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
  199.      *
  200.      * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape.  If called on Bloom
  201.      * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
  202.      * may be thrown.</em></p>
  203.      *
  204.      * @param other The other Bloom filter
  205.      * @return an estimate of the number of items in the union.  Will return Integer.MAX_VALUE if the
  206.      * estimate is larger than Integer.MAX_VALUE.
  207.      * @see #estimateN()
  208.      * @see Shape
  209.      */
  210.     default int estimateUnion(final BloomFilter<?> other) {
  211.         Objects.requireNonNull(other, "other");
  212.         final T copy = this.copy();
  213.         copy.merge(other);
  214.         return copy.estimateN();
  215.     }

  216.     /**
  217.      * Gets the shape that was used when the filter was built.
  218.      * @return The shape the filter was built with.
  219.      */
  220.     Shape getShape();

  221.     // Counting Operations

  222.     /**
  223.      * Determines if all the bits are off. This is equivalent to
  224.      * {@code cardinality() == 0}.
  225.      *
  226.      * <p>
  227.      * <em>Note: This method is optimized for non-sparse filters.</em> Implementers
  228.      * are encouraged to implement faster checks if possible.
  229.      * </p>
  230.      *
  231.      * @return {@code true} if no bits are enabled, {@code false} otherwise.
  232.      */
  233.     default boolean isEmpty() {
  234.         return processBitMaps(y -> y == 0);
  235.     }

  236.     /**
  237.      * Determines if the bloom filter is "full".
  238.      *
  239.      * <p>Full is defined as having no unset bits.</p>
  240.      *
  241.      * @return {@code true} if the filter is full, {@code false} otherwise.
  242.      */
  243.     default boolean isFull() {
  244.         return cardinality() == getShape().getNumberOfBits();
  245.     }

  246.     /**
  247.      * Merges the specified hasher into this Bloom filter. Specifically all
  248.      * bit indexes that are identified by the {@code bitMapExtractor} will be enabled in this filter.
  249.      *
  250.      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
  251.      * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes
  252.      * enabled in the {@code bitMapExtractor}.</em>  This state may occur in complex Bloom filter implementations like
  253.      * counting Bloom filters.</p>
  254.      *
  255.      * @param bitMapExtractor The BitMapExtractor to merge.
  256.      * @return true if the merge was successful
  257.      * @throws IllegalArgumentException if bitMapExtractor sends illegal value.
  258.      */
  259.     boolean merge(BitMapExtractor bitMapExtractor);

  260.     /**
  261.      * Merges the specified Bloom filter into this Bloom filter.
  262.      *
  263.      * <p>Specifically all
  264.      * bit indexes that are identified by the {@code other} will be enabled in this filter.</p>
  265.      *
  266.      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
  267.      * enabled. A {@code false} result indicates that this filter may or may not contain
  268.      * the {@code other} Bloom filter.</em>  This state may occur in complex Bloom filter implementations like
  269.      * counting Bloom filters.</p>
  270.      *
  271.      * @param other The bloom filter to merge into this one.
  272.      * @return true if the merge was successful
  273.      */
  274.     default boolean merge(final BloomFilter<?> other) {
  275.         return (characteristics() & SPARSE) != 0 ? merge((IndexExtractor) other) : merge((BitMapExtractor) other);
  276.     }

  277.     /**
  278.      * Merges the specified hasher into this Bloom filter. Specifically all
  279.      * bit indexes that are identified by the {@code hasher} will be enabled in this filter.
  280.      *
  281.      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
  282.      * enabled. A {@code false} result indicates that this filter may or may not contain
  283.      * the {@code hasher} values.</em>  This state may occur in complex Bloom filter implementations like
  284.      * counting Bloom filters.</p>
  285.      *
  286.      * @param hasher The hasher to merge.
  287.      * @return true if the merge was successful
  288.      * @throws IllegalArgumentException if hasher produces an illegal value.
  289.      */
  290.     default boolean merge(final Hasher hasher) {
  291.         Objects.requireNonNull(hasher, "hasher");
  292.         return merge(hasher.indices(getShape()));
  293.     }

  294.     /**
  295.      * Merges the specified IndexExtractor into this Bloom filter. Specifically all
  296.      * bit indexes that are identified by the {@code indexExtractor} will be enabled in this filter.
  297.      *
  298.      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
  299.      * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of
  300.      * the {@code indexExtractor}.</em>  This state may occur in complex Bloom filter implementations like
  301.      * counting Bloom filters.</p>
  302.      *
  303.      * @param indexExtractor The IndexExtractor to merge.
  304.      * @return true if the merge was successful
  305.      * @throws IllegalArgumentException if indexExtractor sends illegal value.
  306.      */
  307.     boolean merge(IndexExtractor indexExtractor);

  308.     /**
  309.      * Most Bloom filters create unique IndexExtractors.
  310.      */
  311.     @Override
  312.     default IndexExtractor uniqueIndices() {
  313.         return this;
  314.     }
  315. }