LayeredBloomFilter.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. import java.util.Arrays;
  19. import java.util.NoSuchElementException;
  20. import java.util.Objects;
  21. import java.util.function.IntPredicate;
  22. import java.util.function.LongPredicate;
  23. import java.util.function.Predicate;

  24. /**
  25.  * Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd
  26.  * International Conference on Advanced Computer Theory and Engineering (ICACTE 2010), vol. 1, pp. V1-586-V1-591, doi:10.1109/ICACTE.2010.5578947, ISBN
  27.  * 978-1-4244-6539-2, S2CID 3108985
  28.  * <p>
  29.  * In short, Layered Bloom filter contains several bloom filters arranged in layers.
  30.  * </p>
  31.  * <ul>
  32.  * <li>When membership in the filter is checked each layer in turn is checked and if a match is found {@code true} is returned.</li>
  33.  * <li>When merging each bloom filter is merged into the newest filter in the list of layers.</li>
  34.  * <li>When questions of cardinality are asked the cardinality of the union of the enclosed Bloom filters is used.</li>
  35.  * </ul>
  36.  * <p>
  37.  * The net result is that the layered Bloom filter can be populated with more items than the Shape would indicate and yet still return a false positive rate in
  38.  * line with the Shape and not the over population.
  39.  * </p>
  40.  * <p>
  41.  * This implementation uses a LayerManager to handle the manipulation of the layers.
  42.  * </p>
  43.  * <ul>
  44.  * <li>Level 0 is the oldest layer and the highest level is the newest.</li>
  45.  * <li>There is always at least one enclosed filter.</li>
  46.  * <li>The newest filter is the {@code target} into which merges are performed.
  47.  * <li>Whenever the target is retrieved, or a {@code merge} operation is performed the code checks if any older layers should be removed, and if so removes
  48.  * them. It also checks it a new layer should be added, and if so adds it and sets the {@code target} before the operation.</li>
  49.  * </ul>
  50.  *
  51.  * @param <T> The type of Bloom Filter that is used for the layers.
  52.  * @since 4.5.0-M2
  53.  */
  54. public class LayeredBloomFilter<T extends BloomFilter<T>> implements BloomFilter<LayeredBloomFilter<T>>, BloomFilterExtractor {

  55.     /**
  56.      * A class used to locate matching filters across all the layers.
  57.      */
  58.     private class Finder implements Predicate<BloomFilter> {
  59.         int[] result = new int[layerManager.getDepth()];
  60.         int bfIdx;
  61.         int resultIdx;
  62.         BloomFilter<?> bf;

  63.         Finder(final BloomFilter<?> bf) {
  64.             this.bf = bf;
  65.         }

  66.         int[] getResult() {
  67.             return Arrays.copyOf(result, resultIdx);
  68.         }

  69.         @Override
  70.         public boolean test(final BloomFilter x) {
  71.             if (x.contains(bf)) {
  72.                 result[resultIdx++] = bfIdx;
  73.             }
  74.             bfIdx++;
  75.             return true;
  76.         }
  77.     }

  78.     private final Shape shape;

  79.     private final LayerManager<T> layerManager;

  80.     /**
  81.      * Constructs a new instance.
  82.      *
  83.      * @param shape        the Shape of the enclosed Bloom filters
  84.      * @param layerManager the LayerManager to manage the layers.
  85.      */
  86.     public LayeredBloomFilter(final Shape shape, final LayerManager<T> layerManager) {
  87.         this.shape = shape;
  88.         this.layerManager = layerManager;
  89.     }

  90.     @Override
  91.     public int cardinality() {
  92.         return SetOperations.cardinality(this);
  93.     }

  94.     @Override
  95.     public int characteristics() {
  96.         return 0;
  97.     }

  98.     /**
  99.      * Forces the execution of the cleanup Consumer that was provided when the associated LayerManager was built.
  100.      *
  101.      * @see LayerManager.Builder#setCleanup(java.util.function.Consumer)
  102.      */
  103.     public void cleanup() {
  104.         layerManager.cleanup();
  105.     }

  106.     @Override
  107.     public final void clear() {
  108.         layerManager.clear();
  109.     }

  110.     @Override
  111.     public boolean contains(final BitMapExtractor bitMapExtractor) {
  112.         return contains(createFilter(bitMapExtractor));
  113.     }

  114.     /**
  115.      * Returns {@code true} if this any layer contained by this filter contains the specified filter.
  116.      * <p>
  117.      * If the {@code other} is a BloomFilterExtractor each filter within the {@code other} is checked to see if it exits within this filter.
  118.      * </p>
  119.      *
  120.      * @param other the other Bloom filter
  121.      * @return {@code true} if this filter contains the other filter.
  122.      */
  123.     @Override
  124.     public boolean contains(final BloomFilter other) {
  125.         return other instanceof BloomFilterExtractor ? contains((BloomFilterExtractor) other) : !processBloomFilters(x -> !x.contains(other));
  126.     }

  127.     /**
  128.      * Returns {@code true} if each filter within the {@code bloomFilterExtractor} exits within this filter.
  129.      *
  130.      * @param bloomFilterExtractor the BloomFilterExtractor that provides the filters to check for.
  131.      * @return {@code true} if this filter contains all of the filters contained in the {@code bloomFilterExtractor}.
  132.      */
  133.     public boolean contains(final BloomFilterExtractor bloomFilterExtractor) {
  134.         final boolean[] result = { true };
  135.         // return false when we have found a match to short circuit checks
  136.         return bloomFilterExtractor.processBloomFilters(x -> {
  137.             result[0] &= contains(x);
  138.             return result[0];
  139.         });
  140.     }

  141.     @Override
  142.     public boolean contains(final Hasher hasher) {
  143.         return contains(createFilter(hasher));
  144.     }

  145.     @Override
  146.     public boolean contains(final IndexExtractor indexExtractor) {
  147.         return contains(createFilter(indexExtractor));
  148.     }

  149.     /**
  150.      * Creates a new instance of this {@link LayeredBloomFilter} with the same properties as the current one.
  151.      *
  152.      * @return a copy of this {@link LayeredBloomFilter}.
  153.      */
  154.     @Override
  155.     public LayeredBloomFilter<T> copy() {
  156.         return new LayeredBloomFilter<>(shape, layerManager.copy());
  157.     }

  158.     /**
  159.      * Creates a Bloom filter from a BitMapExtractor.
  160.      *
  161.      * @param bitMapExtractor the BitMapExtractor to create the filter from.
  162.      * @return the BloomFilter.
  163.      */
  164.     private SimpleBloomFilter createFilter(final BitMapExtractor bitMapExtractor) {
  165.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  166.         bf.merge(bitMapExtractor);
  167.         return bf;
  168.     }

  169.     /**
  170.      * Creates a Bloom filter from a Hasher.
  171.      *
  172.      * @param hasher the hasher to create the filter from.
  173.      * @return the BloomFilter.
  174.      */
  175.     private SimpleBloomFilter createFilter(final Hasher hasher) {
  176.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  177.         bf.merge(hasher);
  178.         return bf;
  179.     }

  180.     /**
  181.      * Creates a Bloom filter from an IndexExtractor.
  182.      *
  183.      * @param indexExtractor the IndexExtractor to create the filter from.
  184.      * @return the BloomFilter.
  185.      */
  186.     private SimpleBloomFilter createFilter(final IndexExtractor indexExtractor) {
  187.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  188.         bf.merge(indexExtractor);
  189.         return bf;
  190.     }

  191.     @Override
  192.     public int estimateN() {
  193.         return flatten().estimateN();
  194.     }

  195.     @Override
  196.     public int estimateUnion(final BloomFilter other) {
  197.         Objects.requireNonNull(other, "other");
  198.         final BloomFilter cpy = this.flatten();
  199.         cpy.merge(other);
  200.         return cpy.estimateN();
  201.     }

  202.     /**
  203.      * Finds the layers in which the BitMapExtractor is found.
  204.      *
  205.      * @param bitMapExtractor the BitMapExtractor to search for.
  206.      * @return an array of layer indices in which the Bloom filter is found.
  207.      */
  208.     public int[] find(final BitMapExtractor bitMapExtractor) {
  209.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  210.         bf.merge(bitMapExtractor);
  211.         return find(bf);
  212.     }

  213.     /**
  214.      * Finds the layers in which the Bloom filter is found.
  215.      *
  216.      * @param bf the Bloom filter to search for.
  217.      * @return an array of layer indices in which the Bloom filter is found.
  218.      */
  219.     public int[] find(final BloomFilter bf) {
  220.         final Finder finder = new Finder(bf);
  221.         processBloomFilters(finder);
  222.         return finder.getResult();
  223.     }

  224.     /**
  225.      * Finds the layers in which the Hasher is found.
  226.      *
  227.      * @param hasher the Hasher to search for.
  228.      * @return an array of layer indices in which the Bloom filter is found.
  229.      */
  230.     public int[] find(final Hasher hasher) {
  231.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  232.         bf.merge(hasher);
  233.         return find(bf);
  234.     }

  235.     /**
  236.      * Finds the layers in which the IndexExtractor is found.
  237.      *
  238.      * @param indexExtractor the Index extractor to search for.
  239.      * @return an array of layer indices in which the Bloom filter is found.
  240.      */
  241.     public int[] find(final IndexExtractor indexExtractor) {
  242.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  243.         bf.merge(indexExtractor);
  244.         return find(bf);
  245.     }

  246.     /**
  247.      * Create a standard (non-layered) Bloom filter by merging all of the layers. If the filter is empty this method will return an empty Bloom filter.
  248.      *
  249.      * @return the merged bloom filter.
  250.      */
  251.     @Override
  252.     public SimpleBloomFilter flatten() {
  253.         final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
  254.         processBloomFilters(bf::merge);
  255.         return bf;
  256.     }

  257.     /**
  258.      * Gets the Bloom filter at the specified depth
  259.      *
  260.      * @param depth the depth of the filter to return.
  261.      * @return the Bloom filter at the specified depth.
  262.      * @throws NoSuchElementException if depth is not in the range [0,getDepth())
  263.      */
  264.     public T get(final int depth) {
  265.         return layerManager.get(depth);
  266.     }

  267.     /**
  268.      * Gets the depth of the deepest layer. The minimum value returned by this method is 1.
  269.      *
  270.      * @return the depth of the deepest layer.
  271.      */
  272.     public final int getDepth() {
  273.         return layerManager.getDepth();
  274.     }

  275.     @Override
  276.     public final Shape getShape() {
  277.         return shape;
  278.     }

  279.     @Override
  280.     public boolean isEmpty() {
  281.         return processBloomFilters(BloomFilter::isEmpty);
  282.     }

  283.     @Override
  284.     public boolean merge(final BitMapExtractor bitMapExtractor) {
  285.         return layerManager.getTarget().merge(bitMapExtractor);
  286.     }

  287.     @Override
  288.     public boolean merge(final BloomFilter bf) {
  289.         return layerManager.getTarget().merge(bf);
  290.     }

  291.     @Override
  292.     public boolean merge(final IndexExtractor indexExtractor) {
  293.         return layerManager.getTarget().merge(indexExtractor);
  294.     }

  295.     /**
  296.      * Forces and advance to the next layer. This method will clean-up the current layers and generate a new filter layer. In most cases is it unnecessary to
  297.      * call this method directly.
  298.      *
  299.      * @see LayerManager.Builder#setCleanup(java.util.function.Consumer)
  300.      * @see LayerManager.Builder#setExtendCheck(Predicate)
  301.      */
  302.     public void next() {
  303.         layerManager.next();
  304.     }

  305.     @Override
  306.     public boolean processBitMaps(final LongPredicate predicate) {
  307.         return flatten().processBitMaps(predicate);
  308.     }

  309.     /**
  310.      * Processes the Bloom filters in depth order with the most recent filters first. Each filter is passed to the predicate in turn. The function exits on the
  311.      * first {@code false} returned by the predicate.
  312.      *
  313.      * @param bloomFilterPredicate the predicate to execute.
  314.      * @return {@code true} if all filters passed the predicate, {@code false} otherwise.
  315.      */
  316.     @Override
  317.     public final boolean processBloomFilters(final Predicate<BloomFilter> bloomFilterPredicate) {
  318.         return layerManager.processBloomFilters(bloomFilterPredicate);
  319.     }

  320.     @Override
  321.     public boolean processIndices(final IntPredicate predicate) {
  322.         return processBloomFilters(bf -> bf.processIndices(predicate));
  323.     }

  324. }