LayeredBloomFilter.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import java.util.Arrays;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.function.IntPredicate;
import java.util.function.LongPredicate;
import java.util.function.Predicate;
/**
* Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd
* International Conference on Advanced Computer Theory and Engineering (ICACTE 2010), vol. 1, pp. V1-586-V1-591, doi:10.1109/ICACTE.2010.5578947, ISBN
* 978-1-4244-6539-2, S2CID 3108985
* <p>
* In short, Layered Bloom filter contains several bloom filters arranged in layers.
* </p>
* <ul>
* <li>When membership in the filter is checked each layer in turn is checked and if a match is found {@code true} is returned.</li>
* <li>When merging each bloom filter is merged into the newest filter in the list of layers.</li>
* <li>When questions of cardinality are asked the cardinality of the union of the enclosed Bloom filters is used.</li>
* </ul>
* <p>
* The net result is that the layered Bloom filter can be populated with more items than the Shape would indicate and yet still return a false positive rate in
* line with the Shape and not the over population.
* </p>
* <p>
* This implementation uses a LayerManager to handle the manipulation of the layers.
* </p>
* <ul>
* <li>Level 0 is the oldest layer and the highest level is the newest.</li>
* <li>There is always at least one enclosed filter.</li>
* <li>The newest filter is the {@code target} into which merges are performed.
* <li>Whenever the target is retrieved, or a {@code merge} operation is performed the code checks if any older layers should be removed, and if so removes
* them. It also checks it a new layer should be added, and if so adds it and sets the {@code target} before the operation.</li>
* </ul>
*
* @param <T> The type of Bloom Filter that is used for the layers.
* @since 4.5.0-M2
*/
public class LayeredBloomFilter<T extends BloomFilter<T>> implements BloomFilter<LayeredBloomFilter<T>>, BloomFilterExtractor {
/**
* A class used to locate matching filters across all the layers.
*/
private class Finder implements Predicate<BloomFilter> {
int[] result = new int[layerManager.getDepth()];
int bfIdx;
int resultIdx;
BloomFilter<?> bf;
Finder(final BloomFilter<?> bf) {
this.bf = bf;
}
int[] getResult() {
return Arrays.copyOf(result, resultIdx);
}
@Override
public boolean test(final BloomFilter x) {
if (x.contains(bf)) {
result[resultIdx++] = bfIdx;
}
bfIdx++;
return true;
}
}
private final Shape shape;
private final LayerManager<T> layerManager;
/**
* Constructs a new instance.
*
* @param shape the Shape of the enclosed Bloom filters
* @param layerManager the LayerManager to manage the layers.
*/
public LayeredBloomFilter(final Shape shape, final LayerManager<T> layerManager) {
this.shape = shape;
this.layerManager = layerManager;
}
@Override
public int cardinality() {
return SetOperations.cardinality(this);
}
@Override
public int characteristics() {
return 0;
}
/**
* Forces the execution of the cleanup Consumer that was provided when the associated LayerManager was built.
*
* @see LayerManager.Builder#setCleanup(java.util.function.Consumer)
*/
public void cleanup() {
layerManager.cleanup();
}
@Override
public final void clear() {
layerManager.clear();
}
@Override
public boolean contains(final BitMapExtractor bitMapExtractor) {
return contains(createFilter(bitMapExtractor));
}
/**
* Returns {@code true} if this any layer contained by this filter contains the specified filter.
* <p>
* If the {@code other} is a BloomFilterExtractor each filter within the {@code other} is checked to see if it exits within this filter.
* </p>
*
* @param other the other Bloom filter
* @return {@code true} if this filter contains the other filter.
*/
@Override
public boolean contains(final BloomFilter other) {
return other instanceof BloomFilterExtractor ? contains((BloomFilterExtractor) other) : !processBloomFilters(x -> !x.contains(other));
}
/**
* Returns {@code true} if each filter within the {@code bloomFilterExtractor} exits within this filter.
*
* @param bloomFilterExtractor the BloomFilterExtractor that provides the filters to check for.
* @return {@code true} if this filter contains all of the filters contained in the {@code bloomFilterExtractor}.
*/
public boolean contains(final BloomFilterExtractor bloomFilterExtractor) {
final boolean[] result = { true };
// return false when we have found a match to short circuit checks
return bloomFilterExtractor.processBloomFilters(x -> {
result[0] &= contains(x);
return result[0];
});
}
@Override
public boolean contains(final Hasher hasher) {
return contains(createFilter(hasher));
}
@Override
public boolean contains(final IndexExtractor indexExtractor) {
return contains(createFilter(indexExtractor));
}
/**
* Creates a new instance of this {@link LayeredBloomFilter} with the same properties as the current one.
*
* @return a copy of this {@link LayeredBloomFilter}.
*/
@Override
public LayeredBloomFilter<T> copy() {
return new LayeredBloomFilter<>(shape, layerManager.copy());
}
/**
* Creates a Bloom filter from a BitMapExtractor.
*
* @param bitMapExtractor the BitMapExtractor to create the filter from.
* @return the BloomFilter.
*/
private SimpleBloomFilter createFilter(final BitMapExtractor bitMapExtractor) {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
bf.merge(bitMapExtractor);
return bf;
}
/**
* Creates a Bloom filter from a Hasher.
*
* @param hasher the hasher to create the filter from.
* @return the BloomFilter.
*/
private SimpleBloomFilter createFilter(final Hasher hasher) {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
bf.merge(hasher);
return bf;
}
/**
* Creates a Bloom filter from an IndexExtractor.
*
* @param indexExtractor the IndexExtractor to create the filter from.
* @return the BloomFilter.
*/
private SimpleBloomFilter createFilter(final IndexExtractor indexExtractor) {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
bf.merge(indexExtractor);
return bf;
}
@Override
public int estimateN() {
return flatten().estimateN();
}
@Override
public int estimateUnion(final BloomFilter other) {
Objects.requireNonNull(other, "other");
final BloomFilter cpy = this.flatten();
cpy.merge(other);
return cpy.estimateN();
}
/**
* Finds the layers in which the BitMapExtractor is found.
*
* @param bitMapExtractor the BitMapExtractor to search for.
* @return an array of layer indices in which the Bloom filter is found.
*/
public int[] find(final BitMapExtractor bitMapExtractor) {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
bf.merge(bitMapExtractor);
return find(bf);
}
/**
* Finds the layers in which the Bloom filter is found.
*
* @param bf the Bloom filter to search for.
* @return an array of layer indices in which the Bloom filter is found.
*/
public int[] find(final BloomFilter bf) {
final Finder finder = new Finder(bf);
processBloomFilters(finder);
return finder.getResult();
}
/**
* Finds the layers in which the Hasher is found.
*
* @param hasher the Hasher to search for.
* @return an array of layer indices in which the Bloom filter is found.
*/
public int[] find(final Hasher hasher) {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
bf.merge(hasher);
return find(bf);
}
/**
* Finds the layers in which the IndexExtractor is found.
*
* @param indexExtractor the Index extractor to search for.
* @return an array of layer indices in which the Bloom filter is found.
*/
public int[] find(final IndexExtractor indexExtractor) {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
bf.merge(indexExtractor);
return find(bf);
}
/**
* Create a standard (non-layered) Bloom filter by merging all of the layers. If the filter is empty this method will return an empty Bloom filter.
*
* @return the merged bloom filter.
*/
@Override
public SimpleBloomFilter flatten() {
final SimpleBloomFilter bf = new SimpleBloomFilter(shape);
processBloomFilters(bf::merge);
return bf;
}
/**
* Gets the Bloom filter at the specified depth
*
* @param depth the depth of the filter to return.
* @return the Bloom filter at the specified depth.
* @throws NoSuchElementException if depth is not in the range [0,getDepth())
*/
public T get(final int depth) {
return layerManager.get(depth);
}
/**
* Gets the depth of the deepest layer. The minimum value returned by this method is 1.
*
* @return the depth of the deepest layer.
*/
public final int getDepth() {
return layerManager.getDepth();
}
@Override
public final Shape getShape() {
return shape;
}
@Override
public boolean isEmpty() {
return processBloomFilters(BloomFilter::isEmpty);
}
@Override
public boolean merge(final BitMapExtractor bitMapExtractor) {
return layerManager.getTarget().merge(bitMapExtractor);
}
@Override
public boolean merge(final BloomFilter bf) {
return layerManager.getTarget().merge(bf);
}
@Override
public boolean merge(final IndexExtractor indexExtractor) {
return layerManager.getTarget().merge(indexExtractor);
}
/**
* Forces and advance to the next layer. This method will clean-up the current layers and generate a new filter layer. In most cases is it unnecessary to
* call this method directly.
*
* @see LayerManager.Builder#setCleanup(java.util.function.Consumer)
* @see LayerManager.Builder#setExtendCheck(Predicate)
*/
public void next() {
layerManager.next();
}
@Override
public boolean processBitMaps(final LongPredicate predicate) {
return flatten().processBitMaps(predicate);
}
/**
* Processes the Bloom filters in depth order with the most recent filters first. Each filter is passed to the predicate in turn. The function exits on the
* first {@code false} returned by the predicate.
*
* @param bloomFilterPredicate the predicate to execute.
* @return {@code true} if all filters passed the predicate, {@code false} otherwise.
*/
@Override
public final boolean processBloomFilters(final Predicate<BloomFilter> bloomFilterPredicate) {
return layerManager.processBloomFilters(bloomFilterPredicate);
}
@Override
public boolean processIndices(final IntPredicate predicate) {
return processBloomFilters(bf -> bf.processIndices(predicate));
}
}