001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019import java.util.Arrays; 020import java.util.NoSuchElementException; 021import java.util.Objects; 022import java.util.function.IntPredicate; 023import java.util.function.LongPredicate; 024import java.util.function.Predicate; 025 026/** 027 * Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd 028 * International Conference on Advanced Computer Theory and Engineering (ICACTE 2010), vol. 1, pp. V1-586-V1-591, doi:10.1109/ICACTE.2010.5578947, ISBN 029 * 978-1-4244-6539-2, S2CID 3108985 030 * <p> 031 * In short, Layered Bloom filter contains several bloom filters arranged in layers. 032 * </p> 033 * <ul> 034 * <li>When membership in the filter is checked each layer in turn is checked and if a match is found {@code true} is returned.</li> 035 * <li>When merging each bloom filter is merged into the newest filter in the list of layers.</li> 036 * <li>When questions of cardinality are asked the cardinality of the union of the enclosed Bloom filters is used.</li> 037 * </ul> 038 * <p> 039 * The net result is that the layered Bloom filter can be populated with more items than the Shape would indicate and yet still return a false positive rate in 040 * line with the Shape and not the over population. 041 * </p> 042 * <p> 043 * This implementation uses a LayerManager to handle the manipulation of the layers. 044 * </p> 045 * <ul> 046 * <li>Level 0 is the oldest layer and the highest level is the newest.</li> 047 * <li>There is always at least one enclosed filter.</li> 048 * <li>The newest filter is the {@code target} into which merges are performed. 049 * <li>Whenever the target is retrieved, or a {@code merge} operation is performed the code checks if any older layers should be removed, and if so removes 050 * them. It also checks it a new layer should be added, and if so adds it and sets the {@code target} before the operation.</li> 051 * </ul> 052 * 053 * @param <T> The type of Bloom Filter that is used for the layers. 054 * @since 4.5.0-M2 055 */ 056public class LayeredBloomFilter<T extends BloomFilter<T>> implements BloomFilter<LayeredBloomFilter<T>>, BloomFilterExtractor { 057 058 /** 059 * A class used to locate matching filters across all the layers. 060 */ 061 private class Finder implements Predicate<BloomFilter> { 062 int[] result = new int[layerManager.getDepth()]; 063 int bfIdx; 064 int resultIdx; 065 BloomFilter<?> bf; 066 067 Finder(final BloomFilter<?> bf) { 068 this.bf = bf; 069 } 070 071 int[] getResult() { 072 return Arrays.copyOf(result, resultIdx); 073 } 074 075 @Override 076 public boolean test(final BloomFilter x) { 077 if (x.contains(bf)) { 078 result[resultIdx++] = bfIdx; 079 } 080 bfIdx++; 081 return true; 082 } 083 } 084 085 private final Shape shape; 086 087 private final LayerManager<T> layerManager; 088 089 /** 090 * Constructs a new instance. 091 * 092 * @param shape the Shape of the enclosed Bloom filters 093 * @param layerManager the LayerManager to manage the layers. 094 */ 095 public LayeredBloomFilter(final Shape shape, final LayerManager<T> layerManager) { 096 this.shape = shape; 097 this.layerManager = layerManager; 098 } 099 100 @Override 101 public int cardinality() { 102 return SetOperations.cardinality(this); 103 } 104 105 @Override 106 public int characteristics() { 107 return 0; 108 } 109 110 /** 111 * Forces the execution of the cleanup Consumer that was provided when the associated LayerManager was built. 112 * 113 * @see LayerManager.Builder#setCleanup(java.util.function.Consumer) 114 */ 115 public void cleanup() { 116 layerManager.cleanup(); 117 } 118 119 @Override 120 public final void clear() { 121 layerManager.clear(); 122 } 123 124 @Override 125 public boolean contains(final BitMapExtractor bitMapExtractor) { 126 return contains(createFilter(bitMapExtractor)); 127 } 128 129 /** 130 * Returns {@code true} if this any layer contained by this filter contains the specified filter. 131 * <p> 132 * If the {@code other} is a BloomFilterExtractor each filter within the {@code other} is checked to see if it exits within this filter. 133 * </p> 134 * 135 * @param other the other Bloom filter 136 * @return {@code true} if this filter contains the other filter. 137 */ 138 @Override 139 public boolean contains(final BloomFilter other) { 140 return other instanceof BloomFilterExtractor ? contains((BloomFilterExtractor) other) : !processBloomFilters(x -> !x.contains(other)); 141 } 142 143 /** 144 * Returns {@code true} if each filter within the {@code bloomFilterExtractor} exits within this filter. 145 * 146 * @param bloomFilterExtractor the BloomFilterExtractor that provides the filters to check for. 147 * @return {@code true} if this filter contains all of the filters contained in the {@code bloomFilterExtractor}. 148 */ 149 public boolean contains(final BloomFilterExtractor bloomFilterExtractor) { 150 final boolean[] result = { true }; 151 // return false when we have found a match to short circuit checks 152 return bloomFilterExtractor.processBloomFilters(x -> { 153 result[0] &= contains(x); 154 return result[0]; 155 }); 156 } 157 158 @Override 159 public boolean contains(final Hasher hasher) { 160 return contains(createFilter(hasher)); 161 } 162 163 @Override 164 public boolean contains(final IndexExtractor indexExtractor) { 165 return contains(createFilter(indexExtractor)); 166 } 167 168 /** 169 * Creates a new instance of this {@link LayeredBloomFilter} with the same properties as the current one. 170 * 171 * @return a copy of this {@link LayeredBloomFilter}. 172 */ 173 @Override 174 public LayeredBloomFilter<T> copy() { 175 return new LayeredBloomFilter<>(shape, layerManager.copy()); 176 } 177 178 /** 179 * Creates a Bloom filter from a BitMapExtractor. 180 * 181 * @param bitMapExtractor the BitMapExtractor to create the filter from. 182 * @return the BloomFilter. 183 */ 184 private SimpleBloomFilter createFilter(final BitMapExtractor bitMapExtractor) { 185 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 186 bf.merge(bitMapExtractor); 187 return bf; 188 } 189 190 /** 191 * Creates a Bloom filter from a Hasher. 192 * 193 * @param hasher the hasher to create the filter from. 194 * @return the BloomFilter. 195 */ 196 private SimpleBloomFilter createFilter(final Hasher hasher) { 197 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 198 bf.merge(hasher); 199 return bf; 200 } 201 202 /** 203 * Creates a Bloom filter from an IndexExtractor. 204 * 205 * @param indexExtractor the IndexExtractor to create the filter from. 206 * @return the BloomFilter. 207 */ 208 private SimpleBloomFilter createFilter(final IndexExtractor indexExtractor) { 209 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 210 bf.merge(indexExtractor); 211 return bf; 212 } 213 214 @Override 215 public int estimateN() { 216 return flatten().estimateN(); 217 } 218 219 @Override 220 public int estimateUnion(final BloomFilter other) { 221 Objects.requireNonNull(other, "other"); 222 final BloomFilter cpy = this.flatten(); 223 cpy.merge(other); 224 return cpy.estimateN(); 225 } 226 227 /** 228 * Finds the layers in which the BitMapExtractor is found. 229 * 230 * @param bitMapExtractor the BitMapExtractor to search for. 231 * @return an array of layer indices in which the Bloom filter is found. 232 */ 233 public int[] find(final BitMapExtractor bitMapExtractor) { 234 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 235 bf.merge(bitMapExtractor); 236 return find(bf); 237 } 238 239 /** 240 * Finds the layers in which the Bloom filter is found. 241 * 242 * @param bf the Bloom filter to search for. 243 * @return an array of layer indices in which the Bloom filter is found. 244 */ 245 public int[] find(final BloomFilter bf) { 246 final Finder finder = new Finder(bf); 247 processBloomFilters(finder); 248 return finder.getResult(); 249 } 250 251 /** 252 * Finds the layers in which the Hasher is found. 253 * 254 * @param hasher the Hasher to search for. 255 * @return an array of layer indices in which the Bloom filter is found. 256 */ 257 public int[] find(final Hasher hasher) { 258 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 259 bf.merge(hasher); 260 return find(bf); 261 } 262 263 /** 264 * Finds the layers in which the IndexExtractor is found. 265 * 266 * @param indexExtractor the Index extractor to search for. 267 * @return an array of layer indices in which the Bloom filter is found. 268 */ 269 public int[] find(final IndexExtractor indexExtractor) { 270 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 271 bf.merge(indexExtractor); 272 return find(bf); 273 } 274 275 /** 276 * Create a standard (non-layered) Bloom filter by merging all of the layers. If the filter is empty this method will return an empty Bloom filter. 277 * 278 * @return the merged bloom filter. 279 */ 280 @Override 281 public SimpleBloomFilter flatten() { 282 final SimpleBloomFilter bf = new SimpleBloomFilter(shape); 283 processBloomFilters(bf::merge); 284 return bf; 285 } 286 287 /** 288 * Gets the Bloom filter at the specified depth 289 * 290 * @param depth the depth of the filter to return. 291 * @return the Bloom filter at the specified depth. 292 * @throws NoSuchElementException if depth is not in the range [0,getDepth()) 293 */ 294 public T get(final int depth) { 295 return layerManager.get(depth); 296 } 297 298 /** 299 * Gets the depth of the deepest layer. The minimum value returned by this method is 1. 300 * 301 * @return the depth of the deepest layer. 302 */ 303 public final int getDepth() { 304 return layerManager.getDepth(); 305 } 306 307 @Override 308 public final Shape getShape() { 309 return shape; 310 } 311 312 @Override 313 public boolean isEmpty() { 314 return processBloomFilters(BloomFilter::isEmpty); 315 } 316 317 @Override 318 public boolean merge(final BitMapExtractor bitMapExtractor) { 319 return layerManager.getTarget().merge(bitMapExtractor); 320 } 321 322 @Override 323 public boolean merge(final BloomFilter bf) { 324 return layerManager.getTarget().merge(bf); 325 } 326 327 @Override 328 public boolean merge(final IndexExtractor indexExtractor) { 329 return layerManager.getTarget().merge(indexExtractor); 330 } 331 332 /** 333 * Forces and advance to the next layer. This method will clean-up the current layers and generate a new filter layer. In most cases is it unnecessary to 334 * call this method directly. 335 * 336 * @see LayerManager.Builder#setCleanup(java.util.function.Consumer) 337 * @see LayerManager.Builder#setExtendCheck(Predicate) 338 */ 339 public void next() { 340 layerManager.next(); 341 } 342 343 @Override 344 public boolean processBitMaps(final LongPredicate predicate) { 345 return flatten().processBitMaps(predicate); 346 } 347 348 /** 349 * Processes the Bloom filters in depth order with the most recent filters first. Each filter is passed to the predicate in turn. The function exits on the 350 * first {@code false} returned by the predicate. 351 * 352 * @param bloomFilterPredicate the predicate to execute. 353 * @return {@code true} if all filters passed the predicate, {@code false} otherwise. 354 */ 355 @Override 356 public final boolean processBloomFilters(final Predicate<BloomFilter> bloomFilterPredicate) { 357 return layerManager.processBloomFilters(bloomFilterPredicate); 358 } 359 360 @Override 361 public boolean processIndices(final IntPredicate predicate) { 362 return processBloomFilters(bf -> bf.processIndices(predicate)); 363 } 364 365}