001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019import java.util.Arrays; 020import java.util.NoSuchElementException; 021import java.util.Objects; 022import java.util.function.IntPredicate; 023import java.util.function.LongPredicate; 024import java.util.function.Predicate; 025 026/** 027 * Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun 028 * (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd 029 * International Conference on Advanced Computer Theory and Engineering (ICACTE 030 * 2010), vol. 1, pp. V1-586-V1-591, doi:10.1109/ICACTE.2010.5578947, ISBN 031 * 978-1-4244-6539-2, S2CID 3108985 032 * <p> 033 * In short, Layered Bloom filter contains several bloom filters arranged in 034 * layers. 035 * </p> 036 * <ul> 037 * <li>When membership in the filter is checked each layer in turn is checked 038 * and if a match is found {@code true} is returned.</li> 039 * <li>When merging each bloom filter is merged into the newest filter in the 040 * list of layers.</li> 041 * <li>When questions of cardinality are asked the cardinality of the union of 042 * the enclosed Bloom filters is used.</li> 043 * </ul> 044 * <p> 045 * The net result is that the layered Bloom filter can be populated with more 046 * items than the Shape would indicate and yet still return a false positive 047 * rate in line with the Shape and not the over population. 048 * </p> 049 * <p> 050 * This implementation uses a LayerManager to handle the manipulation of the 051 * layers. 052 * </p> 053 * <ul> 054 * <li>Level 0 is the oldest layer and the highest level is the newest.</li> 055 * <li>There is always at least one enclosed filter.</li> 056 * <li>The newest filter is the {@code target} into which merges are performed. 057 * <li>Whenever the target is retrieved, or a {@code merge} operation is 058 * performed the code checks if any older layers should be removed, and if so 059 * removes them. It also checks it a new layer should be added, and if so adds 060 * it and sets the {@code target} before the operation.</li> 061 * </ul> 062 * @since 4.5 063 */ 064public class LayeredBloomFilter implements BloomFilter, BloomFilterProducer { 065 /** 066 * A class used to locate matching filters across all the layers. 067 */ 068 private class Finder implements Predicate<BloomFilter> { 069 int[] result = new int[layerManager.getDepth()]; 070 int bfIdx; 071 int resultIdx; 072 BloomFilter bf; 073 074 Finder(BloomFilter bf) { 075 this.bf = bf; 076 } 077 078 int[] getResult() { 079 return Arrays.copyOf(result, resultIdx); 080 } 081 082 @Override 083 public boolean test(BloomFilter x) { 084 if (x.contains(bf)) { 085 result[resultIdx++] = bfIdx; 086 } 087 bfIdx++; 088 return true; 089 } 090 } 091 /** 092 * Creates a fixed size layered bloom filter that adds new filters to the list, 093 * but never merges them. List will never exceed maxDepth. As additional filters 094 * are added earlier filters are removed. 095 * 096 * @param shape The shape for the enclosed Bloom filters. 097 * @param maxDepth The maximum depth of layers. 098 * @return An empty layered Bloom filter of the specified shape and depth. 099 */ 100 public static LayeredBloomFilter fixed(final Shape shape, int maxDepth) { 101 LayerManager manager = LayerManager.builder().setExtendCheck(LayerManager.ExtendCheck.advanceOnPopulated()) 102 .setCleanup(LayerManager.Cleanup.onMaxSize(maxDepth)).setSupplier(() -> new SimpleBloomFilter(shape)).build(); 103 return new LayeredBloomFilter(shape, manager); 104 } 105 106 private final Shape shape; 107 108 private LayerManager layerManager; 109 110 /** 111 * Constructor. 112 * 113 * @param shape the Shape of the enclosed Bloom filters 114 * @param layerManager the LayerManager to manage the layers. 115 */ 116 public LayeredBloomFilter(Shape shape, LayerManager layerManager) { 117 this.shape = shape; 118 this.layerManager = layerManager; 119 } 120 121 @Override 122 public int cardinality() { 123 return SetOperations.cardinality(this); 124 } 125 126 @Override 127 public int characteristics() { 128 return 0; 129 } 130 131 @Override 132 public final void clear() { 133 layerManager.clear(); 134 } 135 136 @Override 137 public boolean contains(final BitMapProducer bitMapProducer) { 138 return contains(createFilter(bitMapProducer)); 139 } 140 141 /** 142 * Returns {@code true} if this any layer contained by this filter contains the 143 * specified filter. 144 * <p> 145 * If the {@code other} is a BloomFilterProducer each filter within the 146 * {@code other} is checked to see if it exits within this filter. 147 * </p> 148 * 149 * @param other the other Bloom filter 150 * @return {@code true} if this filter contains the other filter. 151 */ 152 @Override 153 public boolean contains(final BloomFilter other) { 154 return other instanceof BloomFilterProducer ? contains((BloomFilterProducer) other) 155 : !forEachBloomFilter(x -> !x.contains(other)); 156 } 157 158 /** 159 * Returns {@code true} if each filter within the {@code producer} exits within 160 * this filter. 161 * 162 * @param producer the BloomFilterProducer that provides the filters to check 163 * for. 164 * @return {@code true} if this filter contains all of the filters contained in 165 * the {@code producer}. 166 */ 167 public boolean contains(final BloomFilterProducer producer) { 168 boolean[] result = { true }; 169 // return false when we have found a match to short circuit checks 170 return producer.forEachBloomFilter(x -> { 171 result[0] &= contains(x); 172 return result[0]; 173 }); 174 } 175 176 @Override 177 public boolean contains(final Hasher hasher) { 178 return contains(createFilter(hasher)); 179 } 180 181 @Override 182 public boolean contains(IndexProducer indexProducer) { 183 return contains(createFilter(indexProducer)); 184 } 185 186 @Override 187 public LayeredBloomFilter copy() { 188 return new LayeredBloomFilter(shape, layerManager.copy()); 189 } 190 191 /** 192 * Creates a Bloom filter from a BitMapProducer. 193 * 194 * @param bitMapProducer the BitMapProducer to create the filter from. 195 * @return the BloomFilter. 196 */ 197 private BloomFilter createFilter(final BitMapProducer bitMapProducer) { 198 SimpleBloomFilter bf = new SimpleBloomFilter(shape); 199 bf.merge(bitMapProducer); 200 return bf; 201 } 202 203 /** 204 * Creates a Bloom filter from a Hasher. 205 * 206 * @param hasher the hasher to create the filter from. 207 * @return the BloomFilter. 208 */ 209 private BloomFilter createFilter(final Hasher hasher) { 210 SimpleBloomFilter bf = new SimpleBloomFilter(shape); 211 bf.merge(hasher); 212 return bf; 213 } 214 215 /** 216 * Creates a Bloom filter from an IndexProducer. 217 * 218 * @param indexProducer the IndexProducer to create the filter from. 219 * @return the BloomFilter. 220 */ 221 private BloomFilter createFilter(final IndexProducer indexProducer) { 222 SimpleBloomFilter bf = new SimpleBloomFilter(shape); 223 bf.merge(indexProducer); 224 return bf; 225 } 226 227 @Override 228 public int estimateN() { 229 return flatten().estimateN(); 230 } 231 232 @Override 233 public int estimateUnion(final BloomFilter other) { 234 Objects.requireNonNull(other, "other"); 235 final BloomFilter cpy = this.flatten(); 236 cpy.merge(other); 237 return cpy.estimateN(); 238 } 239 240 /** 241 * Finds the layers in which the BitMapProducer is found. 242 * 243 * @param bitMapProducer the BitMapProducer to search for. 244 * @return an array of layer indices in which the Bloom filter is found. 245 */ 246 public int[] find(final BitMapProducer bitMapProducer) { 247 SimpleBloomFilter bf = new SimpleBloomFilter(shape); 248 bf.merge(bitMapProducer); 249 return find(bf); 250 } 251 252 /** 253 * Finds the layers in which the Bloom filter is found. 254 * 255 * @param bf the Bloom filter to search for. 256 * @return an array of layer indices in which the Bloom filter is found. 257 */ 258 public int[] find(BloomFilter bf) { 259 Finder finder = new Finder(bf); 260 forEachBloomFilter(finder); 261 return finder.getResult(); 262 } 263 264 /** 265 * Finds the layers in which the Hasher is found. 266 * 267 * @param hasher the Hasher to search for. 268 * @return an array of layer indices in which the Bloom filter is found. 269 */ 270 public int[] find(final Hasher hasher) { 271 SimpleBloomFilter bf = new SimpleBloomFilter(shape); 272 bf.merge(hasher); 273 return find(bf); 274 } 275 276 /** 277 * Finds the layers in which the IndexProducer is found. 278 * 279 * @param indexProducer the Index producer to search for. 280 * @return an array of layer indices in which the Bloom filter is found. 281 */ 282 public int[] find(final IndexProducer indexProducer) { 283 SimpleBloomFilter bf = new SimpleBloomFilter(shape); 284 bf.merge(indexProducer); 285 return find(bf); 286 } 287 288 /** 289 * Create a standard (non-layered) Bloom filter by merging all of the layers. If 290 * the filter is empty this method will return an empty Bloom filter. 291 * 292 * @return the merged bloom filter. 293 */ 294 @Override 295 public BloomFilter flatten() { 296 BloomFilter bf = new SimpleBloomFilter(shape); 297 forEachBloomFilter(bf::merge); 298 return bf; 299 } 300 301 @Override 302 public boolean forEachBitMap(LongPredicate predicate) { 303 return flatten().forEachBitMap(predicate); 304 } 305 306 /** 307 * Processes the Bloom filters in depth order with the most recent filters 308 * first. Each filter is passed to the predicate in turn. The function exits on 309 * the first {@code false} returned by the predicate. 310 * 311 * @param bloomFilterPredicate the predicate to execute. 312 * @return {@code true} if all filters passed the predicate, {@code false} 313 * otherwise. 314 */ 315 @Override 316 public final boolean forEachBloomFilter(Predicate<BloomFilter> bloomFilterPredicate) { 317 return layerManager.forEachBloomFilter(bloomFilterPredicate); 318 } 319 320 @Override 321 public boolean forEachIndex(IntPredicate predicate) { 322 return forEachBloomFilter(bf -> bf.forEachIndex(predicate)); 323 } 324 325 /** 326 * Gets the Bloom filter at the specified depth 327 * 328 * @param depth the depth of the filter to return. 329 * @return the Bloom filter at the specified depth. 330 * @throws NoSuchElementException if depth is not in the range [0,getDepth()) 331 */ 332 public BloomFilter get(int depth) { 333 return layerManager.get(depth); 334 } 335 336 /** 337 * Gets the depth of the deepest layer. The minimum value returned by this 338 * method is 1. 339 * 340 * @return the depth of the deepest layer. 341 */ 342 public final int getDepth() { 343 return layerManager.getDepth(); 344 } 345 346 @Override 347 public final Shape getShape() { 348 return shape; 349 } 350 351 @Override 352 public boolean isEmpty() { 353 return forEachBloomFilter(BloomFilter::isEmpty); 354 } 355 356 @Override 357 public boolean merge(BitMapProducer bitMapProducer) { 358 return layerManager.getTarget().merge(bitMapProducer); 359 } 360 361 @Override 362 public boolean merge(BloomFilter bf) { 363 return layerManager.getTarget().merge(bf); 364 } 365 366 @Override 367 public boolean merge(IndexProducer indexProducer) { 368 return layerManager.getTarget().merge(indexProducer); 369 } 370 371 /** 372 * Forces and advance to the next layer. Executes the same logic as when 373 * LayerManager.extendCheck returns {@code true} 374 * 375 * @see LayerManager 376 */ 377 public void next() { 378 layerManager.next(); 379 } 380}