View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Arrays;
20  import java.util.NoSuchElementException;
21  import java.util.Objects;
22  import java.util.function.IntPredicate;
23  import java.util.function.LongPredicate;
24  import java.util.function.Predicate;
25  
26  /**
27   * Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun
28   * (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd
29   * International Conference on Advanced Computer Theory and Engineering (ICACTE
30   * 2010), vol. 1, pp. V1-586-V1-591, doi:10.1109/ICACTE.2010.5578947, ISBN
31   * 978-1-4244-6539-2, S2CID 3108985
32   * <p>
33   * In short, Layered Bloom filter contains several bloom filters arranged in
34   * layers.
35   * </p>
36   * <ul>
37   * <li>When membership in the filter is checked each layer in turn is checked
38   * and if a match is found {@code true} is returned.</li>
39   * <li>When merging each bloom filter is merged into the newest filter in the
40   * list of layers.</li>
41   * <li>When questions of cardinality are asked the cardinality of the union of
42   * the enclosed Bloom filters is used.</li>
43   * </ul>
44   * <p>
45   * The net result is that the layered Bloom filter can be populated with more
46   * items than the Shape would indicate and yet still return a false positive
47   * rate in line with the Shape and not the over population.
48   * </p>
49   * <p>
50   * This implementation uses a LayerManager to handle the manipulation of the
51   * layers.
52   * </p>
53   * <ul>
54   * <li>Level 0 is the oldest layer and the highest level is the newest.</li>
55   * <li>There is always at least one enclosed filter.</li>
56   * <li>The newest filter is the {@code target} into which merges are performed.
57   * <li>Whenever the target is retrieved, or a {@code merge} operation is
58   * performed the code checks if any older layers should be removed, and if so
59   * removes them. It also checks it a new layer should be added, and if so adds
60   * it and sets the {@code target} before the operation.</li>
61   * </ul>
62   * @since 4.5
63   */
64  public class LayeredBloomFilter implements BloomFilter, BloomFilterProducer {
65      /**
66       * A class used to locate matching filters across all the layers.
67       */
68      private class Finder implements Predicate<BloomFilter> {
69          int[] result = new int[layerManager.getDepth()];
70          int bfIdx;
71          int resultIdx;
72          BloomFilter bf;
73  
74          Finder(BloomFilter bf) {
75              this.bf = bf;
76          }
77  
78          int[] getResult() {
79              return Arrays.copyOf(result, resultIdx);
80          }
81  
82          @Override
83          public boolean test(BloomFilter x) {
84              if (x.contains(bf)) {
85                  result[resultIdx++] = bfIdx;
86              }
87              bfIdx++;
88              return true;
89          }
90      }
91      /**
92       * Creates a fixed size layered bloom filter that adds new filters to the list,
93       * but never merges them. List will never exceed maxDepth. As additional filters
94       * are added earlier filters are removed.
95       *
96       * @param shape    The shape for the enclosed Bloom filters.
97       * @param maxDepth The maximum depth of layers.
98       * @return An empty layered Bloom filter of the specified shape and depth.
99       */
100     public static LayeredBloomFilter fixed(final Shape shape, int maxDepth) {
101         LayerManager manager = LayerManager.builder().setExtendCheck(LayerManager.ExtendCheck.advanceOnPopulated())
102                 .setCleanup(LayerManager.Cleanup.onMaxSize(maxDepth)).setSupplier(() -> new SimpleBloomFilter(shape)).build();
103         return new LayeredBloomFilter(shape, manager);
104     }
105 
106     private final Shape shape;
107 
108     private LayerManager layerManager;
109 
110     /**
111      * Constructor.
112      *
113      * @param shape        the Shape of the enclosed Bloom filters
114      * @param layerManager the LayerManager to manage the layers.
115      */
116     public LayeredBloomFilter(Shape shape, LayerManager layerManager) {
117         this.shape = shape;
118         this.layerManager = layerManager;
119     }
120 
121     @Override
122     public int cardinality() {
123         return SetOperations.cardinality(this);
124     }
125 
126     @Override
127     public int characteristics() {
128         return 0;
129     }
130 
131     @Override
132     public final void clear() {
133         layerManager.clear();
134     }
135 
136     @Override
137     public boolean contains(final BitMapProducer bitMapProducer) {
138         return contains(createFilter(bitMapProducer));
139     }
140 
141     /**
142      * Returns {@code true} if this any layer contained by this filter contains the
143      * specified filter.
144      * <p>
145      * If the {@code other} is a BloomFilterProducer each filter within the
146      * {@code other} is checked to see if it exits within this filter.
147      * </p>
148      *
149      * @param other the other Bloom filter
150      * @return {@code true} if this filter contains the other filter.
151      */
152     @Override
153     public boolean contains(final BloomFilter other) {
154         return other instanceof BloomFilterProducer ? contains((BloomFilterProducer) other)
155                 : !forEachBloomFilter(x -> !x.contains(other));
156     }
157 
158     /**
159      * Returns {@code true} if each filter within the {@code producer} exits within
160      * this filter.
161      *
162      * @param producer the BloomFilterProducer that provides the filters to check
163      *                 for.
164      * @return {@code true} if this filter contains all of the filters contained in
165      *         the {@code producer}.
166      */
167     public boolean contains(final BloomFilterProducer producer) {
168         boolean[] result = { true };
169         // return false when we have found a match to short circuit checks
170         return producer.forEachBloomFilter(x -> {
171             result[0] &= contains(x);
172             return result[0];
173         });
174     }
175 
176     @Override
177     public boolean contains(final Hasher hasher) {
178         return contains(createFilter(hasher));
179     }
180 
181     @Override
182     public boolean contains(IndexProducer indexProducer) {
183         return contains(createFilter(indexProducer));
184     }
185 
186     @Override
187     public LayeredBloomFilter copy() {
188         return new LayeredBloomFilter(shape, layerManager.copy());
189     }
190 
191     /**
192      * Creates a Bloom filter from a BitMapProducer.
193      *
194      * @param bitMapProducer the BitMapProducer to create the filter from.
195      * @return the BloomFilter.
196      */
197     private BloomFilter createFilter(final BitMapProducer bitMapProducer) {
198         SimpleBloomFilter bf = new SimpleBloomFilter(shape);
199         bf.merge(bitMapProducer);
200         return bf;
201     }
202 
203     /**
204      * Creates a Bloom filter from a Hasher.
205      *
206      * @param hasher the hasher to create the filter from.
207      * @return the BloomFilter.
208      */
209     private BloomFilter createFilter(final Hasher hasher) {
210         SimpleBloomFilter bf = new SimpleBloomFilter(shape);
211         bf.merge(hasher);
212         return bf;
213     }
214 
215     /**
216      * Creates a Bloom filter from an IndexProducer.
217      *
218      * @param indexProducer the IndexProducer to create the filter from.
219      * @return the BloomFilter.
220      */
221     private BloomFilter createFilter(final IndexProducer indexProducer) {
222         SimpleBloomFilter bf = new SimpleBloomFilter(shape);
223         bf.merge(indexProducer);
224         return bf;
225     }
226 
227     @Override
228     public int estimateN() {
229         return flatten().estimateN();
230     }
231 
232     @Override
233     public int estimateUnion(final BloomFilter other) {
234         Objects.requireNonNull(other, "other");
235         final BloomFilter cpy = this.flatten();
236         cpy.merge(other);
237         return cpy.estimateN();
238     }
239 
240     /**
241      * Finds the layers in which the BitMapProducer is found.
242      *
243      * @param bitMapProducer the BitMapProducer to search for.
244      * @return an array of layer indices in which the Bloom filter is found.
245      */
246     public int[] find(final BitMapProducer bitMapProducer) {
247         SimpleBloomFilter bf = new SimpleBloomFilter(shape);
248         bf.merge(bitMapProducer);
249         return find(bf);
250     }
251 
252     /**
253      * Finds the layers in which the Bloom filter is found.
254      *
255      * @param bf the Bloom filter to search for.
256      * @return an array of layer indices in which the Bloom filter is found.
257      */
258     public int[] find(BloomFilter bf) {
259         Finder finder = new Finder(bf);
260         forEachBloomFilter(finder);
261         return finder.getResult();
262     }
263 
264     /**
265      * Finds the layers in which the Hasher is found.
266      *
267      * @param hasher the Hasher to search for.
268      * @return an array of layer indices in which the Bloom filter is found.
269      */
270     public int[] find(final Hasher hasher) {
271         SimpleBloomFilter bf = new SimpleBloomFilter(shape);
272         bf.merge(hasher);
273         return find(bf);
274     }
275 
276     /**
277      * Finds the layers in which the IndexProducer is found.
278      *
279      * @param indexProducer the Index producer to search for.
280      * @return an array of layer indices in which the Bloom filter is found.
281      */
282     public int[] find(final IndexProducer indexProducer) {
283         SimpleBloomFilter bf = new SimpleBloomFilter(shape);
284         bf.merge(indexProducer);
285         return find(bf);
286     }
287 
288     /**
289      * Create a standard (non-layered) Bloom filter by merging all of the layers. If
290      * the filter is empty this method will return an empty Bloom filter.
291      *
292      * @return the merged bloom filter.
293      */
294     @Override
295     public BloomFilter flatten() {
296         BloomFilter bf = new SimpleBloomFilter(shape);
297         forEachBloomFilter(bf::merge);
298         return bf;
299     }
300 
301     @Override
302     public boolean forEachBitMap(LongPredicate predicate) {
303         return flatten().forEachBitMap(predicate);
304     }
305 
306     /**
307      * Processes the Bloom filters in depth order with the most recent filters
308      * first. Each filter is passed to the predicate in turn. The function exits on
309      * the first {@code false} returned by the predicate.
310      *
311      * @param bloomFilterPredicate the predicate to execute.
312      * @return {@code true} if all filters passed the predicate, {@code false}
313      *         otherwise.
314      */
315     @Override
316     public final boolean forEachBloomFilter(Predicate<BloomFilter> bloomFilterPredicate) {
317         return layerManager.forEachBloomFilter(bloomFilterPredicate);
318     }
319 
320     @Override
321     public boolean forEachIndex(IntPredicate predicate) {
322         return forEachBloomFilter(bf -> bf.forEachIndex(predicate));
323     }
324 
325     /**
326      * Gets the Bloom filter at the specified depth
327      *
328      * @param depth the depth of the filter to return.
329      * @return the Bloom filter at the specified depth.
330      * @throws NoSuchElementException if depth is not in the range [0,getDepth())
331      */
332     public BloomFilter get(int depth) {
333         return layerManager.get(depth);
334     }
335 
336     /**
337      * Gets the depth of the deepest layer. The minimum value returned by this
338      * method is 1.
339      *
340      * @return the depth of the deepest layer.
341      */
342     public final int getDepth() {
343         return layerManager.getDepth();
344     }
345 
346     @Override
347     public final Shape getShape() {
348         return shape;
349     }
350 
351     @Override
352     public boolean isEmpty() {
353         return forEachBloomFilter(BloomFilter::isEmpty);
354     }
355 
356     @Override
357     public boolean merge(BitMapProducer bitMapProducer) {
358         return layerManager.getTarget().merge(bitMapProducer);
359     }
360 
361     @Override
362     public boolean merge(BloomFilter bf) {
363         return layerManager.getTarget().merge(bf);
364     }
365 
366     @Override
367     public boolean merge(IndexProducer indexProducer) {
368         return layerManager.getTarget().merge(indexProducer);
369     }
370 
371     /**
372      * Forces and advance to the next layer. Executes the same logic as when
373      * LayerManager.extendCheck returns {@code true}
374      *
375      * @see LayerManager
376      */
377     public void next() {
378         layerManager.next();
379     }
380 }