1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.collections4.bloomfilter;
18
19 import java.util.Arrays;
20 import java.util.NoSuchElementException;
21 import java.util.Objects;
22 import java.util.function.IntPredicate;
23 import java.util.function.LongPredicate;
24 import java.util.function.Predicate;
25
26 /**
27 * Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun
28 * (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd
29 * International Conference on Advanced Computer Theory and Engineering (ICACTE
30 * 2010), vol. 1, pp. V1-586-V1-591, doi:10.1109/ICACTE.2010.5578947, ISBN
31 * 978-1-4244-6539-2, S2CID 3108985
32 * <p>
33 * In short, Layered Bloom filter contains several bloom filters arranged in
34 * layers.
35 * </p>
36 * <ul>
37 * <li>When membership in the filter is checked each layer in turn is checked
38 * and if a match is found {@code true} is returned.</li>
39 * <li>When merging each bloom filter is merged into the newest filter in the
40 * list of layers.</li>
41 * <li>When questions of cardinality are asked the cardinality of the union of
42 * the enclosed Bloom filters is used.</li>
43 * </ul>
44 * <p>
45 * The net result is that the layered Bloom filter can be populated with more
46 * items than the Shape would indicate and yet still return a false positive
47 * rate in line with the Shape and not the over population.
48 * </p>
49 * <p>
50 * This implementation uses a LayerManager to handle the manipulation of the
51 * layers.
52 * </p>
53 * <ul>
54 * <li>Level 0 is the oldest layer and the highest level is the newest.</li>
55 * <li>There is always at least one enclosed filter.</li>
56 * <li>The newest filter is the {@code target} into which merges are performed.
57 * <li>Whenever the target is retrieved, or a {@code merge} operation is
58 * performed the code checks if any older layers should be removed, and if so
59 * removes them. It also checks it a new layer should be added, and if so adds
60 * it and sets the {@code target} before the operation.</li>
61 * </ul>
62 * @since 4.5
63 */
64 public class LayeredBloomFilter implements BloomFilter, BloomFilterProducer {
65 /**
66 * A class used to locate matching filters across all the layers.
67 */
68 private class Finder implements Predicate<BloomFilter> {
69 int[] result = new int[layerManager.getDepth()];
70 int bfIdx;
71 int resultIdx;
72 BloomFilter bf;
73
74 Finder(BloomFilter bf) {
75 this.bf = bf;
76 }
77
78 int[] getResult() {
79 return Arrays.copyOf(result, resultIdx);
80 }
81
82 @Override
83 public boolean test(BloomFilter x) {
84 if (x.contains(bf)) {
85 result[resultIdx++] = bfIdx;
86 }
87 bfIdx++;
88 return true;
89 }
90 }
91 /**
92 * Creates a fixed size layered bloom filter that adds new filters to the list,
93 * but never merges them. List will never exceed maxDepth. As additional filters
94 * are added earlier filters are removed.
95 *
96 * @param shape The shape for the enclosed Bloom filters.
97 * @param maxDepth The maximum depth of layers.
98 * @return An empty layered Bloom filter of the specified shape and depth.
99 */
100 public static LayeredBloomFilter fixed(final Shape shape, int maxDepth) {
101 LayerManager manager = LayerManager.builder().setExtendCheck(LayerManager.ExtendCheck.advanceOnPopulated())
102 .setCleanup(LayerManager.Cleanup.onMaxSize(maxDepth)).setSupplier(() -> new SimpleBloomFilter(shape)).build();
103 return new LayeredBloomFilter(shape, manager);
104 }
105
106 private final Shape shape;
107
108 private LayerManager layerManager;
109
110 /**
111 * Constructor.
112 *
113 * @param shape the Shape of the enclosed Bloom filters
114 * @param layerManager the LayerManager to manage the layers.
115 */
116 public LayeredBloomFilter(Shape shape, LayerManager layerManager) {
117 this.shape = shape;
118 this.layerManager = layerManager;
119 }
120
121 @Override
122 public int cardinality() {
123 return SetOperations.cardinality(this);
124 }
125
126 @Override
127 public int characteristics() {
128 return 0;
129 }
130
131 @Override
132 public final void clear() {
133 layerManager.clear();
134 }
135
136 @Override
137 public boolean contains(final BitMapProducer bitMapProducer) {
138 return contains(createFilter(bitMapProducer));
139 }
140
141 /**
142 * Returns {@code true} if this any layer contained by this filter contains the
143 * specified filter.
144 * <p>
145 * If the {@code other} is a BloomFilterProducer each filter within the
146 * {@code other} is checked to see if it exits within this filter.
147 * </p>
148 *
149 * @param other the other Bloom filter
150 * @return {@code true} if this filter contains the other filter.
151 */
152 @Override
153 public boolean contains(final BloomFilter other) {
154 return other instanceof BloomFilterProducer ? contains((BloomFilterProducer) other)
155 : !forEachBloomFilter(x -> !x.contains(other));
156 }
157
158 /**
159 * Returns {@code true} if each filter within the {@code producer} exits within
160 * this filter.
161 *
162 * @param producer the BloomFilterProducer that provides the filters to check
163 * for.
164 * @return {@code true} if this filter contains all of the filters contained in
165 * the {@code producer}.
166 */
167 public boolean contains(final BloomFilterProducer producer) {
168 boolean[] result = { true };
169 // return false when we have found a match to short circuit checks
170 return producer.forEachBloomFilter(x -> {
171 result[0] &= contains(x);
172 return result[0];
173 });
174 }
175
176 @Override
177 public boolean contains(final Hasher hasher) {
178 return contains(createFilter(hasher));
179 }
180
181 @Override
182 public boolean contains(IndexProducer indexProducer) {
183 return contains(createFilter(indexProducer));
184 }
185
186 @Override
187 public LayeredBloomFilter copy() {
188 return new LayeredBloomFilter(shape, layerManager.copy());
189 }
190
191 /**
192 * Creates a Bloom filter from a BitMapProducer.
193 *
194 * @param bitMapProducer the BitMapProducer to create the filter from.
195 * @return the BloomFilter.
196 */
197 private BloomFilter createFilter(final BitMapProducer bitMapProducer) {
198 SimpleBloomFilter bf = new SimpleBloomFilter(shape);
199 bf.merge(bitMapProducer);
200 return bf;
201 }
202
203 /**
204 * Creates a Bloom filter from a Hasher.
205 *
206 * @param hasher the hasher to create the filter from.
207 * @return the BloomFilter.
208 */
209 private BloomFilter createFilter(final Hasher hasher) {
210 SimpleBloomFilter bf = new SimpleBloomFilter(shape);
211 bf.merge(hasher);
212 return bf;
213 }
214
215 /**
216 * Creates a Bloom filter from an IndexProducer.
217 *
218 * @param indexProducer the IndexProducer to create the filter from.
219 * @return the BloomFilter.
220 */
221 private BloomFilter createFilter(final IndexProducer indexProducer) {
222 SimpleBloomFilter bf = new SimpleBloomFilter(shape);
223 bf.merge(indexProducer);
224 return bf;
225 }
226
227 @Override
228 public int estimateN() {
229 return flatten().estimateN();
230 }
231
232 @Override
233 public int estimateUnion(final BloomFilter other) {
234 Objects.requireNonNull(other, "other");
235 final BloomFilter cpy = this.flatten();
236 cpy.merge(other);
237 return cpy.estimateN();
238 }
239
240 /**
241 * Finds the layers in which the BitMapProducer is found.
242 *
243 * @param bitMapProducer the BitMapProducer to search for.
244 * @return an array of layer indices in which the Bloom filter is found.
245 */
246 public int[] find(final BitMapProducer bitMapProducer) {
247 SimpleBloomFilter bf = new SimpleBloomFilter(shape);
248 bf.merge(bitMapProducer);
249 return find(bf);
250 }
251
252 /**
253 * Finds the layers in which the Bloom filter is found.
254 *
255 * @param bf the Bloom filter to search for.
256 * @return an array of layer indices in which the Bloom filter is found.
257 */
258 public int[] find(BloomFilter bf) {
259 Finder finder = new Finder(bf);
260 forEachBloomFilter(finder);
261 return finder.getResult();
262 }
263
264 /**
265 * Finds the layers in which the Hasher is found.
266 *
267 * @param hasher the Hasher to search for.
268 * @return an array of layer indices in which the Bloom filter is found.
269 */
270 public int[] find(final Hasher hasher) {
271 SimpleBloomFilter bf = new SimpleBloomFilter(shape);
272 bf.merge(hasher);
273 return find(bf);
274 }
275
276 /**
277 * Finds the layers in which the IndexProducer is found.
278 *
279 * @param indexProducer the Index producer to search for.
280 * @return an array of layer indices in which the Bloom filter is found.
281 */
282 public int[] find(final IndexProducer indexProducer) {
283 SimpleBloomFilter bf = new SimpleBloomFilter(shape);
284 bf.merge(indexProducer);
285 return find(bf);
286 }
287
288 /**
289 * Create a standard (non-layered) Bloom filter by merging all of the layers. If
290 * the filter is empty this method will return an empty Bloom filter.
291 *
292 * @return the merged bloom filter.
293 */
294 @Override
295 public BloomFilter flatten() {
296 BloomFilter bf = new SimpleBloomFilter(shape);
297 forEachBloomFilter(bf::merge);
298 return bf;
299 }
300
301 @Override
302 public boolean forEachBitMap(LongPredicate predicate) {
303 return flatten().forEachBitMap(predicate);
304 }
305
306 /**
307 * Processes the Bloom filters in depth order with the most recent filters
308 * first. Each filter is passed to the predicate in turn. The function exits on
309 * the first {@code false} returned by the predicate.
310 *
311 * @param bloomFilterPredicate the predicate to execute.
312 * @return {@code true} if all filters passed the predicate, {@code false}
313 * otherwise.
314 */
315 @Override
316 public final boolean forEachBloomFilter(Predicate<BloomFilter> bloomFilterPredicate) {
317 return layerManager.forEachBloomFilter(bloomFilterPredicate);
318 }
319
320 @Override
321 public boolean forEachIndex(IntPredicate predicate) {
322 return forEachBloomFilter(bf -> bf.forEachIndex(predicate));
323 }
324
325 /**
326 * Gets the Bloom filter at the specified depth
327 *
328 * @param depth the depth of the filter to return.
329 * @return the Bloom filter at the specified depth.
330 * @throws NoSuchElementException if depth is not in the range [0,getDepth())
331 */
332 public BloomFilter get(int depth) {
333 return layerManager.get(depth);
334 }
335
336 /**
337 * Gets the depth of the deepest layer. The minimum value returned by this
338 * method is 1.
339 *
340 * @return the depth of the deepest layer.
341 */
342 public final int getDepth() {
343 return layerManager.getDepth();
344 }
345
346 @Override
347 public final Shape getShape() {
348 return shape;
349 }
350
351 @Override
352 public boolean isEmpty() {
353 return forEachBloomFilter(BloomFilter::isEmpty);
354 }
355
356 @Override
357 public boolean merge(BitMapProducer bitMapProducer) {
358 return layerManager.getTarget().merge(bitMapProducer);
359 }
360
361 @Override
362 public boolean merge(BloomFilter bf) {
363 return layerManager.getTarget().merge(bf);
364 }
365
366 @Override
367 public boolean merge(IndexProducer indexProducer) {
368 return layerManager.getTarget().merge(indexProducer);
369 }
370
371 /**
372 * Forces and advance to the next layer. Executes the same logic as when
373 * LayerManager.extendCheck returns {@code true}
374 *
375 * @see LayerManager
376 */
377 public void next() {
378 layerManager.next();
379 }
380 }