View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Objects;
20  
21  /**
22   * The interface that describes a Bloom filter.
23   * <p>
24   * <em>See implementation notes for {@link BitMapExtractor} and {@link IndexExtractor}.</em>
25   * </p>
26   *
27   * @param <T> The BloomFilter type.
28   * @see BitMapExtractor
29   * @see IndexExtractor
30   * @since 4.5.0-M1
31   */
32  public interface BloomFilter<T extends BloomFilter<T>> extends IndexExtractor, BitMapExtractor {
33  
34      /**
35       * The sparse characteristic used to determine the best method for matching: {@value}.
36       * <p>
37       * For `sparse` implementations the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations the
38       * {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier.
39       * </p>
40       */
41      int SPARSE = 0x1;
42  
43      /**
44       * Gets the cardinality (number of enabled bits) of this Bloom filter.
45       *
46       * <p>This is also known as the Hamming value or Hamming number.</p>
47       *
48       * @return the cardinality of this filter
49       */
50      int cardinality();
51  
52      // Query Operations
53  
54      /**
55       * Gets the characteristics of the filter.
56       * <p>
57       * Characteristics are defined as bits within the characteristics integer.
58       * </p>
59       *
60       * @return the characteristics for this bloom filter.
61       */
62      int characteristics();
63  
64      /**
65       * Clears the filter to by resetting it to its initial, unpopulated state.
66       */
67      void clear();
68  
69      /**
70       * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the
71       * bitMapExtractor.
72       *
73       * @param bitMapExtractor the {@code BitMapExtractor} to provide the bit maps.
74       * @return {@code true} if this filter is enabled for all bits specified by the bit maps
75       */
76      default boolean contains(final BitMapExtractor bitMapExtractor) {
77          return processBitMapPairs(bitMapExtractor, (x, y) -> (x & y) == y);
78      }
79  
80      /**
81       * Returns {@code true} if this filter contains the specified filter.
82       *
83       * <p>Specifically this
84       * returns {@code true} if this filter is enabled for all bits that are enabled in the
85       * {@code other} filter. Using the bit representations this is
86       * effectively {@code (this AND other) == other}.</p>
87       *
88       * @param other the other Bloom filter
89       * @return true if all enabled bits in the other filter are enabled in this filter.
90       */
91      default boolean contains(final BloomFilter<?> other) {
92          Objects.requireNonNull(other, "other");
93          return (characteristics() & SPARSE) != 0 ? contains((IndexExtractor) other) : contains((BitMapExtractor) other);
94      }
95  
96      /**
97       * Returns {@code true} if this filter contains the bits specified in the hasher.
98       *
99       * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
100      * identified by the {@code hasher}. Using the bit map representations this is
101      * effectively {@code (this AND hasher) == hasher}.</p>
102      *
103      * @param hasher the hasher to provide the indexes
104      * @return true if this filter is enabled for all bits specified by the hasher
105      */
106     default boolean contains(final Hasher hasher) {
107         Objects.requireNonNull(hasher, "Hasher");
108         final Shape shape = getShape();
109         return contains(hasher.indices(shape));
110     }
111 
112     /**
113      * Returns {@code true} if this filter contains the indices specified IndexExtractor.
114      *
115      * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
116      * identified by the {@code IndexExtractor}.</p>
117      *
118      * @param indexExtractor the IndexExtractor to provide the indexes
119      * @return {@code true} if this filter is enabled for all bits specified by the IndexExtractor
120      */
121     boolean contains(IndexExtractor indexExtractor);
122 
123     /**
124      * Creates a new instance of this {@link BloomFilter} with the same properties as the current one.
125      *
126      * @return a copy of this {@link BloomFilter}.
127      */
128     T copy();
129 
130     // update operations
131 
132     /**
133      * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
134      *
135      * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
136      * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
137      *
138      * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape.  If called on Bloom
139      * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
140      * may be thrown.</em></p>
141      *
142      * @param other The other Bloom filter
143      * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
144      * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
145      * @see #estimateN()
146      * @see Shape
147      */
148     default int estimateIntersection(final BloomFilter<?> other) {
149         Objects.requireNonNull(other, "other");
150         final double eThis = getShape().estimateN(cardinality());
151         final double eOther = getShape().estimateN(other.cardinality());
152         if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
153             // if both are infinite the union is infinite and we return Integer.MAX_VALUE
154             return Integer.MAX_VALUE;
155         }
156         long estimate;
157         // if one is infinite the intersection is the other.
158         if (Double.isInfinite(eThis)) {
159             estimate = Math.round(eOther);
160         } else if (Double.isInfinite(eOther)) {
161             estimate = Math.round(eThis);
162         } else {
163             final T union = this.copy();
164             union.merge(other);
165             final double eUnion = getShape().estimateN(union.cardinality());
166             if (Double.isInfinite(eUnion)) {
167                 throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
168             }
169             // maximum estimate value using integer values is: 46144189292 thus
170             // eThis + eOther cannot overflow the long value.
171             estimate = Math.round(eThis + eOther - eUnion);
172             estimate = estimate < 0 ? 0 : estimate;
173         }
174         return estimate > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) estimate;
175     }
176 
177     /**
178      * Estimates the number of items in the Bloom filter.
179      *
180      * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
181      * shape and cardinality of this filter.</p>
182      *
183      * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
184      * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
185      *
186      * <p><em>Note:</em></p>
187      * <ul>
188      * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
189      * <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
190      * </ul>
191      *
192      * @return an estimate of the number of items in the bloom filter.  Will return Integer.MAX_VALUE if the
193      * estimate is larger than Integer.MAX_VALUE.
194      * @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
195      * @see Shape#estimateN(int)
196      * @see Shape
197      */
198     default int estimateN() {
199         final double d = getShape().estimateN(cardinality());
200         if (Double.isInfinite(d)) {
201             return Integer.MAX_VALUE;
202         }
203         if (Double.isNaN(d)) {
204             throw new IllegalArgumentException("Cardinality too large: " + cardinality());
205         }
206         final long l = Math.round(d);
207         return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
208     }
209 
210     /**
211      * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
212      *
213      * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
214      * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
215      *
216      * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape.  If called on Bloom
217      * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
218      * may be thrown.</em></p>
219      *
220      * @param other The other Bloom filter
221      * @return an estimate of the number of items in the union.  Will return Integer.MAX_VALUE if the
222      * estimate is larger than Integer.MAX_VALUE.
223      * @see #estimateN()
224      * @see Shape
225      */
226     default int estimateUnion(final BloomFilter<?> other) {
227         Objects.requireNonNull(other, "other");
228         final T copy = this.copy();
229         copy.merge(other);
230         return copy.estimateN();
231     }
232 
233     /**
234      * Gets the shape that was used when the filter was built.
235      * @return The shape the filter was built with.
236      */
237     Shape getShape();
238 
239     // Counting Operations
240 
241     /**
242      * Determines if all the bits are off. This is equivalent to
243      * {@code cardinality() == 0}.
244      *
245      * <p>
246      * <em>Note: This method is optimized for non-sparse filters.</em> Implementers
247      * are encouraged to implement faster checks if possible.
248      * </p>
249      *
250      * @return {@code true} if no bits are enabled, {@code false} otherwise.
251      */
252     default boolean isEmpty() {
253         return processBitMaps(y -> y == 0);
254     }
255 
256     /**
257      * Determines if the bloom filter is "full".
258      *
259      * <p>Full is defined as having no unset bits.</p>
260      *
261      * @return {@code true} if the filter is full, {@code false} otherwise.
262      */
263     default boolean isFull() {
264         return cardinality() == getShape().getNumberOfBits();
265     }
266 
267     /**
268      * Merges the specified hasher into this Bloom filter. Specifically all
269      * bit indexes that are identified by the {@code bitMapExtractor} will be enabled in this filter.
270      *
271      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
272      * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes
273      * enabled in the {@code bitMapExtractor}.</em>  This state may occur in complex Bloom filter implementations like
274      * counting Bloom filters.</p>
275      *
276      * @param bitMapExtractor The BitMapExtractor to merge.
277      * @return true if the merge was successful
278      * @throws IllegalArgumentException if bitMapExtractor sends illegal value.
279      */
280     boolean merge(BitMapExtractor bitMapExtractor);
281 
282     /**
283      * Merges the specified Bloom filter into this Bloom filter.
284      *
285      * <p>Specifically all
286      * bit indexes that are identified by the {@code other} will be enabled in this filter.</p>
287      *
288      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
289      * enabled. A {@code false} result indicates that this filter may or may not contain
290      * the {@code other} Bloom filter.</em>  This state may occur in complex Bloom filter implementations like
291      * counting Bloom filters.</p>
292      *
293      * @param other The bloom filter to merge into this one.
294      * @return true if the merge was successful
295      */
296     default boolean merge(final BloomFilter<?> other) {
297         return (characteristics() & SPARSE) != 0 ? merge((IndexExtractor) other) : merge((BitMapExtractor) other);
298     }
299 
300     /**
301      * Merges the specified hasher into this Bloom filter. Specifically all
302      * bit indexes that are identified by the {@code hasher} will be enabled in this filter.
303      *
304      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
305      * enabled. A {@code false} result indicates that this filter may or may not contain
306      * the {@code hasher} values.</em>  This state may occur in complex Bloom filter implementations like
307      * counting Bloom filters.</p>
308      *
309      * @param hasher The hasher to merge.
310      * @return true if the merge was successful
311      * @throws IllegalArgumentException if hasher produces an illegal value.
312      */
313     default boolean merge(final Hasher hasher) {
314         Objects.requireNonNull(hasher, "hasher");
315         return merge(hasher.indices(getShape()));
316     }
317 
318     /**
319      * Merges the specified IndexExtractor into this Bloom filter. Specifically all
320      * bit indexes that are identified by the {@code indexExtractor} will be enabled in this filter.
321      *
322      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
323      * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of
324      * the {@code indexExtractor}.</em>  This state may occur in complex Bloom filter implementations like
325      * counting Bloom filters.</p>
326      *
327      * @param indexExtractor The IndexExtractor to merge.
328      * @return true if the merge was successful
329      * @throws IllegalArgumentException if indexExtractor sends illegal value.
330      */
331     boolean merge(IndexExtractor indexExtractor);
332 
333     /**
334      * Most Bloom filters create unique IndexExtractors.
335      */
336     @Override
337     default IndexExtractor uniqueIndices() {
338         return this;
339     }
340 }