View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Objects;
20  
21  /**
22   * The interface that describes a Bloom filter.
23   * <p>
24   * <em>See implementation notes for BitMapProducer and IndexProducer.</em>
25   * </p>
26   * @see BitMapProducer
27   * @see IndexProducer
28   * @since 4.5
29   */
30  public interface BloomFilter extends IndexProducer, BitMapProducer {
31  
32      /**
33       * The sparse characteristic used to determine the best method for matching.
34       * <p>For `sparse` implementations
35       * the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations
36       * the {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier
37       * for the implementation to produce indexes of bit map blocks.</p>
38       */
39      int SPARSE = 0x1;
40  
41      /**
42       * Gets the cardinality (number of enabled bits) of this Bloom filter.
43       *
44       * <p>This is also known as the Hamming value or Hamming number.</p>
45       *
46       * @return the cardinality of this filter
47       */
48      int cardinality();
49  
50      // Query Operations
51  
52      /**
53       * Returns the characteristics of the filter.
54       * <p>
55       * Characteristics are defined as bits within the characteristics integer.
56       * @return the characteristics for this bloom filter.
57       */
58      int characteristics();
59  
60      /**
61       * Resets the filter to its initial, unpopulated state.
62       */
63      void clear();
64  
65      /**
66       * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the
67       * bitMapProducer.
68       *
69       * @param bitMapProducer the {@code BitMapProducer} to provide the bit maps.
70       * @return {@code true} if this filter is enabled for all bits specified by the bit maps
71       */
72      default boolean contains(final BitMapProducer bitMapProducer) {
73          return forEachBitMapPair(bitMapProducer, (x, y) -> (x & y) == y);
74      }
75  
76      /**
77       * Returns {@code true} if this filter contains the specified filter.
78       *
79       * <p>Specifically this
80       * returns {@code true} if this filter is enabled for all bits that are enabled in the
81       * {@code other} filter. Using the bit representations this is
82       * effectively {@code (this AND other) == other}.</p>
83       *
84       * @param other the other Bloom filter
85       * @return true if all enabled bits in the other filter are enabled in this filter.
86       */
87      default boolean contains(final BloomFilter other) {
88          Objects.requireNonNull(other, "other");
89          return (characteristics() & SPARSE) != 0 ? contains((IndexProducer) other) : contains((BitMapProducer) other);
90      }
91  
92      /**
93       * Returns {@code true} if this filter contains the bits specified in the hasher.
94       *
95       * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
96       * identified by the {@code hasher}. Using the bit map representations this is
97       * effectively {@code (this AND hasher) == hasher}.</p>
98       *
99       * @param hasher the hasher to provide the indexes
100      * @return true if this filter is enabled for all bits specified by the hasher
101      */
102     default boolean contains(final Hasher hasher) {
103         Objects.requireNonNull(hasher, "Hasher");
104         final Shape shape = getShape();
105         return contains(hasher.indices(shape));
106     }
107 
108     /**
109      * Returns {@code true} if this filter contains the indices specified IndexProducer.
110      *
111      * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
112      * identified by the {@code IndexProducer}.</p>
113      *
114      * @param indexProducer the IndexProducer to provide the indexes
115      * @return {@code true} if this filter is enabled for all bits specified by the IndexProducer
116      */
117     boolean contains(IndexProducer indexProducer);
118 
119     /**
120      * Creates a new instance of the BloomFilter with the same properties as the current one.
121      * @return a copy of this BloomFilter
122      */
123     BloomFilter copy();
124 
125     // update operations
126 
127     /**
128      * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
129      *
130      * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
131      * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
132      *
133      * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape.  If called on Bloom
134      * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
135      * may be thrown.</em></p>
136      *
137      * @param other The other Bloom filter
138      * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
139      * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
140      * @see #estimateN()
141      * @see Shape
142      */
143     default int estimateIntersection(final BloomFilter other) {
144         Objects.requireNonNull(other, "other");
145         final double eThis = getShape().estimateN(cardinality());
146         final double eOther = getShape().estimateN(other.cardinality());
147         if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
148             // if both are infinite the union is infinite and we return Integer.MAX_VALUE
149             return Integer.MAX_VALUE;
150         }
151         long estimate;
152         // if one is infinite the intersection is the other.
153         if (Double.isInfinite(eThis)) {
154             estimate = Math.round(eOther);
155         } else if (Double.isInfinite(eOther)) {
156             estimate = Math.round(eThis);
157         } else {
158             final BloomFilter union = this.copy();
159             union.merge(other);
160             final double eUnion = getShape().estimateN(union.cardinality());
161             if (Double.isInfinite(eUnion)) {
162                 throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
163             }
164             // maximum estimate value using integer values is: 46144189292 thus
165             // eThis + eOther can not overflow the long value.
166             estimate = Math.round(eThis + eOther - eUnion);
167             estimate = estimate < 0 ? 0 : estimate;
168         }
169         return estimate>Integer.MAX_VALUE?Integer.MAX_VALUE:(int) estimate;
170     }
171 
172     /**
173      * Estimates the number of items in the Bloom filter.
174      *
175      * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
176      * shape and cardinality of this filter.</p>
177      *
178      * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
179      * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
180      *
181      * <p><em>Note:</em></p>
182      * <ul>
183      * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
184      * <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
185      * </ul>
186      *
187      * @return an estimate of the number of items in the bloom filter.  Will return Integer.MAX_VALUE if the
188      * estimate is larger than Integer.MAX_VALUE.
189      * @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
190      * @see Shape#estimateN(int)
191      * @see Shape
192      */
193     default int estimateN() {
194         final double d = getShape().estimateN(cardinality());
195         if (Double.isInfinite(d)) {
196             return Integer.MAX_VALUE;
197         }
198         if (Double.isNaN(d)) {
199             throw new IllegalArgumentException("Cardinality too large: " + cardinality());
200         }
201         final long l = Math.round(d);
202         return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
203     }
204 
205     /**
206      * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
207      *
208      * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
209      * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
210      *
211      * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape.  If called on Bloom
212      * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
213      * may be thrown.</em></p>
214      *
215      * @param other The other Bloom filter
216      * @return an estimate of the number of items in the union.  Will return Integer.MAX_VALUE if the
217      * estimate is larger than Integer.MAX_VALUE.
218      * @see #estimateN()
219      * @see Shape
220      */
221     default int estimateUnion(final BloomFilter other) {
222         Objects.requireNonNull(other, "other");
223         final BloomFilter cpy = this.copy();
224         cpy.merge(other);
225         return cpy.estimateN();
226     }
227 
228     /**
229      * Gets the shape that was used when the filter was built.
230      * @return The shape the filter was built with.
231      */
232     Shape getShape();
233 
234     // Counting Operations
235 
236     /**
237      * Determines if all the bits are off. This is equivalent to
238      * {@code cardinality() == 0}.
239      *
240      * <p>
241      * <em>Note: This method is optimised for non-sparse filters.</em> Implementers
242      * are encouraged to implement faster checks if possible.
243      * </p>
244      *
245      * @return {@code true} if no bits are enabled, {@code false} otherwise.
246      */
247     default boolean isEmpty() {
248         return forEachBitMap(y -> y == 0);
249     }
250 
251     /**
252      * Determines if the bloom filter is "full".
253      *
254      * <p>Full is defined as having no unset bits.</p>
255      *
256      * @return {@code true} if the filter is full, {@code false} otherwise.
257      */
258     default boolean isFull() {
259         return cardinality() == getShape().getNumberOfBits();
260     }
261 
262     /**
263      * Merges the specified hasher into this Bloom filter. Specifically all
264      * bit indexes that are identified by the {@code producer} will be enabled in this filter.
265      *
266      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
267      * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes
268      * enabled in the {@code producer}.</em>  This state may occur in complex Bloom filter implementations like
269      * counting Bloom filters.</p>
270      *
271      * @param bitMapProducer The producer to merge.
272      * @return true if the merge was successful
273      * @throws IllegalArgumentException if producer sends illegal value.
274      */
275     boolean merge(BitMapProducer bitMapProducer);
276 
277     /**
278      * Merges the specified Bloom filter into this Bloom filter.
279      *
280      * <p>Specifically all
281      * bit indexes that are identified by the {@code other} will be enabled in this filter.</p>
282      *
283      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
284      * enabled. A {@code false} result indicates that this filter may or may not contain
285      * the {@code other} Bloom filter.</em>  This state may occur in complex Bloom filter implementations like
286      * counting Bloom filters.</p>
287      *
288      * @param other The bloom filter to merge into this one.
289      * @return true if the merge was successful
290      */
291     default boolean merge(final BloomFilter other) {
292         return (characteristics() & SPARSE) != 0 ? merge((IndexProducer) other) : merge((BitMapProducer) other);
293     }
294 
295     /**
296      * Merges the specified hasher into this Bloom filter. Specifically all
297      * bit indexes that are identified by the {@code hasher} will be enabled in this filter.
298      *
299      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
300      * enabled. A {@code false} result indicates that this filter may or may not contain
301      * the {@code hasher} values.</em>  This state may occur in complex Bloom filter implementations like
302      * counting Bloom filters.</p>
303      *
304      * @param hasher The hasher to merge.
305      * @return true if the merge was successful
306      * @throws IllegalArgumentException if hasher produces an illegal value.
307      */
308     default boolean merge(final Hasher hasher) {
309         Objects.requireNonNull(hasher, "hasher");
310         return merge(hasher.indices(getShape()));
311     }
312 
313     /**
314      * Merges the specified IndexProducer into this Bloom filter. Specifically all
315      * bit indexes that are identified by the {@code producer} will be enabled in this filter.
316      *
317      * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
318      * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of
319      * the {@code producer}.</em>  This state may occur in complex Bloom filter implementations like
320      * counting Bloom filters.</p>
321      *
322      * @param indexProducer The IndexProducer to merge.
323      * @return true if the merge was successful
324      * @throws IllegalArgumentException if producer sends illegal value.
325      */
326     boolean merge(IndexProducer indexProducer);
327 
328     /**
329      * Most Bloom filters create unique IndexProducers.
330      */
331     @Override
332     default IndexProducer uniqueIndices() {
333         return this;
334     }
335 }