1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.collections4.bloomfilter;
18
19 import java.util.Objects;
20
21 /**
22 * The interface that describes a Bloom filter.
23 * <p>
24 * <em>See implementation notes for {@link BitMapExtractor} and {@link IndexExtractor}.</em>
25 * </p>
26 *
27 * @param <T> The BloomFilter type.
28 * @see BitMapExtractor
29 * @see IndexExtractor
30 * @since 4.5.0-M1
31 */
32 public interface BloomFilter<T extends BloomFilter<T>> extends IndexExtractor, BitMapExtractor {
33
34 /**
35 * The sparse characteristic used to determine the best method for matching: {@value}.
36 * <p>
37 * For `sparse` implementations the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations the
38 * {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier.
39 * </p>
40 */
41 int SPARSE = 0x1;
42
43 /**
44 * Gets the cardinality (number of enabled bits) of this Bloom filter.
45 *
46 * <p>This is also known as the Hamming value or Hamming number.</p>
47 *
48 * @return the cardinality of this filter
49 */
50 int cardinality();
51
52 // Query Operations
53
54 /**
55 * Gets the characteristics of the filter.
56 * <p>
57 * Characteristics are defined as bits within the characteristics integer.
58 * </p>
59 *
60 * @return the characteristics for this bloom filter.
61 */
62 int characteristics();
63
64 /**
65 * Clears the filter to by resetting it to its initial, unpopulated state.
66 */
67 void clear();
68
69 /**
70 * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the
71 * bitMapExtractor.
72 *
73 * @param bitMapExtractor the {@code BitMapExtractor} to provide the bit maps.
74 * @return {@code true} if this filter is enabled for all bits specified by the bit maps
75 */
76 default boolean contains(final BitMapExtractor bitMapExtractor) {
77 return processBitMapPairs(bitMapExtractor, (x, y) -> (x & y) == y);
78 }
79
80 /**
81 * Returns {@code true} if this filter contains the specified filter.
82 *
83 * <p>Specifically this
84 * returns {@code true} if this filter is enabled for all bits that are enabled in the
85 * {@code other} filter. Using the bit representations this is
86 * effectively {@code (this AND other) == other}.</p>
87 *
88 * @param other the other Bloom filter
89 * @return true if all enabled bits in the other filter are enabled in this filter.
90 */
91 default boolean contains(final BloomFilter<?> other) {
92 Objects.requireNonNull(other, "other");
93 return (characteristics() & SPARSE) != 0 ? contains((IndexExtractor) other) : contains((BitMapExtractor) other);
94 }
95
96 /**
97 * Returns {@code true} if this filter contains the bits specified in the hasher.
98 *
99 * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
100 * identified by the {@code hasher}. Using the bit map representations this is
101 * effectively {@code (this AND hasher) == hasher}.</p>
102 *
103 * @param hasher the hasher to provide the indexes
104 * @return true if this filter is enabled for all bits specified by the hasher
105 */
106 default boolean contains(final Hasher hasher) {
107 Objects.requireNonNull(hasher, "Hasher");
108 final Shape shape = getShape();
109 return contains(hasher.indices(shape));
110 }
111
112 /**
113 * Returns {@code true} if this filter contains the indices specified IndexExtractor.
114 *
115 * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
116 * identified by the {@code IndexExtractor}.</p>
117 *
118 * @param indexExtractor the IndexExtractor to provide the indexes
119 * @return {@code true} if this filter is enabled for all bits specified by the IndexExtractor
120 */
121 boolean contains(IndexExtractor indexExtractor);
122
123 /**
124 * Creates a new instance of this {@link BloomFilter} with the same properties as the current one.
125 *
126 * @return a copy of this {@link BloomFilter}.
127 */
128 T copy();
129
130 // update operations
131
132 /**
133 * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
134 *
135 * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
136 * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
137 *
138 * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape. If called on Bloom
139 * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
140 * may be thrown.</em></p>
141 *
142 * @param other The other Bloom filter
143 * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
144 * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
145 * @see #estimateN()
146 * @see Shape
147 */
148 default int estimateIntersection(final BloomFilter<?> other) {
149 Objects.requireNonNull(other, "other");
150 final double eThis = getShape().estimateN(cardinality());
151 final double eOther = getShape().estimateN(other.cardinality());
152 if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
153 // if both are infinite the union is infinite and we return Integer.MAX_VALUE
154 return Integer.MAX_VALUE;
155 }
156 long estimate;
157 // if one is infinite the intersection is the other.
158 if (Double.isInfinite(eThis)) {
159 estimate = Math.round(eOther);
160 } else if (Double.isInfinite(eOther)) {
161 estimate = Math.round(eThis);
162 } else {
163 final T union = this.copy();
164 union.merge(other);
165 final double eUnion = getShape().estimateN(union.cardinality());
166 if (Double.isInfinite(eUnion)) {
167 throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
168 }
169 // maximum estimate value using integer values is: 46144189292 thus
170 // eThis + eOther cannot overflow the long value.
171 estimate = Math.round(eThis + eOther - eUnion);
172 estimate = estimate < 0 ? 0 : estimate;
173 }
174 return estimate > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) estimate;
175 }
176
177 /**
178 * Estimates the number of items in the Bloom filter.
179 *
180 * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
181 * shape and cardinality of this filter.</p>
182 *
183 * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
184 * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
185 *
186 * <p><em>Note:</em></p>
187 * <ul>
188 * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
189 * <li>if cardinality > numberOfBits, then an IllegalArgumentException is thrown.</li>
190 * </ul>
191 *
192 * @return an estimate of the number of items in the bloom filter. Will return Integer.MAX_VALUE if the
193 * estimate is larger than Integer.MAX_VALUE.
194 * @throws IllegalArgumentException if the cardinality is > numberOfBits as defined in Shape.
195 * @see Shape#estimateN(int)
196 * @see Shape
197 */
198 default int estimateN() {
199 final double d = getShape().estimateN(cardinality());
200 if (Double.isInfinite(d)) {
201 return Integer.MAX_VALUE;
202 }
203 if (Double.isNaN(d)) {
204 throw new IllegalArgumentException("Cardinality too large: " + cardinality());
205 }
206 final long l = Math.round(d);
207 return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
208 }
209
210 /**
211 * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
212 *
213 * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
214 * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
215 *
216 * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape. If called on Bloom
217 * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
218 * may be thrown.</em></p>
219 *
220 * @param other The other Bloom filter
221 * @return an estimate of the number of items in the union. Will return Integer.MAX_VALUE if the
222 * estimate is larger than Integer.MAX_VALUE.
223 * @see #estimateN()
224 * @see Shape
225 */
226 default int estimateUnion(final BloomFilter<?> other) {
227 Objects.requireNonNull(other, "other");
228 final T copy = this.copy();
229 copy.merge(other);
230 return copy.estimateN();
231 }
232
233 /**
234 * Gets the shape that was used when the filter was built.
235 * @return The shape the filter was built with.
236 */
237 Shape getShape();
238
239 // Counting Operations
240
241 /**
242 * Determines if all the bits are off. This is equivalent to
243 * {@code cardinality() == 0}.
244 *
245 * <p>
246 * <em>Note: This method is optimized for non-sparse filters.</em> Implementers
247 * are encouraged to implement faster checks if possible.
248 * </p>
249 *
250 * @return {@code true} if no bits are enabled, {@code false} otherwise.
251 */
252 default boolean isEmpty() {
253 return processBitMaps(y -> y == 0);
254 }
255
256 /**
257 * Determines if the bloom filter is "full".
258 *
259 * <p>Full is defined as having no unset bits.</p>
260 *
261 * @return {@code true} if the filter is full, {@code false} otherwise.
262 */
263 default boolean isFull() {
264 return cardinality() == getShape().getNumberOfBits();
265 }
266
267 /**
268 * Merges the specified hasher into this Bloom filter. Specifically all
269 * bit indexes that are identified by the {@code bitMapExtractor} will be enabled in this filter.
270 *
271 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
272 * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes
273 * enabled in the {@code bitMapExtractor}.</em> This state may occur in complex Bloom filter implementations like
274 * counting Bloom filters.</p>
275 *
276 * @param bitMapExtractor The BitMapExtractor to merge.
277 * @return true if the merge was successful
278 * @throws IllegalArgumentException if bitMapExtractor sends illegal value.
279 */
280 boolean merge(BitMapExtractor bitMapExtractor);
281
282 /**
283 * Merges the specified Bloom filter into this Bloom filter.
284 *
285 * <p>Specifically all
286 * bit indexes that are identified by the {@code other} will be enabled in this filter.</p>
287 *
288 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
289 * enabled. A {@code false} result indicates that this filter may or may not contain
290 * the {@code other} Bloom filter.</em> This state may occur in complex Bloom filter implementations like
291 * counting Bloom filters.</p>
292 *
293 * @param other The bloom filter to merge into this one.
294 * @return true if the merge was successful
295 */
296 default boolean merge(final BloomFilter<?> other) {
297 return (characteristics() & SPARSE) != 0 ? merge((IndexExtractor) other) : merge((BitMapExtractor) other);
298 }
299
300 /**
301 * Merges the specified hasher into this Bloom filter. Specifically all
302 * bit indexes that are identified by the {@code hasher} will be enabled in this filter.
303 *
304 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
305 * enabled. A {@code false} result indicates that this filter may or may not contain
306 * the {@code hasher} values.</em> This state may occur in complex Bloom filter implementations like
307 * counting Bloom filters.</p>
308 *
309 * @param hasher The hasher to merge.
310 * @return true if the merge was successful
311 * @throws IllegalArgumentException if hasher produces an illegal value.
312 */
313 default boolean merge(final Hasher hasher) {
314 Objects.requireNonNull(hasher, "hasher");
315 return merge(hasher.indices(getShape()));
316 }
317
318 /**
319 * Merges the specified IndexExtractor into this Bloom filter. Specifically all
320 * bit indexes that are identified by the {@code indexExtractor} will be enabled in this filter.
321 *
322 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
323 * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of
324 * the {@code indexExtractor}.</em> This state may occur in complex Bloom filter implementations like
325 * counting Bloom filters.</p>
326 *
327 * @param indexExtractor The IndexExtractor to merge.
328 * @return true if the merge was successful
329 * @throws IllegalArgumentException if indexExtractor sends illegal value.
330 */
331 boolean merge(IndexExtractor indexExtractor);
332
333 /**
334 * Most Bloom filters create unique IndexExtractors.
335 */
336 @Override
337 default IndexExtractor uniqueIndices() {
338 return this;
339 }
340 }