Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.collections4.bloomfilter;
018
019import java.util.Objects;
020
021/**
022 * The interface that describes a Bloom filter.
023 * <p>
024 * <em>See implementation notes for BitMapExtractor and IndexExtractor.</em>
025 * </p>
026 * @see BitMapExtractor
027 * @see IndexExtractor
028 * @since 4.5
029 */
030public interface BloomFilter extends IndexExtractor, BitMapExtractor {
031
032    /**
033     * The sparse characteristic used to determine the best method for matching.
034     * <p>For `sparse` implementations
035     * the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations
036     * the {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier
037     * for the implementation to produce indexes of bit map blocks.</p>
038     */
039    int SPARSE = 0x1;
040
041    /**
042     * Gets the cardinality (number of enabled bits) of this Bloom filter.
043     *
044     * <p>This is also known as the Hamming value or Hamming number.</p>
045     *
046     * @return the cardinality of this filter
047     */
048    int cardinality();
049
050    // Query Operations
051
052    /**
053     * Returns the characteristics of the filter.
054     * <p>
055     * Characteristics are defined as bits within the characteristics integer.
056     * @return the characteristics for this bloom filter.
057     */
058    int characteristics();
059
060    /**
061     * Resets the filter to its initial, unpopulated state.
062     */
063    void clear();
064
065    /**
066     * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the
067     * bitMapExtractor.
068     *
069     * @param bitMapExtractor the {@code BitMapExtractor} to provide the bit maps.
070     * @return {@code true} if this filter is enabled for all bits specified by the bit maps
071     */
072    default boolean contains(final BitMapExtractor bitMapExtractor) {
073        return processBitMapPairs(bitMapExtractor, (x, y) -> (x & y) == y);
074    }
075
076    /**
077     * Returns {@code true} if this filter contains the specified filter.
078     *
079     * <p>Specifically this
080     * returns {@code true} if this filter is enabled for all bits that are enabled in the
081     * {@code other} filter. Using the bit representations this is
082     * effectively {@code (this AND other) == other}.</p>
083     *
084     * @param other the other Bloom filter
085     * @return true if all enabled bits in the other filter are enabled in this filter.
086     */
087    default boolean contains(final BloomFilter other) {
088        Objects.requireNonNull(other, "other");
089        return (characteristics() & SPARSE) != 0 ? contains((IndexExtractor) other) : contains((BitMapExtractor) other);
090    }
091
092    /**
093     * Returns {@code true} if this filter contains the bits specified in the hasher.
094     *
095     * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
096     * identified by the {@code hasher}. Using the bit map representations this is
097     * effectively {@code (this AND hasher) == hasher}.</p>
098     *
099     * @param hasher the hasher to provide the indexes
100     * @return true if this filter is enabled for all bits specified by the hasher
101     */
102    default boolean contains(final Hasher hasher) {
103        Objects.requireNonNull(hasher, "Hasher");
104        final Shape shape = getShape();
105        return contains(hasher.indices(shape));
106    }
107
108    /**
109     * Returns {@code true} if this filter contains the indices specified IndexExtractor.
110     *
111     * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
112     * identified by the {@code IndexExtractor}.</p>
113     *
114     * @param indexExtractor the IndexExtractor to provide the indexes
115     * @return {@code true} if this filter is enabled for all bits specified by the IndexExtractor
116     */
117    boolean contains(IndexExtractor indexExtractor);
118
119    /**
120     * Creates a new instance of the BloomFilter with the same properties as the current one.
121     *
122     * @param <T> Type of BloomFilter.
123     * @return a copy of this BloomFilter
124     */
125    <T extends BloomFilter> T copy();
126
127    // update operations
128
129    /**
130     * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
131     *
132     * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
133     * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
134     *
135     * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape.  If called on Bloom
136     * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
137     * may be thrown.</em></p>
138     *
139     * @param other The other Bloom filter
140     * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
141     * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
142     * @see #estimateN()
143     * @see Shape
144     */
145    default int estimateIntersection(final BloomFilter other) {
146        Objects.requireNonNull(other, "other");
147        final double eThis = getShape().estimateN(cardinality());
148        final double eOther = getShape().estimateN(other.cardinality());
149        if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
150            // if both are infinite the union is infinite and we return Integer.MAX_VALUE
151            return Integer.MAX_VALUE;
152        }
153        long estimate;
154        // if one is infinite the intersection is the other.
155        if (Double.isInfinite(eThis)) {
156            estimate = Math.round(eOther);
157        } else if (Double.isInfinite(eOther)) {
158            estimate = Math.round(eThis);
159        } else {
160            final BloomFilter union = this.copy();
161            union.merge(other);
162            final double eUnion = getShape().estimateN(union.cardinality());
163            if (Double.isInfinite(eUnion)) {
164                throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
165            }
166            // maximum estimate value using integer values is: 46144189292 thus
167            // eThis + eOther can not overflow the long value.
168            estimate = Math.round(eThis + eOther - eUnion);
169            estimate = estimate < 0 ? 0 : estimate;
170        }
171        return estimate > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) estimate;
172    }
173
174    /**
175     * Estimates the number of items in the Bloom filter.
176     *
177     * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
178     * shape and cardinality of this filter.</p>
179     *
180     * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
181     * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
182     *
183     * <p><em>Note:</em></p>
184     * <ul>
185     * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
186     * <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
187     * </ul>
188     *
189     * @return an estimate of the number of items in the bloom filter.  Will return Integer.MAX_VALUE if the
190     * estimate is larger than Integer.MAX_VALUE.
191     * @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
192     * @see Shape#estimateN(int)
193     * @see Shape
194     */
195    default int estimateN() {
196        final double d = getShape().estimateN(cardinality());
197        if (Double.isInfinite(d)) {
198            return Integer.MAX_VALUE;
199        }
200        if (Double.isNaN(d)) {
201            throw new IllegalArgumentException("Cardinality too large: " + cardinality());
202        }
203        final long l = Math.round(d);
204        return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
205    }
206
207    /**
208     * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
209     *
210     * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
211     * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
212     *
213     * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape.  If called on Bloom
214     * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
215     * may be thrown.</em></p>
216     *
217     * @param other The other Bloom filter
218     * @return an estimate of the number of items in the union.  Will return Integer.MAX_VALUE if the
219     * estimate is larger than Integer.MAX_VALUE.
220     * @see #estimateN()
221     * @see Shape
222     */
223    default int estimateUnion(final BloomFilter other) {
224        Objects.requireNonNull(other, "other");
225        final BloomFilter cpy = this.copy();
226        cpy.merge(other);
227        return cpy.estimateN();
228    }
229
230    /**
231     * Gets the shape that was used when the filter was built.
232     * @return The shape the filter was built with.
233     */
234    Shape getShape();
235
236    // Counting Operations
237
238    /**
239     * Determines if all the bits are off. This is equivalent to
240     * {@code cardinality() == 0}.
241     *
242     * <p>
243     * <em>Note: This method is optimised for non-sparse filters.</em> Implementers
244     * are encouraged to implement faster checks if possible.
245     * </p>
246     *
247     * @return {@code true} if no bits are enabled, {@code false} otherwise.
248     */
249    default boolean isEmpty() {
250        return processBitMaps(y -> y == 0);
251    }
252
253    /**
254     * Determines if the bloom filter is "full".
255     *
256     * <p>Full is defined as having no unset bits.</p>
257     *
258     * @return {@code true} if the filter is full, {@code false} otherwise.
259     */
260    default boolean isFull() {
261        return cardinality() == getShape().getNumberOfBits();
262    }
263
264    /**
265     * Merges the specified hasher into this Bloom filter. Specifically all
266     * bit indexes that are identified by the {@code bitMapExtractor} will be enabled in this filter.
267     *
268     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
269     * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes
270     * enabled in the {@code bitMapExtractor}.</em>  This state may occur in complex Bloom filter implementations like
271     * counting Bloom filters.</p>
272     *
273     * @param bitMapExtractor The BitMapExtractor to merge.
274     * @return true if the merge was successful
275     * @throws IllegalArgumentException if bitMapExtractor sends illegal value.
276     */
277    boolean merge(BitMapExtractor bitMapExtractor);
278
279    /**
280     * Merges the specified Bloom filter into this Bloom filter.
281     *
282     * <p>Specifically all
283     * bit indexes that are identified by the {@code other} will be enabled in this filter.</p>
284     *
285     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
286     * enabled. A {@code false} result indicates that this filter may or may not contain
287     * the {@code other} Bloom filter.</em>  This state may occur in complex Bloom filter implementations like
288     * counting Bloom filters.</p>
289     *
290     * @param other The bloom filter to merge into this one.
291     * @return true if the merge was successful
292     */
293    default boolean merge(final BloomFilter other) {
294        return (characteristics() & SPARSE) != 0 ? merge((IndexExtractor) other) : merge((BitMapExtractor) other);
295    }
296
297    /**
298     * Merges the specified hasher into this Bloom filter. Specifically all
299     * bit indexes that are identified by the {@code hasher} will be enabled in this filter.
300     *
301     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
302     * enabled. A {@code false} result indicates that this filter may or may not contain
303     * the {@code hasher} values.</em>  This state may occur in complex Bloom filter implementations like
304     * counting Bloom filters.</p>
305     *
306     * @param hasher The hasher to merge.
307     * @return true if the merge was successful
308     * @throws IllegalArgumentException if hasher produces an illegal value.
309     */
310    default boolean merge(final Hasher hasher) {
311        Objects.requireNonNull(hasher, "hasher");
312        return merge(hasher.indices(getShape()));
313    }
314
315    /**
316     * Merges the specified IndexExtractor into this Bloom filter. Specifically all
317     * bit indexes that are identified by the {@code indexExtractor} will be enabled in this filter.
318     *
319     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
320     * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of
321     * the {@code indexExtractor}.</em>  This state may occur in complex Bloom filter implementations like
322     * counting Bloom filters.</p>
323     *
324     * @param indexExtractor The IndexExtractor to merge.
325     * @return true if the merge was successful
326     * @throws IllegalArgumentException if indexExtractor sends illegal value.
327     */
328    boolean merge(IndexExtractor indexExtractor);
329
330    /**
331     * Most Bloom filters create unique IndexExtractors.
332     */
333    @Override
334    default IndexExtractor uniqueIndices() {
335        return this;
336    }
337}