001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.collections4.bloomfilter;
018
019import java.util.Objects;
020
021/**
022 * The interface that describes a Bloom filter.
023 * <p>
024 * <em>See implementation notes for BitMapProducer and IndexProducer.</em>
025 * </p>
026 * @see BitMapProducer
027 * @see IndexProducer
028 * @since 4.5
029 */
030public interface BloomFilter extends IndexProducer, BitMapProducer {
031
032    /**
033     * The sparse characteristic used to determine the best method for matching.
034     * <p>For `sparse` implementations
035     * the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations
036     * the {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier
037     * for the implementation to produce indexes of bit map blocks.</p>
038     */
039    int SPARSE = 0x1;
040
041    /**
042     * Gets the cardinality (number of enabled bits) of this Bloom filter.
043     *
044     * <p>This is also known as the Hamming value or Hamming number.</p>
045     *
046     * @return the cardinality of this filter
047     */
048    int cardinality();
049
050    // Query Operations
051
052    /**
053     * Returns the characteristics of the filter.
054     * <p>
055     * Characteristics are defined as bits within the characteristics integer.
056     * @return the characteristics for this bloom filter.
057     */
058    int characteristics();
059
060    /**
061     * Resets the filter to its initial, unpopulated state.
062     */
063    void clear();
064
065    /**
066     * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the
067     * bitMapProducer.
068     *
069     * @param bitMapProducer the {@code BitMapProducer} to provide the bit maps.
070     * @return {@code true} if this filter is enabled for all bits specified by the bit maps
071     */
072    default boolean contains(final BitMapProducer bitMapProducer) {
073        return forEachBitMapPair(bitMapProducer, (x, y) -> (x & y) == y);
074    }
075
076    /**
077     * Returns {@code true} if this filter contains the specified filter.
078     *
079     * <p>Specifically this
080     * returns {@code true} if this filter is enabled for all bits that are enabled in the
081     * {@code other} filter. Using the bit representations this is
082     * effectively {@code (this AND other) == other}.</p>
083     *
084     * @param other the other Bloom filter
085     * @return true if all enabled bits in the other filter are enabled in this filter.
086     */
087    default boolean contains(final BloomFilter other) {
088        Objects.requireNonNull(other, "other");
089        return (characteristics() & SPARSE) != 0 ? contains((IndexProducer) other) : contains((BitMapProducer) other);
090    }
091
092    /**
093     * Returns {@code true} if this filter contains the bits specified in the hasher.
094     *
095     * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
096     * identified by the {@code hasher}. Using the bit map representations this is
097     * effectively {@code (this AND hasher) == hasher}.</p>
098     *
099     * @param hasher the hasher to provide the indexes
100     * @return true if this filter is enabled for all bits specified by the hasher
101     */
102    default boolean contains(final Hasher hasher) {
103        Objects.requireNonNull(hasher, "Hasher");
104        final Shape shape = getShape();
105        return contains(hasher.indices(shape));
106    }
107
108    /**
109     * Returns {@code true} if this filter contains the indices specified IndexProducer.
110     *
111     * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes
112     * identified by the {@code IndexProducer}.</p>
113     *
114     * @param indexProducer the IndexProducer to provide the indexes
115     * @return {@code true} if this filter is enabled for all bits specified by the IndexProducer
116     */
117    boolean contains(IndexProducer indexProducer);
118
119    /**
120     * Creates a new instance of the BloomFilter with the same properties as the current one.
121     * @return a copy of this BloomFilter
122     */
123    BloomFilter copy();
124
125    // update operations
126
127    /**
128     * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
129     *
130     * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
131     * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
132     *
133     * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape.  If called on Bloom
134     * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
135     * may be thrown.</em></p>
136     *
137     * @param other The other Bloom filter
138     * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
139     * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
140     * @see #estimateN()
141     * @see Shape
142     */
143    default int estimateIntersection(final BloomFilter other) {
144        Objects.requireNonNull(other, "other");
145        final double eThis = getShape().estimateN(cardinality());
146        final double eOther = getShape().estimateN(other.cardinality());
147        if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
148            // if both are infinite the union is infinite and we return Integer.MAX_VALUE
149            return Integer.MAX_VALUE;
150        }
151        long estimate;
152        // if one is infinite the intersection is the other.
153        if (Double.isInfinite(eThis)) {
154            estimate = Math.round(eOther);
155        } else if (Double.isInfinite(eOther)) {
156            estimate = Math.round(eThis);
157        } else {
158            final BloomFilter union = this.copy();
159            union.merge(other);
160            final double eUnion = getShape().estimateN(union.cardinality());
161            if (Double.isInfinite(eUnion)) {
162                throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
163            }
164            // maximum estimate value using integer values is: 46144189292 thus
165            // eThis + eOther can not overflow the long value.
166            estimate = Math.round(eThis + eOther - eUnion);
167            estimate = estimate < 0 ? 0 : estimate;
168        }
169        return estimate>Integer.MAX_VALUE?Integer.MAX_VALUE:(int) estimate;
170    }
171
172    /**
173     * Estimates the number of items in the Bloom filter.
174     *
175     * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
176     * shape and cardinality of this filter.</p>
177     *
178     * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
179     * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
180     *
181     * <p><em>Note:</em></p>
182     * <ul>
183     * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
184     * <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
185     * </ul>
186     *
187     * @return an estimate of the number of items in the bloom filter.  Will return Integer.MAX_VALUE if the
188     * estimate is larger than Integer.MAX_VALUE.
189     * @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
190     * @see Shape#estimateN(int)
191     * @see Shape
192     */
193    default int estimateN() {
194        final double d = getShape().estimateN(cardinality());
195        if (Double.isInfinite(d)) {
196            return Integer.MAX_VALUE;
197        }
198        if (Double.isNaN(d)) {
199            throw new IllegalArgumentException("Cardinality too large: " + cardinality());
200        }
201        final long l = Math.round(d);
202        return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
203    }
204
205    /**
206     * Estimates the number of items in the union of this Bloom filter with the other bloom filter.
207     *
208     * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
209     * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p>
210     *
211     * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape.  If called on Bloom
212     * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
213     * may be thrown.</em></p>
214     *
215     * @param other The other Bloom filter
216     * @return an estimate of the number of items in the union.  Will return Integer.MAX_VALUE if the
217     * estimate is larger than Integer.MAX_VALUE.
218     * @see #estimateN()
219     * @see Shape
220     */
221    default int estimateUnion(final BloomFilter other) {
222        Objects.requireNonNull(other, "other");
223        final BloomFilter cpy = this.copy();
224        cpy.merge(other);
225        return cpy.estimateN();
226    }
227
228    /**
229     * Gets the shape that was used when the filter was built.
230     * @return The shape the filter was built with.
231     */
232    Shape getShape();
233
234    // Counting Operations
235
236    /**
237     * Determines if all the bits are off. This is equivalent to
238     * {@code cardinality() == 0}.
239     *
240     * <p>
241     * <em>Note: This method is optimised for non-sparse filters.</em> Implementers
242     * are encouraged to implement faster checks if possible.
243     * </p>
244     *
245     * @return {@code true} if no bits are enabled, {@code false} otherwise.
246     */
247    default boolean isEmpty() {
248        return forEachBitMap(y -> y == 0);
249    }
250
251    /**
252     * Determines if the bloom filter is "full".
253     *
254     * <p>Full is defined as having no unset bits.</p>
255     *
256     * @return {@code true} if the filter is full, {@code false} otherwise.
257     */
258    default boolean isFull() {
259        return cardinality() == getShape().getNumberOfBits();
260    }
261
262    /**
263     * Merges the specified hasher into this Bloom filter. Specifically all
264     * bit indexes that are identified by the {@code producer} will be enabled in this filter.
265     *
266     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
267     * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes
268     * enabled in the {@code producer}.</em>  This state may occur in complex Bloom filter implementations like
269     * counting Bloom filters.</p>
270     *
271     * @param bitMapProducer The producer to merge.
272     * @return true if the merge was successful
273     * @throws IllegalArgumentException if producer sends illegal value.
274     */
275    boolean merge(BitMapProducer bitMapProducer);
276
277    /**
278     * Merges the specified Bloom filter into this Bloom filter.
279     *
280     * <p>Specifically all
281     * bit indexes that are identified by the {@code other} will be enabled in this filter.</p>
282     *
283     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
284     * enabled. A {@code false} result indicates that this filter may or may not contain
285     * the {@code other} Bloom filter.</em>  This state may occur in complex Bloom filter implementations like
286     * counting Bloom filters.</p>
287     *
288     * @param other The bloom filter to merge into this one.
289     * @return true if the merge was successful
290     */
291    default boolean merge(final BloomFilter other) {
292        return (characteristics() & SPARSE) != 0 ? merge((IndexProducer) other) : merge((BitMapProducer) other);
293    }
294
295    /**
296     * Merges the specified hasher into this Bloom filter. Specifically all
297     * bit indexes that are identified by the {@code hasher} will be enabled in this filter.
298     *
299     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
300     * enabled. A {@code false} result indicates that this filter may or may not contain
301     * the {@code hasher} values.</em>  This state may occur in complex Bloom filter implementations like
302     * counting Bloom filters.</p>
303     *
304     * @param hasher The hasher to merge.
305     * @return true if the merge was successful
306     * @throws IllegalArgumentException if hasher produces an illegal value.
307     */
308    default boolean merge(final Hasher hasher) {
309        Objects.requireNonNull(hasher, "hasher");
310        return merge(hasher.indices(getShape()));
311    }
312
313    /**
314     * Merges the specified IndexProducer into this Bloom filter. Specifically all
315     * bit indexes that are identified by the {@code producer} will be enabled in this filter.
316     *
317     * <p><em>Note: This method should return {@code true} even if no additional bit indexes were
318     * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of
319     * the {@code producer}.</em>  This state may occur in complex Bloom filter implementations like
320     * counting Bloom filters.</p>
321     *
322     * @param indexProducer The IndexProducer to merge.
323     * @return true if the merge was successful
324     * @throws IllegalArgumentException if producer sends illegal value.
325     */
326    boolean merge(IndexProducer indexProducer);
327
328    /**
329     * Most Bloom filters create unique IndexProducers.
330     */
331    @Override
332    default IndexProducer uniqueIndices() {
333        return this;
334    }
335}