001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019import java.util.Objects; 020 021/** 022 * The interface that describes a Bloom filter. 023 * <p> 024 * <em>See implementation notes for {@link BitMapExtractor} and {@link IndexExtractor}.</em> 025 * </p> 026 * 027 * @param <T> The BloomFilter type. 028 * @see BitMapExtractor 029 * @see IndexExtractor 030 * @since 4.5.0-M1 031 */ 032public interface BloomFilter<T extends BloomFilter<T>> extends IndexExtractor, BitMapExtractor { 033 034 /** 035 * The sparse characteristic used to determine the best method for matching: {@value}. 036 * <p> 037 * For `sparse` implementations the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations the 038 * {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier. 039 * </p> 040 */ 041 int SPARSE = 0x1; 042 043 /** 044 * Gets the cardinality (number of enabled bits) of this Bloom filter. 045 * 046 * <p>This is also known as the Hamming value or Hamming number.</p> 047 * 048 * @return the cardinality of this filter 049 */ 050 int cardinality(); 051 052 // Query Operations 053 054 /** 055 * Gets the characteristics of the filter. 056 * <p> 057 * Characteristics are defined as bits within the characteristics integer. 058 * </p> 059 * 060 * @return the characteristics for this bloom filter. 061 */ 062 int characteristics(); 063 064 /** 065 * Clears the filter to by resetting it to its initial, unpopulated state. 066 */ 067 void clear(); 068 069 /** 070 * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the 071 * bitMapExtractor. 072 * 073 * @param bitMapExtractor the {@code BitMapExtractor} to provide the bit maps. 074 * @return {@code true} if this filter is enabled for all bits specified by the bit maps 075 */ 076 default boolean contains(final BitMapExtractor bitMapExtractor) { 077 return processBitMapPairs(bitMapExtractor, (x, y) -> (x & y) == y); 078 } 079 080 /** 081 * Returns {@code true} if this filter contains the specified filter. 082 * 083 * <p>Specifically this 084 * returns {@code true} if this filter is enabled for all bits that are enabled in the 085 * {@code other} filter. Using the bit representations this is 086 * effectively {@code (this AND other) == other}.</p> 087 * 088 * @param other the other Bloom filter 089 * @return true if all enabled bits in the other filter are enabled in this filter. 090 */ 091 default boolean contains(final BloomFilter<?> other) { 092 Objects.requireNonNull(other, "other"); 093 return (characteristics() & SPARSE) != 0 ? contains((IndexExtractor) other) : contains((BitMapExtractor) other); 094 } 095 096 /** 097 * Returns {@code true} if this filter contains the bits specified in the hasher. 098 * 099 * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes 100 * identified by the {@code hasher}. Using the bit map representations this is 101 * effectively {@code (this AND hasher) == hasher}.</p> 102 * 103 * @param hasher the hasher to provide the indexes 104 * @return true if this filter is enabled for all bits specified by the hasher 105 */ 106 default boolean contains(final Hasher hasher) { 107 Objects.requireNonNull(hasher, "Hasher"); 108 final Shape shape = getShape(); 109 return contains(hasher.indices(shape)); 110 } 111 112 /** 113 * Returns {@code true} if this filter contains the indices specified IndexExtractor. 114 * 115 * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes 116 * identified by the {@code IndexExtractor}.</p> 117 * 118 * @param indexExtractor the IndexExtractor to provide the indexes 119 * @return {@code true} if this filter is enabled for all bits specified by the IndexExtractor 120 */ 121 boolean contains(IndexExtractor indexExtractor); 122 123 /** 124 * Creates a new instance of this {@link BloomFilter} with the same properties as the current one. 125 * 126 * @return a copy of this {@link BloomFilter}. 127 */ 128 T copy(); 129 130 // update operations 131 132 /** 133 * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter. 134 * 135 * <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both 136 * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p> 137 * 138 * <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape. If called on Bloom 139 * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException} 140 * may be thrown.</em></p> 141 * 142 * @param other The other Bloom filter 143 * @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned. 144 * @throws IllegalArgumentException if the estimated N for the union of the filters is infinite. 145 * @see #estimateN() 146 * @see Shape 147 */ 148 default int estimateIntersection(final BloomFilter<?> other) { 149 Objects.requireNonNull(other, "other"); 150 final double eThis = getShape().estimateN(cardinality()); 151 final double eOther = getShape().estimateN(other.cardinality()); 152 if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) { 153 // if both are infinite the union is infinite and we return Integer.MAX_VALUE 154 return Integer.MAX_VALUE; 155 } 156 long estimate; 157 // if one is infinite the intersection is the other. 158 if (Double.isInfinite(eThis)) { 159 estimate = Math.round(eOther); 160 } else if (Double.isInfinite(eOther)) { 161 estimate = Math.round(eThis); 162 } else { 163 final T union = this.copy(); 164 union.merge(other); 165 final double eUnion = getShape().estimateN(union.cardinality()); 166 if (Double.isInfinite(eUnion)) { 167 throw new IllegalArgumentException("The estimated N for the union of the filters is infinite"); 168 } 169 // maximum estimate value using integer values is: 46144189292 thus 170 // eThis + eOther cannot overflow the long value. 171 estimate = Math.round(eThis + eOther - eUnion); 172 estimate = estimate < 0 ? 0 : estimate; 173 } 174 return estimate > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) estimate; 175 } 176 177 /** 178 * Estimates the number of items in the Bloom filter. 179 * 180 * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the 181 * shape and cardinality of this filter.</p> 182 * 183 * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter 184 * by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p> 185 * 186 * <p><em>Note:</em></p> 187 * <ul> 188 * <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li> 189 * <li>if cardinality > numberOfBits, then an IllegalArgumentException is thrown.</li> 190 * </ul> 191 * 192 * @return an estimate of the number of items in the bloom filter. Will return Integer.MAX_VALUE if the 193 * estimate is larger than Integer.MAX_VALUE. 194 * @throws IllegalArgumentException if the cardinality is > numberOfBits as defined in Shape. 195 * @see Shape#estimateN(int) 196 * @see Shape 197 */ 198 default int estimateN() { 199 final double d = getShape().estimateN(cardinality()); 200 if (Double.isInfinite(d)) { 201 return Integer.MAX_VALUE; 202 } 203 if (Double.isNaN(d)) { 204 throw new IllegalArgumentException("Cardinality too large: " + cardinality()); 205 } 206 final long l = Math.round(d); 207 return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l; 208 } 209 210 /** 211 * Estimates the number of items in the union of this Bloom filter with the other bloom filter. 212 * 213 * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either 214 * of the filters by rounding the value from the calculation described in the {@link Shape} class Javadoc.</p> 215 * 216 * <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape. If called on Bloom 217 * filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException} 218 * may be thrown.</em></p> 219 * 220 * @param other The other Bloom filter 221 * @return an estimate of the number of items in the union. Will return Integer.MAX_VALUE if the 222 * estimate is larger than Integer.MAX_VALUE. 223 * @see #estimateN() 224 * @see Shape 225 */ 226 default int estimateUnion(final BloomFilter<?> other) { 227 Objects.requireNonNull(other, "other"); 228 final T copy = this.copy(); 229 copy.merge(other); 230 return copy.estimateN(); 231 } 232 233 /** 234 * Gets the shape that was used when the filter was built. 235 * @return The shape the filter was built with. 236 */ 237 Shape getShape(); 238 239 // Counting Operations 240 241 /** 242 * Determines if all the bits are off. This is equivalent to 243 * {@code cardinality() == 0}. 244 * 245 * <p> 246 * <em>Note: This method is optimized for non-sparse filters.</em> Implementers 247 * are encouraged to implement faster checks if possible. 248 * </p> 249 * 250 * @return {@code true} if no bits are enabled, {@code false} otherwise. 251 */ 252 default boolean isEmpty() { 253 return processBitMaps(y -> y == 0); 254 } 255 256 /** 257 * Determines if the bloom filter is "full". 258 * 259 * <p>Full is defined as having no unset bits.</p> 260 * 261 * @return {@code true} if the filter is full, {@code false} otherwise. 262 */ 263 default boolean isFull() { 264 return cardinality() == getShape().getNumberOfBits(); 265 } 266 267 /** 268 * Merges the specified hasher into this Bloom filter. Specifically all 269 * bit indexes that are identified by the {@code bitMapExtractor} will be enabled in this filter. 270 * 271 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were 272 * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes 273 * enabled in the {@code bitMapExtractor}.</em> This state may occur in complex Bloom filter implementations like 274 * counting Bloom filters.</p> 275 * 276 * @param bitMapExtractor The BitMapExtractor to merge. 277 * @return true if the merge was successful 278 * @throws IllegalArgumentException if bitMapExtractor sends illegal value. 279 */ 280 boolean merge(BitMapExtractor bitMapExtractor); 281 282 /** 283 * Merges the specified Bloom filter into this Bloom filter. 284 * 285 * <p>Specifically all 286 * bit indexes that are identified by the {@code other} will be enabled in this filter.</p> 287 * 288 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were 289 * enabled. A {@code false} result indicates that this filter may or may not contain 290 * the {@code other} Bloom filter.</em> This state may occur in complex Bloom filter implementations like 291 * counting Bloom filters.</p> 292 * 293 * @param other The bloom filter to merge into this one. 294 * @return true if the merge was successful 295 */ 296 default boolean merge(final BloomFilter<?> other) { 297 return (characteristics() & SPARSE) != 0 ? merge((IndexExtractor) other) : merge((BitMapExtractor) other); 298 } 299 300 /** 301 * Merges the specified hasher into this Bloom filter. Specifically all 302 * bit indexes that are identified by the {@code hasher} will be enabled in this filter. 303 * 304 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were 305 * enabled. A {@code false} result indicates that this filter may or may not contain 306 * the {@code hasher} values.</em> This state may occur in complex Bloom filter implementations like 307 * counting Bloom filters.</p> 308 * 309 * @param hasher The hasher to merge. 310 * @return true if the merge was successful 311 * @throws IllegalArgumentException if hasher produces an illegal value. 312 */ 313 default boolean merge(final Hasher hasher) { 314 Objects.requireNonNull(hasher, "hasher"); 315 return merge(hasher.indices(getShape())); 316 } 317 318 /** 319 * Merges the specified IndexExtractor into this Bloom filter. Specifically all 320 * bit indexes that are identified by the {@code indexExtractor} will be enabled in this filter. 321 * 322 * <p><em>Note: This method should return {@code true} even if no additional bit indexes were 323 * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of 324 * the {@code indexExtractor}.</em> This state may occur in complex Bloom filter implementations like 325 * counting Bloom filters.</p> 326 * 327 * @param indexExtractor The IndexExtractor to merge. 328 * @return true if the merge was successful 329 * @throws IllegalArgumentException if indexExtractor sends illegal value. 330 */ 331 boolean merge(IndexExtractor indexExtractor); 332 333 /** 334 * Most Bloom filters create unique IndexExtractors. 335 */ 336 @Override 337 default IndexExtractor uniqueIndices() { 338 return this; 339 } 340}