View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Objects;
20  
21  /**
22   * The interface that describes a Bloom filter that associates a count with each
23   * bit index rather than a bit.  This allows reversal of merge operations with
24   * remove operations.
25   *
26   * <p>A counting Bloom filter is expected to function identically to a standard
27   * Bloom filter that is the merge of all the Bloom filters that have been added
28   * to and not later subtracted from the counting Bloom filter. The functional
29   * state of a CountingBloomFilter at the start and end of a series of merge and
30   * subsequent remove operations of the same Bloom filters, irrespective of
31   * remove order, is expected to be the same.</p>
32   *
33   * <p>Removal of a filter that has not previously been merged results in an
34   * invalid state where the cells no longer represent a sum of merged Bloom
35   * filters. It is impossible to validate merge and remove exactly without
36   * explicitly storing all filters. Consequently such an operation may go
37   * undetected. The CountingBloomFilter maintains a state flag that is used as a
38   * warning that an operation was performed that resulted in invalid cells and
39   * thus an invalid state. For example this may occur if a cell for an index was
40   * set to negative following a remove operation.</p>
41   *
42   * <p>Implementations should document the expected state of the filter after an
43   * operation that generates invalid cells, and any potential recovery options.
44   * An implementation may support a reversal of the operation to restore the
45   * state to that prior to the operation. In the event that invalid cells are
46   * adjusted to a valid range then it should be documented if there has been
47   * irreversible information loss.</p>
48   *
49   * <p>Implementations may choose to throw an exception during an operation that
50   * generates invalid cells. Implementations should document the expected state
51   * of the filter after such an operation. For example are the cells not updated,
52   * partially updated or updated entirely before the exception is raised.</p>
53   *
54   * @see CellExtractor
55   * @since 4.5.0-M1
56   */
57  public interface CountingBloomFilter extends BloomFilter<CountingBloomFilter>, CellExtractor {
58  
59      // Query Operations
60  
61      /**
62       * Adds the specified CellExtractor to this Bloom filter.
63       *
64       * <p>Specifically
65       * all cells for the indexes identified by the {@code other} will be incremented
66       * by their corresponding values in the {@code other}.</p>
67       *
68       * <p>This method will return {@code true} if the filter is valid after the operation.</p>
69       *
70       * @param other the CellExtractor to add.
71       * @return {@code true} if the addition was successful and the state is valid
72       * @see #isValid()
73       * @see #subtract(CellExtractor)
74       */
75      boolean add(CellExtractor other);
76  
77      /**
78       * Gets the maximum allowable value for a cell count in this Counting filter.
79       *
80       * @return the maximum allowable value for a cell count in this Counting filter.
81       */
82      int getMaxCell();
83  
84      /**
85       * Determines the maximum number of times the BitMapExtractor could have been merged into this counting filter.
86       *
87       * @param bitMapExtractor the BitMapExtractor to provide the indices.
88       * @return the maximum number of times the BitMapExtractor could have been inserted.
89       */
90      default int getMaxInsert(final BitMapExtractor bitMapExtractor) {
91          if (!contains(bitMapExtractor)) {
92              return 0;
93          }
94          final long[] bitMaps = bitMapExtractor.asBitMapArray();
95          final int[] max = { Integer.MAX_VALUE };
96          processCells((x, y) -> {
97              if ((bitMaps[BitMaps.getLongIndex(x)] & BitMaps.getLongBit(x)) != 0) {
98                  max[0] = max[0] <= y ? max[0] : y;
99              }
100             return true;
101         });
102         return max[0];
103     }
104 
105     /**
106      * Determines the maximum number of times the Bloom filter could have been merged into this counting filter.
107      *
108      * @param bloomFilter the Bloom filter the check for.
109      * @return the maximum number of times the Bloom filter could have been inserted.
110      */
111     default int getMaxInsert(final BloomFilter<?> bloomFilter) {
112         return getMaxInsert((BitMapExtractor) bloomFilter);
113     }
114 
115     /**
116      * Determines the maximum number of times the Cell Extractor could have been added.
117      *
118      * @param cellExtractor the extractor of cells.
119      * @return the maximum number of times the CellExtractor could have been inserted.
120      */
121     int getMaxInsert(CellExtractor cellExtractor);
122 
123     /**
124      * Determines the maximum number of times the Hasher could have been merged into this counting filter.
125      *
126      * @param hasher the Hasher to provide the indices.
127      * @return the maximum number of times the hasher could have been inserted.
128      */
129     default int getMaxInsert(final Hasher hasher) {
130         return getMaxInsert(hasher.indices(getShape()));
131     }
132 
133     /**
134      * Determines the maximum number of times the IndexExtractor could have been merged into this counting filter.
135      * <p>
136      * To determine how many times an indexExtractor could have been added create a CellExtractor from the indexExtractor and check that
137      * </p>
138      *
139      * @param indexExtractor the extractor to drive the count check.
140      * @return the maximum number of times the IndexExtractor could have been inserted.
141      * @see #getMaxInsert(CellExtractor)
142      */
143     default int getMaxInsert(final IndexExtractor indexExtractor) {
144         return getMaxInsert(CellExtractor.from(indexExtractor.uniqueIndices()));
145     }
146 
147     /**
148      * Returns {@code true} if the internal state is valid.
149      *
150      * <p>This flag is a warning that an addition or
151      * subtraction of cells from this filter resulted in an invalid cell for one or more
152      * indexes. For example this may occur if a cell for an index was
153      * set to negative following a subtraction operation, or overflows the value specified by {@code getMaxCell()} following an
154      * addition operation.</p>
155      *
156      * <p>A counting Bloom filter that has an invalid state is no longer ensured to function
157      * identically to a standard Bloom filter instance that is the merge of all the Bloom filters
158      * that have been added to and not later subtracted from this counting Bloom filter.</p>
159      *
160      * <p>Note: The change to an invalid state may or may not be reversible. Implementations
161      * are expected to document their policy on recovery from an addition or removal operation
162      * that generated an invalid state.</p>
163      *
164      * @return {@code true} if the state is valid
165      */
166     boolean isValid();
167 
168     /**
169      * Merges the specified BitMap extractor into this Bloom filter.
170      *
171      * <p>Specifically: all cells for the indexes identified by the {@code bitMapExtractor} will be incremented by 1.</p>
172      *
173      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
174      *
175      * @param bitMapExtractor the BitMapExtractor
176      * @return {@code true} if the removal was successful and the state is valid
177      * @see #isValid()
178      * @see #add(CellExtractor)
179      */
180     @Override
181     default boolean merge(final BitMapExtractor bitMapExtractor) {
182         return merge(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
183     }
184 
185     /**
186      * Merges the specified Bloom filter into this Bloom filter.
187      *
188      * <p>Specifically: all cells for the indexes identified by the {@code other} filter will be incremented by 1.</p>
189      *
190      * <p>Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an
191      * IndexExtractor.</p>
192      *
193      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
194      *
195      * @param other the other Bloom filter
196      * @return {@code true} if the removal was successful and the state is valid
197      * @see #isValid()
198      * @see #add(CellExtractor)
199      */
200     @Override
201     default boolean merge(final BloomFilter<?> other) {
202         Objects.requireNonNull(other, "other");
203         return merge((IndexExtractor) other);
204     }
205 
206     /**
207      * Merges the specified Hasher into this Bloom filter.
208      *
209      * <p>Specifically: all cells for the unique indexes identified by the {@code hasher} will be incremented by 1.</p>
210      *
211      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
212      *
213      * @param hasher the hasher
214      * @return {@code true} if the removal was successful and the state is valid
215      * @see #isValid()
216      * @see #add(CellExtractor)
217      */
218     @Override
219     default boolean merge(final Hasher hasher) {
220         Objects.requireNonNull(hasher, "hasher");
221         return merge(hasher.indices(getShape()));
222     }
223 
224     /**
225      * Merges the specified index extractor into this Bloom filter.
226      *
227      * <p>Specifically: all unique cells for the indices identified by the {@code indexExtractor} will be incremented by 1.</p>
228      *
229      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
230      *
231      * <p>Notes:</p>
232      * <ul>
233      * <li>If indices that are returned multiple times should be incremented multiple times convert the IndexExtractor
234      * to a CellExtractor and add that.</li>
235      * <li>Implementations should throw {@code IllegalArgumentException} and no other exception on bad input.</li>
236      * </ul>
237      * @param indexExtractor the IndexExtractor
238      * @return {@code true} if the removal was successful and the state is valid
239      * @see #isValid()
240      * @see #add(CellExtractor)
241      */
242     @Override
243     default boolean merge(final IndexExtractor indexExtractor) {
244         Objects.requireNonNull(indexExtractor, "indexExtractor");
245         try {
246             return add(CellExtractor.from(indexExtractor.uniqueIndices()));
247         } catch (final IndexOutOfBoundsException e) {
248             throw new IllegalArgumentException(
249                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
250         }
251     }
252 
253     /**
254      * Removes the specified BitMapExtractor from this Bloom filter.
255      *
256      * <p>Specifically all cells for the indices produced by the {@code bitMapExtractor} will be
257      * decremented by 1.</p>
258      *
259      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
260      *
261      * @param bitMapExtractor the BitMapExtractor to provide the indexes
262      * @return {@code true} if the removal was successful and the state is valid
263      * @see #isValid()
264      * @see #subtract(CellExtractor)
265      */
266     default boolean remove(final BitMapExtractor bitMapExtractor) {
267         return remove(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
268     }
269 
270     /**
271      * Removes the specified Bloom filter from this Bloom filter.
272      *
273      * <p>Specifically: all cells for the indexes identified by the {@code other} filter will be decremented by 1.</p>
274      *
275      * <p>Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an
276      * IndexExtractor.</p>
277      *
278      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
279      *
280      * @param other the other Bloom filter
281      * @return {@code true} if the removal was successful and the state is valid
282      * @see #isValid()
283      * @see #subtract(CellExtractor)
284      */
285     default boolean remove(final BloomFilter<?> other) {
286         return remove((IndexExtractor) other);
287     }
288 
289     /**
290      * Removes the unique values from the specified hasher from this Bloom filter.
291      *
292      * <p>Specifically all cells for the unique indices produced by the {@code hasher} will be
293      * decremented by 1.</p>
294      *
295      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
296      *
297      * @param hasher the hasher to provide the indexes
298      * @return {@code true} if the removal was successful and the state is valid
299      * @see #isValid()
300      * @see #subtract(CellExtractor)
301      */
302     default boolean remove(final Hasher hasher) {
303         Objects.requireNonNull(hasher, "hasher");
304         return remove(hasher.indices(getShape()));
305     }
306 
307     /**
308      * Removes the values from the specified IndexExtractor from the Bloom filter from this Bloom filter.
309      *
310      * <p>Specifically all cells for the unique indices produced by the {@code hasher} will be
311      * decremented by 1.</p>
312      *
313      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
314      *
315      * <p>Note: If indices that are returned multiple times should be decremented multiple times convert the IndexExtractor
316      * to a CellExtractor and subtract that.</p>
317      *
318      * @param indexExtractor the IndexExtractor to provide the indexes
319      * @return {@code true} if the removal was successful and the state is valid
320      * @see #isValid()
321      * @see #subtract(CellExtractor)
322      */
323     default boolean remove(final IndexExtractor indexExtractor) {
324         Objects.requireNonNull(indexExtractor, "indexExtractor");
325         try {
326             return subtract(CellExtractor.from(indexExtractor.uniqueIndices()));
327         } catch (final IndexOutOfBoundsException e) {
328             throw new IllegalArgumentException(
329                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()));
330         }
331     }
332 
333     /**
334      * Adds the specified CellExtractor to this Bloom filter.
335      *
336      * <p>Specifically
337      * all cells for the indexes identified by the {@code other} will be decremented
338      * by their corresponding values in the {@code other}.</p>
339      *
340      * <p>This method will return true if the filter is valid after the operation.</p>
341      *
342      * @param other the CellExtractor to subtract.
343      * @return {@code true} if the subtraction was successful and the state is valid
344      * @see #isValid()
345      * @see #add(CellExtractor)
346      */
347     boolean subtract(CellExtractor other);
348 
349     /**
350      * The default implementation is a no-op since the counting bloom filter returns an unique IndexExtractor by default.
351      * @return this counting Bloom filter.
352      */
353     @Override
354     default IndexExtractor uniqueIndices() {
355         return this;
356     }
357 }