View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Objects;
20  
21  /**
22   * The interface that describes a Bloom filter that associates a count with each
23   * bit index rather than a bit.  This allows reversal of merge operations with
24   * remove operations.
25   *
26   * <p>A counting Bloom filter is expected to function identically to a standard
27   * Bloom filter that is the merge of all the Bloom filters that have been added
28   * to and not later subtracted from the counting Bloom filter. The functional
29   * state of a CountingBloomFilter at the start and end of a series of merge and
30   * subsequent remove operations of the same Bloom filters, irrespective of
31   * remove order, is expected to be the same.</p>
32   *
33   * <p>Removal of a filter that has not previously been merged results in an
34   * invalid state where the cells no longer represent a sum of merged Bloom
35   * filters. It is impossible to validate merge and remove exactly without
36   * explicitly storing all filters. Consequently such an operation may go
37   * undetected. The CountingBloomFilter maintains a state flag that is used as a
38   * warning that an operation was performed that resulted in invalid cells and
39   * thus an invalid state. For example this may occur if a cell for an index was
40   * set to negative following a remove operation.</p>
41   *
42   * <p>Implementations should document the expected state of the filter after an
43   * operation that generates invalid cells, and any potential recovery options.
44   * An implementation may support a reversal of the operation to restore the
45   * state to that prior to the operation. In the event that invalid cells are
46   * adjusted to a valid range then it should be documented if there has been
47   * irreversible information loss.</p>
48   *
49   * <p>Implementations may choose to throw an exception during an operation that
50   * generates invalid cells. Implementations should document the expected state
51   * of the filter after such an operation. For example are the cells not updated,
52   * partially updated or updated entirely before the exception is raised.</p>
53   *
54   * @see CellProducer
55   * @since 4.5
56   */
57  public interface CountingBloomFilter extends BloomFilter, CellProducer {
58  
59      // Query Operations
60  
61      /**
62       * Adds the specified CellProducer to this Bloom filter.
63       *
64       * <p>Specifically
65       * all cells for the indexes identified by the {@code other} will be incremented
66       * by their corresponding values in the {@code other}.</p>
67       *
68       * <p>This method will return {@code true} if the filter is valid after the operation.</p>
69       *
70       * @param other the CellProducer to add.
71       * @return {@code true} if the addition was successful and the state is valid
72       * @see #isValid()
73       * @see #subtract(CellProducer)
74       */
75      boolean add(CellProducer other);
76  
77      /**
78       * Creates a new instance of the CountingBloomFilter with the same properties as the current one.
79       * @return a copy of this CountingBloomFilter
80       */
81      @Override
82      CountingBloomFilter copy();
83  
84      /**
85       * Returns the maximum allowable value for a cell count in this Counting filter.
86       * @return the maximum allowable value for a cell count in this Counting filter.
87       */
88      int getMaxCell();
89  
90      /**
91       * Determines the maximum number of times the BitMapProducer could have been merged into this
92       * counting filter.
93       * @param bitMapProducer the BitMapProducer to provide the indices.
94       * @return the maximum number of times the BitMapProducer could have been inserted.
95       */
96      default int getMaxInsert(final BitMapProducer bitMapProducer) {
97          if (!contains(bitMapProducer)) {
98              return 0;
99          }
100         final long[] bitMaps = bitMapProducer.asBitMapArray();
101         final int[] max = { Integer.MAX_VALUE };
102         forEachCell((x, y) -> {
103             if ((bitMaps[BitMap.getLongIndex(x)] & BitMap.getLongBit(x)) != 0) {
104                 max[0] = max[0] <= y ? max[0] : y;
105             }
106             return true;
107         });
108         return max[0];
109     }
110 
111     /**
112      * Determines the maximum number of times the Bloom filter could have been merged
113      * into this counting filter.
114      * @param bloomFilter the Bloom filter the check for.
115      * @return the maximum number of times the Bloom filter could have been inserted.
116      */
117     default int getMaxInsert(final BloomFilter bloomFilter) {
118         return getMaxInsert((BitMapProducer) bloomFilter);
119     }
120 
121     /**
122      * Determines the maximum number of times the Cell Producer could have been add.
123      * @param cellProducer the producer of cells.
124      * @return the maximum number of times the CellProducer could have been inserted.
125      */
126     int getMaxInsert(CellProducer cellProducer);
127 
128     /**
129      * Determines the maximum number of times the Hasher could have been merged into this
130      * counting filter.
131      * @param hasher the Hasher to provide the indices.
132      * @return the maximum number of times the hasher could have been inserted.
133      */
134     default int getMaxInsert(final Hasher hasher) {
135         return getMaxInsert(hasher.indices(getShape()));
136     }
137 
138     // Modification Operations
139 
140     /**
141      * Determines the maximum number of times the IndexProducer could have been merged
142      * into this counting filter.
143      * <p>To determine how many times an indxProducer could have been added create a CellProducer
144      * from the indexProducer and check that</p>
145      * @param idxProducer the producer to drive the count check.
146      * @return the maximum number of times the IndexProducer could have been inserted.
147      * @see #getMaxInsert(CellProducer)
148      */
149     default int getMaxInsert(final IndexProducer idxProducer) {
150         return getMaxInsert(CellProducer.from(idxProducer.uniqueIndices()) );
151     }
152 
153     /**
154      * Returns {@code true} if the internal state is valid.
155      *
156      * <p>This flag is a warning that an addition or
157      * subtraction of cells from this filter resulted in an invalid cell for one or more
158      * indexes. For example this may occur if a cell for an index was
159      * set to negative following a subtraction operation, or overflows the value specified by {@code getMaxCell()} following an
160      * addition operation.</p>
161      *
162      * <p>A counting Bloom filter that has an invalid state is no longer ensured to function
163      * identically to a standard Bloom filter instance that is the merge of all the Bloom filters
164      * that have been added to and not later subtracted from this counting Bloom filter.</p>
165      *
166      * <p>Note: The change to an invalid state may or may not be reversible. Implementations
167      * are expected to document their policy on recovery from an addition or removal operation
168      * that generated an invalid state.</p>
169      *
170      * @return {@code true} if the state is valid
171      */
172     boolean isValid();
173 
174     /**
175      * Merges the specified BitMap producer into this Bloom filter.
176      *
177      * <p>Specifically: all cells for the indexes identified by the {@code bitMapProducer} will be incremented by 1.</p>
178      *
179      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
180      *
181      * @param bitMapProducer the BitMapProducer
182      * @return {@code true} if the removal was successful and the state is valid
183      * @see #isValid()
184      * @see #add(CellProducer)
185      */
186     @Override
187     default boolean merge(final BitMapProducer bitMapProducer) {
188         Objects.requireNonNull(bitMapProducer, "bitMapProducer");
189         return merge(IndexProducer.fromBitMapProducer(bitMapProducer));
190     }
191 
192     /**
193      * Merges the specified Bloom filter into this Bloom filter.
194      *
195      * <p>Specifically: all cells for the indexes identified by the {@code other} filter will be incremented by 1.</p>
196      *
197      * <p>Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an
198      * IndexProducer.</p>
199      *
200      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
201      *
202      * @param other the other Bloom filter
203      * @return {@code true} if the removal was successful and the state is valid
204      * @see #isValid()
205      * @see #add(CellProducer)
206      */
207     @Override
208     default boolean merge(final BloomFilter other) {
209         Objects.requireNonNull(other, "other");
210         return merge((IndexProducer) other);
211     }
212 
213     /**
214      * Merges the specified Hasher into this Bloom filter.
215      *
216      * <p>Specifically: all cells for the unique indexes identified by the {@code hasher} will be incremented by 1.</p>
217      *
218      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
219      *
220      * @param hasher the hasher
221      * @return {@code true} if the removal was successful and the state is valid
222      * @see #isValid()
223      * @see #add(CellProducer)
224      */
225     @Override
226     default boolean merge(final Hasher hasher) {
227         Objects.requireNonNull(hasher, "hasher");
228         return merge(hasher.indices(getShape()));
229     }
230 
231     /**
232      * Merges the specified index producer into this Bloom filter.
233      *
234      * <p>Specifically: all unique cells for the indices identified by the {@code indexProducer} will be incremented by 1.</p>
235      *
236      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
237      *
238      * <p>Note: If indices that are returned multiple times should be incremented multiple times convert the IndexProducer
239      * to a CellProducer and add that.</p>
240      *
241      * @param indexProducer the IndexProducer
242      * @return {@code true} if the removal was successful and the state is valid
243      * @see #isValid()
244      * @see #add(CellProducer)
245      */
246     @Override
247     default boolean merge(final IndexProducer indexProducer) {
248         Objects.requireNonNull(indexProducer, "indexProducer");
249         try {
250             return add(CellProducer.from(indexProducer.uniqueIndices()));
251         } catch (final IndexOutOfBoundsException e) {
252             throw new IllegalArgumentException(
253                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
254         }
255     }
256 
257     /**
258      * Removes the specified BitMapProducer from this Bloom filter.
259      *
260      * <p>Specifically all cells for the indices produced by the {@code bitMapProducer} will be
261      * decremented by 1.</p>
262      *
263      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
264      *
265      * @param bitMapProducer the BitMapProducer to provide the indexes
266      * @return {@code true} if the removal was successful and the state is valid
267      * @see #isValid()
268      * @see #subtract(CellProducer)
269      */
270     default boolean remove(final BitMapProducer bitMapProducer) {
271         Objects.requireNonNull(bitMapProducer, "bitMapProducer");
272         return remove(IndexProducer.fromBitMapProducer(bitMapProducer));
273     }
274 
275     /**
276      * Removes the specified Bloom filter from this Bloom filter.
277      *
278      * <p>Specifically: all cells for the indexes identified by the {@code other} filter will be decremented by 1.</p>
279      *
280      * <p>Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an
281      * IndexProducer.</p>
282      *
283      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
284      *
285      * @param other the other Bloom filter
286      * @return {@code true} if the removal was successful and the state is valid
287      * @see #isValid()
288      * @see #subtract(CellProducer)
289      */
290     default boolean remove(final BloomFilter other) {
291         Objects.requireNonNull(other, "other");
292         return remove((IndexProducer) other);
293     }
294 
295     /**
296      * Removes the unique values from the specified hasher from this Bloom filter.
297      *
298      * <p>Specifically all cells for the unique indices produced by the {@code hasher} will be
299      * decremented by 1.</p>
300      *
301      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
302      *
303      * @param hasher the hasher to provide the indexes
304      * @return {@code true} if the removal was successful and the state is valid
305      * @see #isValid()
306      * @see #subtract(CellProducer)
307      */
308     default boolean remove(final Hasher hasher) {
309         Objects.requireNonNull(hasher, "hasher");
310         return remove(hasher.indices(getShape()));
311     }
312 
313     /**
314      * Removes the values from the specified IndexProducer from the Bloom filter from this Bloom filter.
315      *
316      * <p>Specifically all cells for the unique indices produced by the {@code hasher} will be
317      * decremented by 1.</p>
318      *
319      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
320      *
321      * <p>Note: If indices that are returned multiple times should be decremented multiple times convert the IndexProducer
322      * to a CellProducer and subtract that.</p>
323      *
324      * @param indexProducer the IndexProducer to provide the indexes
325      * @return {@code true} if the removal was successful and the state is valid
326      * @see #isValid()
327      * @see #subtract(CellProducer)
328      */
329     default boolean remove(final IndexProducer indexProducer) {
330         Objects.requireNonNull(indexProducer, "indexProducer");
331         try {
332             return subtract(CellProducer.from(indexProducer.uniqueIndices()));
333         } catch (final IndexOutOfBoundsException e) {
334             throw new IllegalArgumentException(
335                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()));
336         }
337     }
338 
339 
340     /**
341      * Adds the specified CellProducer to this Bloom filter.
342      *
343      * <p>Specifically
344      * all cells for the indexes identified by the {@code other} will be decremented
345      * by their corresponding values in the {@code other}.</p>
346      *
347      * <p>This method will return true if the filter is valid after the operation.</p>
348      *
349      * @param other the CellProducer to subtract.
350      * @return {@code true} if the subtraction was successful and the state is valid
351      * @see #isValid()
352      * @see #add(CellProducer)
353      */
354     boolean subtract(CellProducer other);
355 
356     @Override
357     default IndexProducer uniqueIndices() {
358         return this;
359     }
360 }