View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.collections4.bloomfilter;
18  
19  import java.util.Arrays;
20  import java.util.Objects;
21  import java.util.function.IntPredicate;
22  import java.util.function.LongPredicate;
23  import java.util.stream.IntStream;
24  
25  /**
26   * A counting Bloom filter using an int array to track cells for each enabled bit.
27   *
28   * <p>
29   * Any operation that results in negative counts or integer overflow of counts will mark this filter as invalid. This transition is not reversible. The
30   * operation is completed in full, no exception is raised and the state is set to invalid. This allows the cells for the filter immediately prior to the
31   * operation that created the invalid state to be recovered. See the documentation in {@link #isValid()} for details.
32   * </p>
33   *
34   * <p>
35   * All the operations in the filter assume the cells are currently valid, for example {@code cardinality} or {@code contains} operations. Behavior of an invalid
36   * filter is undefined. It will no longer function identically to a standard Bloom filter that is the merge of all the Bloom filters that have been added to and
37   * not later subtracted from the counting Bloom filter.
38   * </p>
39   *
40   * <p>
41   * The maximum supported number of items that can be stored in the filter is limited by the maximum array size combined with the {@link Shape}. For example an
42   * implementation using a {@link Shape} with a false-positive probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store approximately 75
43   * million items using 20 hash functions per item with a memory consumption of approximately 8 GB.
44   * </p>
45   *
46   * @see Shape
47   * @see CellExtractor
48   * @since 4.5.0-M1
49   */
50  public final class ArrayCountingBloomFilter implements CountingBloomFilter {
51  
52      /**
53       * The shape of this Bloom filter.
54       */
55      private final Shape shape;
56  
57      /**
58       * The cell for each bit index in the filter.
59       */
60      private final int[] cells;
61  
62      /**
63       * The state flag. This is a bitwise {@code OR} of the entire history of all updated
64       * cells. If negative then a negative cell or integer overflow has occurred on
65       * one or more cells in the history of the filter and the state is invalid.
66       *
67       * <p>Maintenance of this state flag is branch-free for improved performance. It
68       * eliminates a conditional check for a negative cell during remove/subtract
69       * operations and a conditional check for integer overflow during merge/add
70       * operations.</p>
71       *
72       * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell
73       * that overflows indicates that the number of items in the filter exceeds the
74       * maximum possible size (number of bits) of any Bloom filter constrained by
75       * integer indices. At this point the filter is most likely full (all bits are
76       * non-zero) and thus useless.</p>
77       *
78       * <p>Negative cells are a concern if the filter is used incorrectly by
79       * removing an item that was never added. It is expected that a user of a
80       * counting Bloom filter will not perform this action as it is a mistake.
81       * Enabling an explicit recovery path for negative or overflow cells is a major
82       * performance burden not deemed necessary for the unlikely scenarios when an
83       * invalid state is created. Maintenance of the state flag is a concession to
84       * flag improper use that should not have a major performance impact.</p>
85       */
86      private int state;
87  
88      private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) {
89          this.shape = source.shape;
90          this.state = source.state;
91          this.cells = source.cells.clone();
92      }
93  
94      /**
95       * Constructs an empty counting Bloom filter with the specified shape.
96       *
97       * @param shape the shape of the filter
98       */
99      public ArrayCountingBloomFilter(final Shape shape) {
100         Objects.requireNonNull(shape, "shape");
101         this.shape = shape;
102         cells = new int[shape.getNumberOfBits()];
103     }
104 
105     @Override
106     public boolean add(final CellExtractor other) {
107         Objects.requireNonNull(other, "other");
108         other.processCells(this::add);
109         return isValid();
110     }
111 
112     /**
113      * Add to the cell for the bit index.
114      *
115      * @param idx the index
116      * @param addend the amount to add
117      * @return {@code true} always.
118      */
119     private boolean add(final int idx, final int addend) {
120         try {
121             final int updated = cells[idx] + addend;
122             state |= updated;
123             cells[idx] = updated;
124             return true;
125         } catch (final IndexOutOfBoundsException e) {
126             throw new IllegalArgumentException(
127                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
128         }
129     }
130 
131     @Override
132     public int[] asIndexArray() {
133         return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray();
134     }
135 
136     @Override
137     public int cardinality() {
138         return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count();
139     }
140 
141     @Override
142     public int characteristics() {
143         return SPARSE;
144     }
145 
146     @Override
147     public void clear() {
148         Arrays.fill(cells, 0);
149     }
150 
151     @Override
152     public boolean contains(final BitMapExtractor bitMapExtractor) {
153         return contains(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
154     }
155 
156     @Override
157     public boolean contains(final IndexExtractor indexExtractor) {
158         return indexExtractor.processIndices(idx -> cells[idx] != 0);
159     }
160 
161     /**
162      * Creates a new instance of this {@link ArrayCountingBloomFilter} with the same properties as the current one.
163      *
164      * @return a copy of this BloomFilter.
165      */
166     @Override
167     public ArrayCountingBloomFilter copy() {
168         return new ArrayCountingBloomFilter(this);
169     }
170 
171     @Override
172     public int getMaxCell() {
173         return Integer.MAX_VALUE;
174     }
175 
176     @Override
177     public int getMaxInsert(final CellExtractor cellExtractor) {
178         final int[] max = { Integer.MAX_VALUE };
179         cellExtractor.processCells((x, y) -> {
180             final int count = cells[x] / y;
181             if (count < max[0]) {
182                 max[0] = count;
183             }
184             return max[0] > 0;
185         });
186         return max[0];
187     }
188 
189     @Override
190     public Shape getShape() {
191         return shape;
192     }
193 
194     /**
195      * {@inheritDoc}
196      *
197      * <p>
198      * <em>Implementation note</em>
199      * </p>
200      *
201      * <p>
202      * The state transition to invalid is permanent.
203      * </p>
204      *
205      * <p>
206      * This implementation does not correct negative cells to zero or integer overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that generated
207      * invalid cells can be reversed by using the complement of the original operation with the same Bloom filter. This will restore the cells to the state
208      * prior to the invalid operation. Cells can then be extracted using {@link #processCells(CellPredicate)}.
209      * </p>
210      */
211     @Override
212     public boolean isValid() {
213         return state >= 0;
214     }
215 
216     @Override
217     public boolean processBitMaps(final LongPredicate consumer) {
218         Objects.requireNonNull(consumer, "consumer");
219         final int blocksm1 = BitMaps.numberOfBitMaps(cells.length) - 1;
220         int i = 0;
221         long value;
222         // must break final block separate as the number of bits may not fall on the long boundary
223         for (int j = 0; j < blocksm1; j++) {
224             value = 0;
225             for (int k = 0; k < Long.SIZE; k++) {
226                 if (cells[i++] != 0) {
227                     value |= BitMaps.getLongBit(k);
228                 }
229             }
230             if (!consumer.test(value)) {
231                 return false;
232             }
233         }
234         // Final block
235         value = 0;
236         for (int k = 0; i < cells.length; k++) {
237             if (cells[i++] != 0) {
238                 value |= BitMaps.getLongBit(k);
239             }
240         }
241         return consumer.test(value);
242     }
243 
244     @Override
245     public boolean processCells(final CellPredicate consumer) {
246         Objects.requireNonNull(consumer, "consumer");
247         for (int i = 0; i < cells.length; i++) {
248             if (cells[i] != 0 && !consumer.test(i, cells[i])) {
249                 return false;
250             }
251         }
252         return true;
253     }
254 
255     @Override
256     public boolean processIndices(final IntPredicate consumer) {
257         Objects.requireNonNull(consumer, "consumer");
258         for (int i = 0; i < cells.length; i++) {
259             if (cells[i] != 0 && !consumer.test(i)) {
260                 return false;
261             }
262         }
263         return true;
264     }
265 
266     @Override
267     public boolean subtract(final CellExtractor other) {
268         Objects.requireNonNull(other, "other");
269         other.processCells(this::subtract);
270         return isValid();
271     }
272 
273     /**
274      * Subtracts from the cell for the bit index.
275      *
276      * @param idx the index
277      * @param subtrahend the amount to subtract
278      * @return {@code true} always.
279      */
280     private boolean subtract(final int idx, final int subtrahend) {
281         try {
282             final int updated = cells[idx] - subtrahend;
283             state |= updated;
284             cells[idx] = updated;
285             return true;
286         } catch (final IndexOutOfBoundsException e) {
287             throw new IllegalArgumentException(
288                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
289         }
290     }
291 }