1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.collections4.bloomfilter;
18
19 import java.util.Arrays;
20 import java.util.Objects;
21 import java.util.function.IntPredicate;
22 import java.util.function.LongPredicate;
23 import java.util.stream.IntStream;
24
25 /**
26 * A counting Bloom filter using an int array to track cells for each enabled bit.
27 *
28 * <p>
29 * Any operation that results in negative counts or integer overflow of counts will mark this filter as invalid. This transition is not reversible. The
30 * operation is completed in full, no exception is raised and the state is set to invalid. This allows the cells for the filter immediately prior to the
31 * operation that created the invalid state to be recovered. See the documentation in {@link #isValid()} for details.
32 * </p>
33 *
34 * <p>
35 * All the operations in the filter assume the cells are currently valid, for example {@code cardinality} or {@code contains} operations. Behavior of an invalid
36 * filter is undefined. It will no longer function identically to a standard Bloom filter that is the merge of all the Bloom filters that have been added to and
37 * not later subtracted from the counting Bloom filter.
38 * </p>
39 *
40 * <p>
41 * The maximum supported number of items that can be stored in the filter is limited by the maximum array size combined with the {@link Shape}. For example an
42 * implementation using a {@link Shape} with a false-positive probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store approximately 75
43 * million items using 20 hash functions per item with a memory consumption of approximately 8 GB.
44 * </p>
45 *
46 * @see Shape
47 * @see CellExtractor
48 * @since 4.5.0-M1
49 */
50 public final class ArrayCountingBloomFilter implements CountingBloomFilter {
51
52 /**
53 * The shape of this Bloom filter.
54 */
55 private final Shape shape;
56
57 /**
58 * The cell for each bit index in the filter.
59 */
60 private final int[] cells;
61
62 /**
63 * The state flag. This is a bitwise {@code OR} of the entire history of all updated
64 * cells. If negative then a negative cell or integer overflow has occurred on
65 * one or more cells in the history of the filter and the state is invalid.
66 *
67 * <p>Maintenance of this state flag is branch-free for improved performance. It
68 * eliminates a conditional check for a negative cell during remove/subtract
69 * operations and a conditional check for integer overflow during merge/add
70 * operations.</p>
71 *
72 * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell
73 * that overflows indicates that the number of items in the filter exceeds the
74 * maximum possible size (number of bits) of any Bloom filter constrained by
75 * integer indices. At this point the filter is most likely full (all bits are
76 * non-zero) and thus useless.</p>
77 *
78 * <p>Negative cells are a concern if the filter is used incorrectly by
79 * removing an item that was never added. It is expected that a user of a
80 * counting Bloom filter will not perform this action as it is a mistake.
81 * Enabling an explicit recovery path for negative or overflow cells is a major
82 * performance burden not deemed necessary for the unlikely scenarios when an
83 * invalid state is created. Maintenance of the state flag is a concession to
84 * flag improper use that should not have a major performance impact.</p>
85 */
86 private int state;
87
88 private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) {
89 this.shape = source.shape;
90 this.state = source.state;
91 this.cells = source.cells.clone();
92 }
93
94 /**
95 * Constructs an empty counting Bloom filter with the specified shape.
96 *
97 * @param shape the shape of the filter
98 */
99 public ArrayCountingBloomFilter(final Shape shape) {
100 Objects.requireNonNull(shape, "shape");
101 this.shape = shape;
102 cells = new int[shape.getNumberOfBits()];
103 }
104
105 @Override
106 public boolean add(final CellExtractor other) {
107 Objects.requireNonNull(other, "other");
108 other.processCells(this::add);
109 return isValid();
110 }
111
112 /**
113 * Add to the cell for the bit index.
114 *
115 * @param idx the index
116 * @param addend the amount to add
117 * @return {@code true} always.
118 */
119 private boolean add(final int idx, final int addend) {
120 try {
121 final int updated = cells[idx] + addend;
122 state |= updated;
123 cells[idx] = updated;
124 return true;
125 } catch (final IndexOutOfBoundsException e) {
126 throw new IllegalArgumentException(
127 String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
128 }
129 }
130
131 @Override
132 public int[] asIndexArray() {
133 return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray();
134 }
135
136 @Override
137 public int cardinality() {
138 return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count();
139 }
140
141 @Override
142 public int characteristics() {
143 return SPARSE;
144 }
145
146 @Override
147 public void clear() {
148 Arrays.fill(cells, 0);
149 }
150
151 @Override
152 public boolean contains(final BitMapExtractor bitMapExtractor) {
153 return contains(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
154 }
155
156 @Override
157 public boolean contains(final IndexExtractor indexExtractor) {
158 return indexExtractor.processIndices(idx -> cells[idx] != 0);
159 }
160
161 /**
162 * Creates a new instance of this {@link ArrayCountingBloomFilter} with the same properties as the current one.
163 *
164 * @return a copy of this BloomFilter.
165 */
166 @Override
167 public ArrayCountingBloomFilter copy() {
168 return new ArrayCountingBloomFilter(this);
169 }
170
171 @Override
172 public int getMaxCell() {
173 return Integer.MAX_VALUE;
174 }
175
176 @Override
177 public int getMaxInsert(final CellExtractor cellExtractor) {
178 final int[] max = { Integer.MAX_VALUE };
179 cellExtractor.processCells((x, y) -> {
180 final int count = cells[x] / y;
181 if (count < max[0]) {
182 max[0] = count;
183 }
184 return max[0] > 0;
185 });
186 return max[0];
187 }
188
189 @Override
190 public Shape getShape() {
191 return shape;
192 }
193
194 /**
195 * {@inheritDoc}
196 *
197 * <p>
198 * <em>Implementation note</em>
199 * </p>
200 *
201 * <p>
202 * The state transition to invalid is permanent.
203 * </p>
204 *
205 * <p>
206 * This implementation does not correct negative cells to zero or integer overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that generated
207 * invalid cells can be reversed by using the complement of the original operation with the same Bloom filter. This will restore the cells to the state
208 * prior to the invalid operation. Cells can then be extracted using {@link #processCells(CellPredicate)}.
209 * </p>
210 */
211 @Override
212 public boolean isValid() {
213 return state >= 0;
214 }
215
216 @Override
217 public boolean processBitMaps(final LongPredicate consumer) {
218 Objects.requireNonNull(consumer, "consumer");
219 final int blocksm1 = BitMaps.numberOfBitMaps(cells.length) - 1;
220 int i = 0;
221 long value;
222 // must break final block separate as the number of bits may not fall on the long boundary
223 for (int j = 0; j < blocksm1; j++) {
224 value = 0;
225 for (int k = 0; k < Long.SIZE; k++) {
226 if (cells[i++] != 0) {
227 value |= BitMaps.getLongBit(k);
228 }
229 }
230 if (!consumer.test(value)) {
231 return false;
232 }
233 }
234 // Final block
235 value = 0;
236 for (int k = 0; i < cells.length; k++) {
237 if (cells[i++] != 0) {
238 value |= BitMaps.getLongBit(k);
239 }
240 }
241 return consumer.test(value);
242 }
243
244 @Override
245 public boolean processCells(final CellPredicate consumer) {
246 Objects.requireNonNull(consumer, "consumer");
247 for (int i = 0; i < cells.length; i++) {
248 if (cells[i] != 0 && !consumer.test(i, cells[i])) {
249 return false;
250 }
251 }
252 return true;
253 }
254
255 @Override
256 public boolean processIndices(final IntPredicate consumer) {
257 Objects.requireNonNull(consumer, "consumer");
258 for (int i = 0; i < cells.length; i++) {
259 if (cells[i] != 0 && !consumer.test(i)) {
260 return false;
261 }
262 }
263 return true;
264 }
265
266 @Override
267 public boolean subtract(final CellExtractor other) {
268 Objects.requireNonNull(other, "other");
269 other.processCells(this::subtract);
270 return isValid();
271 }
272
273 /**
274 * Subtracts from the cell for the bit index.
275 *
276 * @param idx the index
277 * @param subtrahend the amount to subtract
278 * @return {@code true} always.
279 */
280 private boolean subtract(final int idx, final int subtrahend) {
281 try {
282 final int updated = cells[idx] - subtrahend;
283 state |= updated;
284 cells[idx] = updated;
285 return true;
286 } catch (final IndexOutOfBoundsException e) {
287 throw new IllegalArgumentException(
288 String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
289 }
290 }
291 }