001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.collections4.bloomfilter;
018
019import java.util.Arrays;
020import java.util.Objects;
021import java.util.function.IntPredicate;
022import java.util.function.LongPredicate;
023import java.util.stream.IntStream;
024
025/**
026 * A counting Bloom filter using an int array to track cells for each enabled bit.
027 *
028 * <p>
029 * Any operation that results in negative counts or integer overflow of counts will mark this filter as invalid. This transition is not reversible. The
030 * operation is completed in full, no exception is raised and the state is set to invalid. This allows the cells for the filter immediately prior to the
031 * operation that created the invalid state to be recovered. See the documentation in {@link #isValid()} for details.
032 * </p>
033 *
034 * <p>
035 * All the operations in the filter assume the cells are currently valid, for example {@code cardinality} or {@code contains} operations. Behavior of an invalid
036 * filter is undefined. It will no longer function identically to a standard Bloom filter that is the merge of all the Bloom filters that have been added to and
037 * not later subtracted from the counting Bloom filter.
038 * </p>
039 *
040 * <p>
041 * The maximum supported number of items that can be stored in the filter is limited by the maximum array size combined with the {@link Shape}. For example an
042 * implementation using a {@link Shape} with a false-positive probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store approximately 75
043 * million items using 20 hash functions per item with a memory consumption of approximately 8 GB.
044 * </p>
045 *
046 * @see Shape
047 * @see CellExtractor
048 * @since 4.5.0-M1
049 */
050public final class ArrayCountingBloomFilter implements CountingBloomFilter {
051
052    /**
053     * The shape of this Bloom filter.
054     */
055    private final Shape shape;
056
057    /**
058     * The cell for each bit index in the filter.
059     */
060    private final int[] cells;
061
062    /**
063     * The state flag. This is a bitwise {@code OR} of the entire history of all updated
064     * cells. If negative then a negative cell or integer overflow has occurred on
065     * one or more cells in the history of the filter and the state is invalid.
066     *
067     * <p>Maintenance of this state flag is branch-free for improved performance. It
068     * eliminates a conditional check for a negative cell during remove/subtract
069     * operations and a conditional check for integer overflow during merge/add
070     * operations.</p>
071     *
072     * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell
073     * that overflows indicates that the number of items in the filter exceeds the
074     * maximum possible size (number of bits) of any Bloom filter constrained by
075     * integer indices. At this point the filter is most likely full (all bits are
076     * non-zero) and thus useless.</p>
077     *
078     * <p>Negative cells are a concern if the filter is used incorrectly by
079     * removing an item that was never added. It is expected that a user of a
080     * counting Bloom filter will not perform this action as it is a mistake.
081     * Enabling an explicit recovery path for negative or overflow cells is a major
082     * performance burden not deemed necessary for the unlikely scenarios when an
083     * invalid state is created. Maintenance of the state flag is a concession to
084     * flag improper use that should not have a major performance impact.</p>
085     */
086    private int state;
087
088    private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) {
089        this.shape = source.shape;
090        this.state = source.state;
091        this.cells = source.cells.clone();
092    }
093
094    /**
095     * Constructs an empty counting Bloom filter with the specified shape.
096     *
097     * @param shape the shape of the filter
098     */
099    public ArrayCountingBloomFilter(final Shape shape) {
100        Objects.requireNonNull(shape, "shape");
101        this.shape = shape;
102        cells = new int[shape.getNumberOfBits()];
103    }
104
105    @Override
106    public boolean add(final CellExtractor other) {
107        Objects.requireNonNull(other, "other");
108        other.processCells(this::add);
109        return isValid();
110    }
111
112    /**
113     * Add to the cell for the bit index.
114     *
115     * @param idx the index
116     * @param addend the amount to add
117     * @return {@code true} always.
118     */
119    private boolean add(final int idx, final int addend) {
120        try {
121            final int updated = cells[idx] + addend;
122            state |= updated;
123            cells[idx] = updated;
124            return true;
125        } catch (final IndexOutOfBoundsException e) {
126            throw new IllegalArgumentException(
127                    String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
128        }
129    }
130
131    @Override
132    public int[] asIndexArray() {
133        return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray();
134    }
135
136    @Override
137    public int cardinality() {
138        return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count();
139    }
140
141    @Override
142    public int characteristics() {
143        return SPARSE;
144    }
145
146    @Override
147    public void clear() {
148        Arrays.fill(cells, 0);
149    }
150
151    @Override
152    public boolean contains(final BitMapExtractor bitMapExtractor) {
153        return contains(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
154    }
155
156    @Override
157    public boolean contains(final IndexExtractor indexExtractor) {
158        return indexExtractor.processIndices(idx -> cells[idx] != 0);
159    }
160
161    /**
162     * Creates a new instance of this {@link ArrayCountingBloomFilter} with the same properties as the current one.
163     *
164     * @return a copy of this BloomFilter.
165     */
166    @Override
167    public ArrayCountingBloomFilter copy() {
168        return new ArrayCountingBloomFilter(this);
169    }
170
171    @Override
172    public int getMaxCell() {
173        return Integer.MAX_VALUE;
174    }
175
176    @Override
177    public int getMaxInsert(final CellExtractor cellExtractor) {
178        final int[] max = { Integer.MAX_VALUE };
179        cellExtractor.processCells((x, y) -> {
180            final int count = cells[x] / y;
181            if (count < max[0]) {
182                max[0] = count;
183            }
184            return max[0] > 0;
185        });
186        return max[0];
187    }
188
189    @Override
190    public Shape getShape() {
191        return shape;
192    }
193
194    /**
195     * {@inheritDoc}
196     *
197     * <p>
198     * <em>Implementation note</em>
199     * </p>
200     *
201     * <p>
202     * The state transition to invalid is permanent.
203     * </p>
204     *
205     * <p>
206     * This implementation does not correct negative cells to zero or integer overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that generated
207     * invalid cells can be reversed by using the complement of the original operation with the same Bloom filter. This will restore the cells to the state
208     * prior to the invalid operation. Cells can then be extracted using {@link #processCells(CellPredicate)}.
209     * </p>
210     */
211    @Override
212    public boolean isValid() {
213        return state >= 0;
214    }
215
216    @Override
217    public boolean processBitMaps(final LongPredicate consumer) {
218        Objects.requireNonNull(consumer, "consumer");
219        final int blocksm1 = BitMaps.numberOfBitMaps(cells.length) - 1;
220        int i = 0;
221        long value;
222        // must break final block separate as the number of bits may not fall on the long boundary
223        for (int j = 0; j < blocksm1; j++) {
224            value = 0;
225            for (int k = 0; k < Long.SIZE; k++) {
226                if (cells[i++] != 0) {
227                    value |= BitMaps.getLongBit(k);
228                }
229            }
230            if (!consumer.test(value)) {
231                return false;
232            }
233        }
234        // Final block
235        value = 0;
236        for (int k = 0; i < cells.length; k++) {
237            if (cells[i++] != 0) {
238                value |= BitMaps.getLongBit(k);
239            }
240        }
241        return consumer.test(value);
242    }
243
244    @Override
245    public boolean processCells(final CellPredicate consumer) {
246        Objects.requireNonNull(consumer, "consumer");
247        for (int i = 0; i < cells.length; i++) {
248            if (cells[i] != 0 && !consumer.test(i, cells[i])) {
249                return false;
250            }
251        }
252        return true;
253    }
254
255    @Override
256    public boolean processIndices(final IntPredicate consumer) {
257        Objects.requireNonNull(consumer, "consumer");
258        for (int i = 0; i < cells.length; i++) {
259            if (cells[i] != 0 && !consumer.test(i)) {
260                return false;
261            }
262        }
263        return true;
264    }
265
266    @Override
267    public boolean subtract(final CellExtractor other) {
268        Objects.requireNonNull(other, "other");
269        other.processCells(this::subtract);
270        return isValid();
271    }
272
273    /**
274     * Subtracts from the cell for the bit index.
275     *
276     * @param idx the index
277     * @param subtrahend the amount to subtract
278     * @return {@code true} always.
279     */
280    private boolean subtract(final int idx, final int subtrahend) {
281        try {
282            final int updated = cells[idx] - subtrahend;
283            state |= updated;
284            cells[idx] = updated;
285            return true;
286        } catch (final IndexOutOfBoundsException e) {
287            throw new IllegalArgumentException(
288                    String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
289        }
290    }
291}