ArrayCountingBloomFilter.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntPredicate;
import java.util.function.LongPredicate;
import java.util.stream.IntStream;
/**
* A counting Bloom filter using an int array to track cells for each enabled bit.
*
* <p>
* Any operation that results in negative counts or integer overflow of counts will mark this filter as invalid. This transition is not reversible. The
* operation is completed in full, no exception is raised and the state is set to invalid. This allows the cells for the filter immediately prior to the
* operation that created the invalid state to be recovered. See the documentation in {@link #isValid()} for details.
* </p>
*
* <p>
* All the operations in the filter assume the cells are currently valid, for example {@code cardinality} or {@code contains} operations. Behavior of an invalid
* filter is undefined. It will no longer function identically to a standard Bloom filter that is the merge of all the Bloom filters that have been added to and
* not later subtracted from the counting Bloom filter.
* </p>
*
* <p>
* The maximum supported number of items that can be stored in the filter is limited by the maximum array size combined with the {@link Shape}. For example an
* implementation using a {@link Shape} with a false-positive probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store approximately 75
* million items using 20 hash functions per item with a memory consumption of approximately 8 GB.
* </p>
*
* @see Shape
* @see CellExtractor
* @since 4.5.0-M1
*/
public final class ArrayCountingBloomFilter implements CountingBloomFilter {
/**
* The shape of this Bloom filter.
*/
private final Shape shape;
/**
* The cell for each bit index in the filter.
*/
private final int[] cells;
/**
* The state flag. This is a bitwise {@code OR} of the entire history of all updated
* cells. If negative then a negative cell or integer overflow has occurred on
* one or more cells in the history of the filter and the state is invalid.
*
* <p>Maintenance of this state flag is branch-free for improved performance. It
* eliminates a conditional check for a negative cell during remove/subtract
* operations and a conditional check for integer overflow during merge/add
* operations.</p>
*
* <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell
* that overflows indicates that the number of items in the filter exceeds the
* maximum possible size (number of bits) of any Bloom filter constrained by
* integer indices. At this point the filter is most likely full (all bits are
* non-zero) and thus useless.</p>
*
* <p>Negative cells are a concern if the filter is used incorrectly by
* removing an item that was never added. It is expected that a user of a
* counting Bloom filter will not perform this action as it is a mistake.
* Enabling an explicit recovery path for negative or overflow cells is a major
* performance burden not deemed necessary for the unlikely scenarios when an
* invalid state is created. Maintenance of the state flag is a concession to
* flag improper use that should not have a major performance impact.</p>
*/
private int state;
private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) {
this.shape = source.shape;
this.state = source.state;
this.cells = source.cells.clone();
}
/**
* Constructs an empty counting Bloom filter with the specified shape.
*
* @param shape the shape of the filter
*/
public ArrayCountingBloomFilter(final Shape shape) {
Objects.requireNonNull(shape, "shape");
this.shape = shape;
cells = new int[shape.getNumberOfBits()];
}
@Override
public boolean add(final CellExtractor other) {
Objects.requireNonNull(other, "other");
other.processCells(this::add);
return isValid();
}
/**
* Add to the cell for the bit index.
*
* @param idx the index
* @param addend the amount to add
* @return {@code true} always.
*/
private boolean add(final int idx, final int addend) {
try {
final int updated = cells[idx] + addend;
state |= updated;
cells[idx] = updated;
return true;
} catch (final IndexOutOfBoundsException e) {
throw new IllegalArgumentException(
String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
}
}
@Override
public int[] asIndexArray() {
return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray();
}
@Override
public int cardinality() {
return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count();
}
@Override
public int characteristics() {
return SPARSE;
}
@Override
public void clear() {
Arrays.fill(cells, 0);
}
@Override
public boolean contains(final BitMapExtractor bitMapExtractor) {
return contains(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
}
@Override
public boolean contains(final IndexExtractor indexExtractor) {
return indexExtractor.processIndices(idx -> cells[idx] != 0);
}
/**
* Creates a new instance of this {@link ArrayCountingBloomFilter} with the same properties as the current one.
*
* @return a copy of this BloomFilter.
*/
@Override
public ArrayCountingBloomFilter copy() {
return new ArrayCountingBloomFilter(this);
}
@Override
public int getMaxCell() {
return Integer.MAX_VALUE;
}
@Override
public int getMaxInsert(final CellExtractor cellExtractor) {
final int[] max = { Integer.MAX_VALUE };
cellExtractor.processCells((x, y) -> {
final int count = cells[x] / y;
if (count < max[0]) {
max[0] = count;
}
return max[0] > 0;
});
return max[0];
}
@Override
public Shape getShape() {
return shape;
}
/**
* {@inheritDoc}
*
* <p>
* <em>Implementation note</em>
* </p>
*
* <p>
* The state transition to invalid is permanent.
* </p>
*
* <p>
* This implementation does not correct negative cells to zero or integer overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that generated
* invalid cells can be reversed by using the complement of the original operation with the same Bloom filter. This will restore the cells to the state
* prior to the invalid operation. Cells can then be extracted using {@link #processCells(CellPredicate)}.
* </p>
*/
@Override
public boolean isValid() {
return state >= 0;
}
@Override
public boolean processBitMaps(final LongPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
final int blocksm1 = BitMaps.numberOfBitMaps(cells.length) - 1;
int i = 0;
long value;
// must break final block separate as the number of bits may not fall on the long boundary
for (int j = 0; j < blocksm1; j++) {
value = 0;
for (int k = 0; k < Long.SIZE; k++) {
if (cells[i++] != 0) {
value |= BitMaps.getLongBit(k);
}
}
if (!consumer.test(value)) {
return false;
}
}
// Final block
value = 0;
for (int k = 0; i < cells.length; k++) {
if (cells[i++] != 0) {
value |= BitMaps.getLongBit(k);
}
}
return consumer.test(value);
}
@Override
public boolean processCells(final CellPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
for (int i = 0; i < cells.length; i++) {
if (cells[i] != 0 && !consumer.test(i, cells[i])) {
return false;
}
}
return true;
}
@Override
public boolean processIndices(final IntPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
for (int i = 0; i < cells.length; i++) {
if (cells[i] != 0 && !consumer.test(i)) {
return false;
}
}
return true;
}
@Override
public boolean subtract(final CellExtractor other) {
Objects.requireNonNull(other, "other");
other.processCells(this::subtract);
return isValid();
}
/**
* Subtracts from the cell for the bit index.
*
* @param idx the index
* @param subtrahend the amount to subtract
* @return {@code true} always.
*/
private boolean subtract(final int idx, final int subtrahend) {
try {
final int updated = cells[idx] - subtrahend;
state |= updated;
cells[idx] = updated;
return true;
} catch (final IndexOutOfBoundsException e) {
throw new IllegalArgumentException(
String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
}
}
}