ArrayCountingBloomFilter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.collections4.bloomfilter;

import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntPredicate;
import java.util.function.LongPredicate;
import java.util.stream.IntStream;

/**
 * A counting Bloom filter using an int array to track cells for each enabled bit.
 *
 * <p>Any operation that results in negative counts or integer overflow of
 * counts will mark this filter as invalid. This transition is not reversible.
 * The operation is completed in full, no exception is raised and the state is
 * set to invalid. This allows the cells for the filter immediately prior to the
 * operation that created the invalid state to be recovered. See the documentation
 * in {@link #isValid()} for details.</p>
 *
 * <p>All the operations in the filter assume the cells are currently valid,
 * for example {@code cardinality} or {@code contains} operations. Behavior of an invalid
 * filter is undefined. It will no longer function identically to a standard
 * Bloom filter that is the merge of all the Bloom filters that have been added
 * to and not later subtracted from the counting Bloom filter.</p>
 *
 * <p>The maximum supported number of items that can be stored in the filter is
 * limited by the maximum array size combined with the {@link Shape}. For
 * example an implementation using a {@link Shape} with a false-positive
 * probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store
 * approximately 75 million items using 20 hash functions per item with a memory
 * consumption of approximately 8 GB.
 *
 * @see Shape
 * @see CellProducer
 * @since 4.5
 */
public final class ArrayCountingBloomFilter implements CountingBloomFilter {

    /**
     * The shape of this Bloom filter.
     */
    private final Shape shape;

    /**
     * The cell for each bit index in the filter.
     */
    private final int[] cells;

    /**
     * The state flag. This is a bitwise {@code OR} of the entire history of all updated
     * cells. If negative then a negative cell or integer overflow has occurred on
     * one or more cells in the history of the filter and the state is invalid.
     *
     * <p>Maintenance of this state flag is branch-free for improved performance. It
     * eliminates a conditional check for a negative cell during remove/subtract
     * operations and a conditional check for integer overflow during merge/add
     * operations.</p>
     *
     * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell
     * that overflows indicates that the number of items in the filter exceeds the
     * maximum possible size (number of bits) of any Bloom filter constrained by
     * integer indices. At this point the filter is most likely full (all bits are
     * non-zero) and thus useless.</p>
     *
     * <p>Negative cells are a concern if the filter is used incorrectly by
     * removing an item that was never added. It is expected that a user of a
     * counting Bloom filter will not perform this action as it is a mistake.
     * Enabling an explicit recovery path for negative or overflow cells is a major
     * performance burden not deemed necessary for the unlikely scenarios when an
     * invalid state is created. Maintenance of the state flag is a concession to
     * flag improper use that should not have a major performance impact.</p>
     */
    private int state;

    private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) {
        this.shape = source.shape;
        this.state = source.state;
        this.cells = source.cells.clone();
    }

    /**
     * Constructs an empty counting Bloom filter with the specified shape.
     *
     * @param shape the shape of the filter
     */
    public ArrayCountingBloomFilter(final Shape shape) {
        Objects.requireNonNull(shape, "shape");
        this.shape = shape;
        cells = new int[shape.getNumberOfBits()];
    }

    @Override
    public boolean add(final CellProducer other) {
        Objects.requireNonNull(other, "other");
        other.forEachCell(this::add);
        return isValid();
    }

    /**
     * Add to the cell for the bit index.
     *
     * @param idx the index
     * @param addend the amount to add
     * @return {@code true} always.
     */
    private boolean add(final int idx, final int addend) {
        try {
            final int updated = cells[idx] + addend;
            state |= updated;
            cells[idx] = updated;
            return true;
        } catch (final IndexOutOfBoundsException e) {
            throw new IllegalArgumentException(
                    String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
        }
    }

    @Override
    public int[] asIndexArray() {
        return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray();
    }

    @Override
    public int cardinality() {
        return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count();
    }

    @Override
    public int characteristics() {
        return SPARSE;
    }

    @Override
    public void clear() {
        Arrays.fill(cells, 0);
    }

    @Override
    public boolean contains(final BitMapProducer bitMapProducer) {
        return contains(IndexProducer.fromBitMapProducer(bitMapProducer));
    }

    @Override
    public boolean contains(final IndexProducer indexProducer) {
        return indexProducer.forEachIndex(idx -> this.cells[idx] != 0);
    }

    @Override
    public ArrayCountingBloomFilter copy() {
        return new ArrayCountingBloomFilter(this);
    }

    @Override
    public boolean forEachBitMap(final LongPredicate consumer) {
        Objects.requireNonNull(consumer, "consumer");
        final int blocksm1 = BitMap.numberOfBitMaps(cells.length) - 1;
        int i = 0;
        long value;
        // must break final block separate as the number of bits may not fall on the long boundary
        for (int j = 0; j < blocksm1; j++) {
            value = 0;
            for (int k = 0; k < Long.SIZE; k++) {
                if (cells[i++] != 0) {
                    value |= BitMap.getLongBit(k);
                }
            }
            if (!consumer.test(value)) {
                return false;
            }
        }
        // Final block
        value = 0;
        for (int k = 0; i < cells.length; k++) {
            if (cells[i++] != 0) {
                value |= BitMap.getLongBit(k);
            }
        }
        return consumer.test(value);
    }

    @Override
    public boolean forEachCell(final CellProducer.CellConsumer consumer) {
        Objects.requireNonNull(consumer, "consumer");
        for (int i = 0; i < cells.length; i++) {
            if (cells[i] != 0 && !consumer.test(i, cells[i])) {
                return false;
            }
        }
        return true;
    }

    @Override
    public boolean forEachIndex(final IntPredicate consumer) {
        Objects.requireNonNull(consumer, "consumer");
        for (int i = 0; i < cells.length; i++) {
            if (cells[i] != 0 && !consumer.test(i)) {
                return false;
            }
        }
        return true;
    }

    @Override
    public int getMaxCell() {
        return Integer.MAX_VALUE;
    }

    @Override
    public int getMaxInsert(final CellProducer cellProducer) {
        final int[] max = {Integer.MAX_VALUE};
        cellProducer.forEachCell( (x, y) -> {
            final int count = cells[x] / y;
            if (count < max[0]) {
                max[0] = count;
            }
            return max[0] > 0;
        });
        return max[0];
    }

    @Override
    public Shape getShape() {
        return shape;
    }

    /**
     * {@inheritDoc}
     *
     * <p><em>Implementation note</em>
     *
     * <p>The state transition to invalid is permanent.</p>
     *
     * <p>This implementation does not correct negative cells to zero or integer
     * overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that
     * generated invalid cells can be reversed by using the complement of the
     * original operation with the same Bloom filter. This will restore the cells
     * to the state prior to the invalid operation. Cells can then be extracted
     * using {@link #forEachCell(CellConsumer)}.</p>
     */
    @Override
    public boolean isValid() {
        return state >= 0;
    }

    @Override
    public boolean subtract(final CellProducer other) {
        Objects.requireNonNull(other, "other");
        other.forEachCell(this::subtract);
        return isValid();
    }

    /**
     * Subtract from the cell for the bit index.
     *
     * @param idx the index
     * @param subtrahend the amount to subtract
     * @return {@code true} always.
     */
    private boolean subtract(final int idx, final int subtrahend) {
        try {
            final int updated = cells[idx] - subtrahend;
            state |= updated;
            cells[idx] = updated;
            return true;
        } catch (final IndexOutOfBoundsException e) {
            throw new IllegalArgumentException(
                    String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
        }
    }
}