ArrayCountingBloomFilter.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. import java.util.Arrays;
  19. import java.util.Objects;
  20. import java.util.function.IntPredicate;
  21. import java.util.function.LongPredicate;
  22. import java.util.stream.IntStream;

  23. /**
  24.  * A counting Bloom filter using an int array to track cells for each enabled bit.
  25.  *
  26.  * <p>
  27.  * Any operation that results in negative counts or integer overflow of counts will mark this filter as invalid. This transition is not reversible. The
  28.  * operation is completed in full, no exception is raised and the state is set to invalid. This allows the cells for the filter immediately prior to the
  29.  * operation that created the invalid state to be recovered. See the documentation in {@link #isValid()} for details.
  30.  * </p>
  31.  *
  32.  * <p>
  33.  * All the operations in the filter assume the cells are currently valid, for example {@code cardinality} or {@code contains} operations. Behavior of an invalid
  34.  * filter is undefined. It will no longer function identically to a standard Bloom filter that is the merge of all the Bloom filters that have been added to and
  35.  * not later subtracted from the counting Bloom filter.
  36.  * </p>
  37.  *
  38.  * <p>
  39.  * The maximum supported number of items that can be stored in the filter is limited by the maximum array size combined with the {@link Shape}. For example an
  40.  * implementation using a {@link Shape} with a false-positive probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store approximately 75
  41.  * million items using 20 hash functions per item with a memory consumption of approximately 8 GB.
  42.  * </p>
  43.  *
  44.  * @see Shape
  45.  * @see CellExtractor
  46.  * @since 4.5.0-M1
  47.  */
  48. public final class ArrayCountingBloomFilter implements CountingBloomFilter {

  49.     /**
  50.      * The shape of this Bloom filter.
  51.      */
  52.     private final Shape shape;

  53.     /**
  54.      * The cell for each bit index in the filter.
  55.      */
  56.     private final int[] cells;

  57.     /**
  58.      * The state flag. This is a bitwise {@code OR} of the entire history of all updated
  59.      * cells. If negative then a negative cell or integer overflow has occurred on
  60.      * one or more cells in the history of the filter and the state is invalid.
  61.      *
  62.      * <p>Maintenance of this state flag is branch-free for improved performance. It
  63.      * eliminates a conditional check for a negative cell during remove/subtract
  64.      * operations and a conditional check for integer overflow during merge/add
  65.      * operations.</p>
  66.      *
  67.      * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell
  68.      * that overflows indicates that the number of items in the filter exceeds the
  69.      * maximum possible size (number of bits) of any Bloom filter constrained by
  70.      * integer indices. At this point the filter is most likely full (all bits are
  71.      * non-zero) and thus useless.</p>
  72.      *
  73.      * <p>Negative cells are a concern if the filter is used incorrectly by
  74.      * removing an item that was never added. It is expected that a user of a
  75.      * counting Bloom filter will not perform this action as it is a mistake.
  76.      * Enabling an explicit recovery path for negative or overflow cells is a major
  77.      * performance burden not deemed necessary for the unlikely scenarios when an
  78.      * invalid state is created. Maintenance of the state flag is a concession to
  79.      * flag improper use that should not have a major performance impact.</p>
  80.      */
  81.     private int state;

  82.     private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) {
  83.         this.shape = source.shape;
  84.         this.state = source.state;
  85.         this.cells = source.cells.clone();
  86.     }

  87.     /**
  88.      * Constructs an empty counting Bloom filter with the specified shape.
  89.      *
  90.      * @param shape the shape of the filter
  91.      */
  92.     public ArrayCountingBloomFilter(final Shape shape) {
  93.         Objects.requireNonNull(shape, "shape");
  94.         this.shape = shape;
  95.         cells = new int[shape.getNumberOfBits()];
  96.     }

  97.     @Override
  98.     public boolean add(final CellExtractor other) {
  99.         Objects.requireNonNull(other, "other");
  100.         other.processCells(this::add);
  101.         return isValid();
  102.     }

  103.     /**
  104.      * Add to the cell for the bit index.
  105.      *
  106.      * @param idx the index
  107.      * @param addend the amount to add
  108.      * @return {@code true} always.
  109.      */
  110.     private boolean add(final int idx, final int addend) {
  111.         try {
  112.             final int updated = cells[idx] + addend;
  113.             state |= updated;
  114.             cells[idx] = updated;
  115.             return true;
  116.         } catch (final IndexOutOfBoundsException e) {
  117.             throw new IllegalArgumentException(
  118.                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
  119.         }
  120.     }

  121.     @Override
  122.     public int[] asIndexArray() {
  123.         return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray();
  124.     }

  125.     @Override
  126.     public int cardinality() {
  127.         return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count();
  128.     }

  129.     @Override
  130.     public int characteristics() {
  131.         return SPARSE;
  132.     }

  133.     @Override
  134.     public void clear() {
  135.         Arrays.fill(cells, 0);
  136.     }

  137.     @Override
  138.     public boolean contains(final BitMapExtractor bitMapExtractor) {
  139.         return contains(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
  140.     }

  141.     @Override
  142.     public boolean contains(final IndexExtractor indexExtractor) {
  143.         return indexExtractor.processIndices(idx -> cells[idx] != 0);
  144.     }

  145.     /**
  146.      * Creates a new instance of this {@link ArrayCountingBloomFilter} with the same properties as the current one.
  147.      *
  148.      * @return a copy of this BloomFilter.
  149.      */
  150.     @Override
  151.     public ArrayCountingBloomFilter copy() {
  152.         return new ArrayCountingBloomFilter(this);
  153.     }

  154.     @Override
  155.     public int getMaxCell() {
  156.         return Integer.MAX_VALUE;
  157.     }

  158.     @Override
  159.     public int getMaxInsert(final CellExtractor cellExtractor) {
  160.         final int[] max = { Integer.MAX_VALUE };
  161.         cellExtractor.processCells((x, y) -> {
  162.             final int count = cells[x] / y;
  163.             if (count < max[0]) {
  164.                 max[0] = count;
  165.             }
  166.             return max[0] > 0;
  167.         });
  168.         return max[0];
  169.     }

  170.     @Override
  171.     public Shape getShape() {
  172.         return shape;
  173.     }

  174.     /**
  175.      * {@inheritDoc}
  176.      *
  177.      * <p>
  178.      * <em>Implementation note</em>
  179.      * </p>
  180.      *
  181.      * <p>
  182.      * The state transition to invalid is permanent.
  183.      * </p>
  184.      *
  185.      * <p>
  186.      * This implementation does not correct negative cells to zero or integer overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that generated
  187.      * invalid cells can be reversed by using the complement of the original operation with the same Bloom filter. This will restore the cells to the state
  188.      * prior to the invalid operation. Cells can then be extracted using {@link #processCells(CellPredicate)}.
  189.      * </p>
  190.      */
  191.     @Override
  192.     public boolean isValid() {
  193.         return state >= 0;
  194.     }

  195.     @Override
  196.     public boolean processBitMaps(final LongPredicate consumer) {
  197.         Objects.requireNonNull(consumer, "consumer");
  198.         final int blocksm1 = BitMaps.numberOfBitMaps(cells.length) - 1;
  199.         int i = 0;
  200.         long value;
  201.         // must break final block separate as the number of bits may not fall on the long boundary
  202.         for (int j = 0; j < blocksm1; j++) {
  203.             value = 0;
  204.             for (int k = 0; k < Long.SIZE; k++) {
  205.                 if (cells[i++] != 0) {
  206.                     value |= BitMaps.getLongBit(k);
  207.                 }
  208.             }
  209.             if (!consumer.test(value)) {
  210.                 return false;
  211.             }
  212.         }
  213.         // Final block
  214.         value = 0;
  215.         for (int k = 0; i < cells.length; k++) {
  216.             if (cells[i++] != 0) {
  217.                 value |= BitMaps.getLongBit(k);
  218.             }
  219.         }
  220.         return consumer.test(value);
  221.     }

  222.     @Override
  223.     public boolean processCells(final CellPredicate consumer) {
  224.         Objects.requireNonNull(consumer, "consumer");
  225.         for (int i = 0; i < cells.length; i++) {
  226.             if (cells[i] != 0 && !consumer.test(i, cells[i])) {
  227.                 return false;
  228.             }
  229.         }
  230.         return true;
  231.     }

  232.     @Override
  233.     public boolean processIndices(final IntPredicate consumer) {
  234.         Objects.requireNonNull(consumer, "consumer");
  235.         for (int i = 0; i < cells.length; i++) {
  236.             if (cells[i] != 0 && !consumer.test(i)) {
  237.                 return false;
  238.             }
  239.         }
  240.         return true;
  241.     }

  242.     @Override
  243.     public boolean subtract(final CellExtractor other) {
  244.         Objects.requireNonNull(other, "other");
  245.         other.processCells(this::subtract);
  246.         return isValid();
  247.     }

  248.     /**
  249.      * Subtracts from the cell for the bit index.
  250.      *
  251.      * @param idx the index
  252.      * @param subtrahend the amount to subtract
  253.      * @return {@code true} always.
  254.      */
  255.     private boolean subtract(final int idx, final int subtrahend) {
  256.         try {
  257.             final int updated = cells[idx] - subtrahend;
  258.             state |= updated;
  259.             cells[idx] = updated;
  260.             return true;
  261.         } catch (final IndexOutOfBoundsException e) {
  262.             throw new IllegalArgumentException(
  263.                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
  264.         }
  265.     }
  266. }