CountingBloomFilter.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. import java.util.Objects;

  19. /**
  20.  * The interface that describes a Bloom filter that associates a count with each
  21.  * bit index rather than a bit.  This allows reversal of merge operations with
  22.  * remove operations.
  23.  *
  24.  * <p>A counting Bloom filter is expected to function identically to a standard
  25.  * Bloom filter that is the merge of all the Bloom filters that have been added
  26.  * to and not later subtracted from the counting Bloom filter. The functional
  27.  * state of a CountingBloomFilter at the start and end of a series of merge and
  28.  * subsequent remove operations of the same Bloom filters, irrespective of
  29.  * remove order, is expected to be the same.</p>
  30.  *
  31.  * <p>Removal of a filter that has not previously been merged results in an
  32.  * invalid state where the cells no longer represent a sum of merged Bloom
  33.  * filters. It is impossible to validate merge and remove exactly without
  34.  * explicitly storing all filters. Consequently such an operation may go
  35.  * undetected. The CountingBloomFilter maintains a state flag that is used as a
  36.  * warning that an operation was performed that resulted in invalid cells and
  37.  * thus an invalid state. For example this may occur if a cell for an index was
  38.  * set to negative following a remove operation.</p>
  39.  *
  40.  * <p>Implementations should document the expected state of the filter after an
  41.  * operation that generates invalid cells, and any potential recovery options.
  42.  * An implementation may support a reversal of the operation to restore the
  43.  * state to that prior to the operation. In the event that invalid cells are
  44.  * adjusted to a valid range then it should be documented if there has been
  45.  * irreversible information loss.</p>
  46.  *
  47.  * <p>Implementations may choose to throw an exception during an operation that
  48.  * generates invalid cells. Implementations should document the expected state
  49.  * of the filter after such an operation. For example are the cells not updated,
  50.  * partially updated or updated entirely before the exception is raised.</p>
  51.  *
  52.  * @see CellExtractor
  53.  * @since 4.5.0-M1
  54.  */
  55. public interface CountingBloomFilter extends BloomFilter<CountingBloomFilter>, CellExtractor {

  56.     // Query Operations

  57.     /**
  58.      * Adds the specified CellExtractor to this Bloom filter.
  59.      *
  60.      * <p>Specifically
  61.      * all cells for the indexes identified by the {@code other} will be incremented
  62.      * by their corresponding values in the {@code other}.</p>
  63.      *
  64.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  65.      *
  66.      * @param other the CellExtractor to add.
  67.      * @return {@code true} if the addition was successful and the state is valid
  68.      * @see #isValid()
  69.      * @see #subtract(CellExtractor)
  70.      */
  71.     boolean add(CellExtractor other);

  72.     /**
  73.      * Gets the maximum allowable value for a cell count in this Counting filter.
  74.      *
  75.      * @return the maximum allowable value for a cell count in this Counting filter.
  76.      */
  77.     int getMaxCell();

  78.     /**
  79.      * Determines the maximum number of times the BitMapExtractor could have been merged into this counting filter.
  80.      *
  81.      * @param bitMapExtractor the BitMapExtractor to provide the indices.
  82.      * @return the maximum number of times the BitMapExtractor could have been inserted.
  83.      */
  84.     default int getMaxInsert(final BitMapExtractor bitMapExtractor) {
  85.         if (!contains(bitMapExtractor)) {
  86.             return 0;
  87.         }
  88.         final long[] bitMaps = bitMapExtractor.asBitMapArray();
  89.         final int[] max = { Integer.MAX_VALUE };
  90.         processCells((x, y) -> {
  91.             if ((bitMaps[BitMaps.getLongIndex(x)] & BitMaps.getLongBit(x)) != 0) {
  92.                 max[0] = max[0] <= y ? max[0] : y;
  93.             }
  94.             return true;
  95.         });
  96.         return max[0];
  97.     }

  98.     /**
  99.      * Determines the maximum number of times the Bloom filter could have been merged into this counting filter.
  100.      *
  101.      * @param bloomFilter the Bloom filter the check for.
  102.      * @return the maximum number of times the Bloom filter could have been inserted.
  103.      */
  104.     default int getMaxInsert(final BloomFilter<?> bloomFilter) {
  105.         return getMaxInsert((BitMapExtractor) bloomFilter);
  106.     }

  107.     /**
  108.      * Determines the maximum number of times the Cell Extractor could have been added.
  109.      *
  110.      * @param cellExtractor the extractor of cells.
  111.      * @return the maximum number of times the CellExtractor could have been inserted.
  112.      */
  113.     int getMaxInsert(CellExtractor cellExtractor);

  114.     /**
  115.      * Determines the maximum number of times the Hasher could have been merged into this counting filter.
  116.      *
  117.      * @param hasher the Hasher to provide the indices.
  118.      * @return the maximum number of times the hasher could have been inserted.
  119.      */
  120.     default int getMaxInsert(final Hasher hasher) {
  121.         return getMaxInsert(hasher.indices(getShape()));
  122.     }

  123.     /**
  124.      * Determines the maximum number of times the IndexExtractor could have been merged into this counting filter.
  125.      * <p>
  126.      * To determine how many times an indexExtractor could have been added create a CellExtractor from the indexExtractor and check that
  127.      * </p>
  128.      *
  129.      * @param indexExtractor the extractor to drive the count check.
  130.      * @return the maximum number of times the IndexExtractor could have been inserted.
  131.      * @see #getMaxInsert(CellExtractor)
  132.      */
  133.     default int getMaxInsert(final IndexExtractor indexExtractor) {
  134.         return getMaxInsert(CellExtractor.from(indexExtractor.uniqueIndices()));
  135.     }

  136.     /**
  137.      * Returns {@code true} if the internal state is valid.
  138.      *
  139.      * <p>This flag is a warning that an addition or
  140.      * subtraction of cells from this filter resulted in an invalid cell for one or more
  141.      * indexes. For example this may occur if a cell for an index was
  142.      * set to negative following a subtraction operation, or overflows the value specified by {@code getMaxCell()} following an
  143.      * addition operation.</p>
  144.      *
  145.      * <p>A counting Bloom filter that has an invalid state is no longer ensured to function
  146.      * identically to a standard Bloom filter instance that is the merge of all the Bloom filters
  147.      * that have been added to and not later subtracted from this counting Bloom filter.</p>
  148.      *
  149.      * <p>Note: The change to an invalid state may or may not be reversible. Implementations
  150.      * are expected to document their policy on recovery from an addition or removal operation
  151.      * that generated an invalid state.</p>
  152.      *
  153.      * @return {@code true} if the state is valid
  154.      */
  155.     boolean isValid();

  156.     /**
  157.      * Merges the specified BitMap extractor into this Bloom filter.
  158.      *
  159.      * <p>Specifically: all cells for the indexes identified by the {@code bitMapExtractor} will be incremented by 1.</p>
  160.      *
  161.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  162.      *
  163.      * @param bitMapExtractor the BitMapExtractor
  164.      * @return {@code true} if the removal was successful and the state is valid
  165.      * @see #isValid()
  166.      * @see #add(CellExtractor)
  167.      */
  168.     @Override
  169.     default boolean merge(final BitMapExtractor bitMapExtractor) {
  170.         return merge(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
  171.     }

  172.     /**
  173.      * Merges the specified Bloom filter into this Bloom filter.
  174.      *
  175.      * <p>Specifically: all cells for the indexes identified by the {@code other} filter will be incremented by 1.</p>
  176.      *
  177.      * <p>Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an
  178.      * IndexExtractor.</p>
  179.      *
  180.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  181.      *
  182.      * @param other the other Bloom filter
  183.      * @return {@code true} if the removal was successful and the state is valid
  184.      * @see #isValid()
  185.      * @see #add(CellExtractor)
  186.      */
  187.     @Override
  188.     default boolean merge(final BloomFilter<?> other) {
  189.         Objects.requireNonNull(other, "other");
  190.         return merge((IndexExtractor) other);
  191.     }

  192.     /**
  193.      * Merges the specified Hasher into this Bloom filter.
  194.      *
  195.      * <p>Specifically: all cells for the unique indexes identified by the {@code hasher} will be incremented by 1.</p>
  196.      *
  197.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  198.      *
  199.      * @param hasher the hasher
  200.      * @return {@code true} if the removal was successful and the state is valid
  201.      * @see #isValid()
  202.      * @see #add(CellExtractor)
  203.      */
  204.     @Override
  205.     default boolean merge(final Hasher hasher) {
  206.         Objects.requireNonNull(hasher, "hasher");
  207.         return merge(hasher.indices(getShape()));
  208.     }

  209.     /**
  210.      * Merges the specified index extractor into this Bloom filter.
  211.      *
  212.      * <p>Specifically: all unique cells for the indices identified by the {@code indexExtractor} will be incremented by 1.</p>
  213.      *
  214.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  215.      *
  216.      * <p>Notes:</p>
  217.      * <ul>
  218.      * <li>If indices that are returned multiple times should be incremented multiple times convert the IndexExtractor
  219.      * to a CellExtractor and add that.</li>
  220.      * <li>Implementations should throw {@code IllegalArgumentException} and no other exception on bad input.</li>
  221.      * </ul>
  222.      * @param indexExtractor the IndexExtractor
  223.      * @return {@code true} if the removal was successful and the state is valid
  224.      * @see #isValid()
  225.      * @see #add(CellExtractor)
  226.      */
  227.     @Override
  228.     default boolean merge(final IndexExtractor indexExtractor) {
  229.         Objects.requireNonNull(indexExtractor, "indexExtractor");
  230.         try {
  231.             return add(CellExtractor.from(indexExtractor.uniqueIndices()));
  232.         } catch (final IndexOutOfBoundsException e) {
  233.             throw new IllegalArgumentException(
  234.                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e);
  235.         }
  236.     }

  237.     /**
  238.      * Removes the specified BitMapExtractor from this Bloom filter.
  239.      *
  240.      * <p>Specifically all cells for the indices produced by the {@code bitMapExtractor} will be
  241.      * decremented by 1.</p>
  242.      *
  243.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  244.      *
  245.      * @param bitMapExtractor the BitMapExtractor to provide the indexes
  246.      * @return {@code true} if the removal was successful and the state is valid
  247.      * @see #isValid()
  248.      * @see #subtract(CellExtractor)
  249.      */
  250.     default boolean remove(final BitMapExtractor bitMapExtractor) {
  251.         return remove(IndexExtractor.fromBitMapExtractor(bitMapExtractor));
  252.     }

  253.     /**
  254.      * Removes the specified Bloom filter from this Bloom filter.
  255.      *
  256.      * <p>Specifically: all cells for the indexes identified by the {@code other} filter will be decremented by 1.</p>
  257.      *
  258.      * <p>Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an
  259.      * IndexExtractor.</p>
  260.      *
  261.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  262.      *
  263.      * @param other the other Bloom filter
  264.      * @return {@code true} if the removal was successful and the state is valid
  265.      * @see #isValid()
  266.      * @see #subtract(CellExtractor)
  267.      */
  268.     default boolean remove(final BloomFilter<?> other) {
  269.         return remove((IndexExtractor) other);
  270.     }

  271.     /**
  272.      * Removes the unique values from the specified hasher from this Bloom filter.
  273.      *
  274.      * <p>Specifically all cells for the unique indices produced by the {@code hasher} will be
  275.      * decremented by 1.</p>
  276.      *
  277.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  278.      *
  279.      * @param hasher the hasher to provide the indexes
  280.      * @return {@code true} if the removal was successful and the state is valid
  281.      * @see #isValid()
  282.      * @see #subtract(CellExtractor)
  283.      */
  284.     default boolean remove(final Hasher hasher) {
  285.         Objects.requireNonNull(hasher, "hasher");
  286.         return remove(hasher.indices(getShape()));
  287.     }

  288.     /**
  289.      * Removes the values from the specified IndexExtractor from the Bloom filter from this Bloom filter.
  290.      *
  291.      * <p>Specifically all cells for the unique indices produced by the {@code hasher} will be
  292.      * decremented by 1.</p>
  293.      *
  294.      * <p>This method will return {@code true} if the filter is valid after the operation.</p>
  295.      *
  296.      * <p>Note: If indices that are returned multiple times should be decremented multiple times convert the IndexExtractor
  297.      * to a CellExtractor and subtract that.</p>
  298.      *
  299.      * @param indexExtractor the IndexExtractor to provide the indexes
  300.      * @return {@code true} if the removal was successful and the state is valid
  301.      * @see #isValid()
  302.      * @see #subtract(CellExtractor)
  303.      */
  304.     default boolean remove(final IndexExtractor indexExtractor) {
  305.         Objects.requireNonNull(indexExtractor, "indexExtractor");
  306.         try {
  307.             return subtract(CellExtractor.from(indexExtractor.uniqueIndices()));
  308.         } catch (final IndexOutOfBoundsException e) {
  309.             throw new IllegalArgumentException(
  310.                     String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()));
  311.         }
  312.     }

  313.     /**
  314.      * Adds the specified CellExtractor to this Bloom filter.
  315.      *
  316.      * <p>Specifically
  317.      * all cells for the indexes identified by the {@code other} will be decremented
  318.      * by their corresponding values in the {@code other}.</p>
  319.      *
  320.      * <p>This method will return true if the filter is valid after the operation.</p>
  321.      *
  322.      * @param other the CellExtractor to subtract.
  323.      * @return {@code true} if the subtraction was successful and the state is valid
  324.      * @see #isValid()
  325.      * @see #add(CellExtractor)
  326.      */
  327.     boolean subtract(CellExtractor other);

  328.     /**
  329.      * The default implementation is a no-op since the counting bloom filter returns an unique IndexExtractor by default.
  330.      * @return this counting Bloom filter.
  331.      */
  332.     @Override
  333.     default IndexExtractor uniqueIndices() {
  334.         return this;
  335.     }
  336. }