001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019import java.util.Arrays; 020import java.util.Objects; 021import java.util.function.IntPredicate; 022import java.util.function.LongPredicate; 023import java.util.stream.IntStream; 024 025/** 026 * A counting Bloom filter using an int array to track cells for each enabled bit. 027 * 028 * <p> 029 * Any operation that results in negative counts or integer overflow of counts will mark this filter as invalid. This transition is not reversible. The 030 * operation is completed in full, no exception is raised and the state is set to invalid. This allows the cells for the filter immediately prior to the 031 * operation that created the invalid state to be recovered. See the documentation in {@link #isValid()} for details. 032 * </p> 033 * 034 * <p> 035 * All the operations in the filter assume the cells are currently valid, for example {@code cardinality} or {@code contains} operations. Behavior of an invalid 036 * filter is undefined. It will no longer function identically to a standard Bloom filter that is the merge of all the Bloom filters that have been added to and 037 * not later subtracted from the counting Bloom filter. 038 * </p> 039 * 040 * <p> 041 * The maximum supported number of items that can be stored in the filter is limited by the maximum array size combined with the {@link Shape}. For example an 042 * implementation using a {@link Shape} with a false-positive probability of 1e-6 and {@link Integer#MAX_VALUE} bits can reversibly store approximately 75 043 * million items using 20 hash functions per item with a memory consumption of approximately 8 GB. 044 * </p> 045 * 046 * @see Shape 047 * @see CellExtractor 048 * @since 4.5.0-M1 049 */ 050public final class ArrayCountingBloomFilter implements CountingBloomFilter { 051 052 /** 053 * The shape of this Bloom filter. 054 */ 055 private final Shape shape; 056 057 /** 058 * The cell for each bit index in the filter. 059 */ 060 private final int[] cells; 061 062 /** 063 * The state flag. This is a bitwise {@code OR} of the entire history of all updated 064 * cells. If negative then a negative cell or integer overflow has occurred on 065 * one or more cells in the history of the filter and the state is invalid. 066 * 067 * <p>Maintenance of this state flag is branch-free for improved performance. It 068 * eliminates a conditional check for a negative cell during remove/subtract 069 * operations and a conditional check for integer overflow during merge/add 070 * operations.</p> 071 * 072 * <p>Note: Integer overflow is unlikely in realistic usage scenarios. A cell 073 * that overflows indicates that the number of items in the filter exceeds the 074 * maximum possible size (number of bits) of any Bloom filter constrained by 075 * integer indices. At this point the filter is most likely full (all bits are 076 * non-zero) and thus useless.</p> 077 * 078 * <p>Negative cells are a concern if the filter is used incorrectly by 079 * removing an item that was never added. It is expected that a user of a 080 * counting Bloom filter will not perform this action as it is a mistake. 081 * Enabling an explicit recovery path for negative or overflow cells is a major 082 * performance burden not deemed necessary for the unlikely scenarios when an 083 * invalid state is created. Maintenance of the state flag is a concession to 084 * flag improper use that should not have a major performance impact.</p> 085 */ 086 private int state; 087 088 private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) { 089 this.shape = source.shape; 090 this.state = source.state; 091 this.cells = source.cells.clone(); 092 } 093 094 /** 095 * Constructs an empty counting Bloom filter with the specified shape. 096 * 097 * @param shape the shape of the filter 098 */ 099 public ArrayCountingBloomFilter(final Shape shape) { 100 Objects.requireNonNull(shape, "shape"); 101 this.shape = shape; 102 cells = new int[shape.getNumberOfBits()]; 103 } 104 105 @Override 106 public boolean add(final CellExtractor other) { 107 Objects.requireNonNull(other, "other"); 108 other.processCells(this::add); 109 return isValid(); 110 } 111 112 /** 113 * Add to the cell for the bit index. 114 * 115 * @param idx the index 116 * @param addend the amount to add 117 * @return {@code true} always. 118 */ 119 private boolean add(final int idx, final int addend) { 120 try { 121 final int updated = cells[idx] + addend; 122 state |= updated; 123 cells[idx] = updated; 124 return true; 125 } catch (final IndexOutOfBoundsException e) { 126 throw new IllegalArgumentException( 127 String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); 128 } 129 } 130 131 @Override 132 public int[] asIndexArray() { 133 return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray(); 134 } 135 136 @Override 137 public int cardinality() { 138 return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count(); 139 } 140 141 @Override 142 public int characteristics() { 143 return SPARSE; 144 } 145 146 @Override 147 public void clear() { 148 Arrays.fill(cells, 0); 149 } 150 151 @Override 152 public boolean contains(final BitMapExtractor bitMapExtractor) { 153 return contains(IndexExtractor.fromBitMapExtractor(bitMapExtractor)); 154 } 155 156 @Override 157 public boolean contains(final IndexExtractor indexExtractor) { 158 return indexExtractor.processIndices(idx -> cells[idx] != 0); 159 } 160 161 /** 162 * Creates a new instance of this {@link ArrayCountingBloomFilter} with the same properties as the current one. 163 * 164 * @return a copy of this BloomFilter. 165 */ 166 @Override 167 public ArrayCountingBloomFilter copy() { 168 return new ArrayCountingBloomFilter(this); 169 } 170 171 @Override 172 public int getMaxCell() { 173 return Integer.MAX_VALUE; 174 } 175 176 @Override 177 public int getMaxInsert(final CellExtractor cellExtractor) { 178 final int[] max = { Integer.MAX_VALUE }; 179 cellExtractor.processCells((x, y) -> { 180 final int count = cells[x] / y; 181 if (count < max[0]) { 182 max[0] = count; 183 } 184 return max[0] > 0; 185 }); 186 return max[0]; 187 } 188 189 @Override 190 public Shape getShape() { 191 return shape; 192 } 193 194 /** 195 * {@inheritDoc} 196 * 197 * <p> 198 * <em>Implementation note</em> 199 * </p> 200 * 201 * <p> 202 * The state transition to invalid is permanent. 203 * </p> 204 * 205 * <p> 206 * This implementation does not correct negative cells to zero or integer overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that generated 207 * invalid cells can be reversed by using the complement of the original operation with the same Bloom filter. This will restore the cells to the state 208 * prior to the invalid operation. Cells can then be extracted using {@link #processCells(CellPredicate)}. 209 * </p> 210 */ 211 @Override 212 public boolean isValid() { 213 return state >= 0; 214 } 215 216 @Override 217 public boolean processBitMaps(final LongPredicate consumer) { 218 Objects.requireNonNull(consumer, "consumer"); 219 final int blocksm1 = BitMaps.numberOfBitMaps(cells.length) - 1; 220 int i = 0; 221 long value; 222 // must break final block separate as the number of bits may not fall on the long boundary 223 for (int j = 0; j < blocksm1; j++) { 224 value = 0; 225 for (int k = 0; k < Long.SIZE; k++) { 226 if (cells[i++] != 0) { 227 value |= BitMaps.getLongBit(k); 228 } 229 } 230 if (!consumer.test(value)) { 231 return false; 232 } 233 } 234 // Final block 235 value = 0; 236 for (int k = 0; i < cells.length; k++) { 237 if (cells[i++] != 0) { 238 value |= BitMaps.getLongBit(k); 239 } 240 } 241 return consumer.test(value); 242 } 243 244 @Override 245 public boolean processCells(final CellPredicate consumer) { 246 Objects.requireNonNull(consumer, "consumer"); 247 for (int i = 0; i < cells.length; i++) { 248 if (cells[i] != 0 && !consumer.test(i, cells[i])) { 249 return false; 250 } 251 } 252 return true; 253 } 254 255 @Override 256 public boolean processIndices(final IntPredicate consumer) { 257 Objects.requireNonNull(consumer, "consumer"); 258 for (int i = 0; i < cells.length; i++) { 259 if (cells[i] != 0 && !consumer.test(i)) { 260 return false; 261 } 262 } 263 return true; 264 } 265 266 @Override 267 public boolean subtract(final CellExtractor other) { 268 Objects.requireNonNull(other, "other"); 269 other.processCells(this::subtract); 270 return isValid(); 271 } 272 273 /** 274 * Subtracts from the cell for the bit index. 275 * 276 * @param idx the index 277 * @param subtrahend the amount to subtract 278 * @return {@code true} always. 279 */ 280 private boolean subtract(final int idx, final int subtrahend) { 281 try { 282 final int updated = cells[idx] - subtrahend; 283 state |= updated; 284 cells[idx] = updated; 285 return true; 286 } catch (final IndexOutOfBoundsException e) { 287 throw new IllegalArgumentException( 288 String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); 289 } 290 } 291}