Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.collections4.bloomfilter;
018
019import java.util.Objects;
020import java.util.function.IntPredicate;
021
022/**
023 * A Hasher that implements combinatorial hashing as described by
024 * <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch and Mitzenmacher</a> using the enhanced double hashing technique
025 * described in the wikipedia article  <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>.
026 * <p>
027 * Common use for this hasher is to generate bit indices from a byte array output of a hashing
028 * or MessageDigest algorithm.</p>
029 *
030 * <h2>Thoughts on the hasher input</h2>
031 *
032 * <p>Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than
033 * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be
034 * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits
035 * changes (but is still larger than the {@code increment}). In a worse case scenario with small {@code initial} and {@code increment} for
036 * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial}
037 * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the
038 * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275, this covers 4.25% of the filter. This also
039 * ignores the negative wrapping but the behavior is the same, some bits cannot be reached.
040 * </p><p>
041 * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger
042 * than the number of bits then the modulus will create a 'random' position and increment within the size.
043 * </p>
044 *
045 * @since 4.5.0-M1
046 */
047public class EnhancedDoubleHasher implements Hasher {
048
049    /**
050     * Convert bytes to big-endian long filling with zero bytes as necessary.
051     *
052     * @param byteArray the byte array to extract the values from.
053     * @param offset the offset to start extraction from.
054     * @param len the length of the extraction, may be longer than 8.
055     * @return
056     */
057    private static long toLong(final byte[] byteArray, final int offset, final int len) {
058        long val = 0;
059        int shift = Long.SIZE;
060        final int end = offset + Math.min(len, Long.BYTES);
061        for (int i = offset; i < end; i++) {
062            shift -= Byte.SIZE;
063            val |= (long) (byteArray[i] & 0xFF) << shift;
064        }
065        return val;
066    }
067
068    /**
069     * The initial hash value.
070     */
071    private final long initial;
072
073    /**
074     * The value to increment the hash value by.
075     */
076    private final long increment;
077
078    /**
079     * Constructs the EnhancedDoubleHasher from a byte array.
080     * <p>
081     * This method simplifies the conversion from a Digest or hasher algorithm output
082     * to the two values used by the EnhancedDoubleHasher.</p>
083     * <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value.
084     * Excess bytes are ignored.
085     * If there are fewer than 16 bytes the following conversions are made.
086     * </p>
087     * <ol>
088     * <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li>
089     * <li>The bytes allotted are read in big-endian order any byte not populated is set to zero.</li>
090     * </ol>
091     * <p>
092     * This ensures that small arrays generate the largest possible increment and initial values.
093     * </p>
094     *
095     * @param buffer the buffer to extract the longs from.
096     * @throws IllegalArgumentException is buffer length is zero.
097     */
098    public EnhancedDoubleHasher(final byte[] buffer) {
099        if (buffer.length == 0) {
100            throw new IllegalArgumentException("buffer length must be greater than 0");
101        }
102        // divide by 2
103        final int segment = buffer.length / 2;
104        this.initial = toLong(buffer, 0, segment);
105        this.increment = toLong(buffer, segment, buffer.length - segment);
106    }
107
108    /**
109     * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values.
110     *
111     * @param initial The initial value for the hasher.
112     * @param increment The value to increment the hash by on each iteration.
113     */
114    public EnhancedDoubleHasher(final long initial, final long increment) {
115        this.initial = initial;
116        this.increment = increment;
117    }
118
119    /**
120     * Gets the increment value for the hash calculation.
121     *
122     * @return the increment value for the hash calculation.
123     */
124    long getIncrement() {
125        return increment;
126    }
127
128    /**
129     * Gets the initial value for the hash calculation.
130     *
131     * @return the initial value for the hash calculation.
132     */
133    long getInitial() {
134        return initial;
135    }
136
137    @Override
138    public IndexExtractor indices(final Shape shape) {
139        Objects.requireNonNull(shape, "shape");
140
141        return new IndexExtractor() {
142
143            @Override
144            public int[] asIndexArray() {
145                final int[] result = new int[shape.getNumberOfHashFunctions()];
146                final int[] idx = new int[1];
147
148                // This method needs to return duplicate indices
149
150                processIndices(i -> {
151                    result[idx[0]++] = i;
152                    return true;
153                });
154                return result;
155            }
156
157            @Override
158            public boolean processIndices(final IntPredicate consumer) {
159                Objects.requireNonNull(consumer, "consumer");
160                final int bits = shape.getNumberOfBits();
161                // Enhanced double hashing:
162                // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits
163                // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing
164                //
165                // Essentially this is computing a wrapped modulus from a start point and an
166                // increment and an additional term as a tetrahedral number.
167                // You only need two modulus operations before the loop. Within the loop
168                // the modulus is handled using the sign bit to detect wrapping to ensure:
169                // 0 <= index < bits
170                // 0 <= inc < bits
171                // The final hash is:
172                // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits)
173
174                int index = BitMaps.mod(initial, bits);
175                if (!consumer.test(index)) {
176                    return false;
177                }
178                int inc = BitMaps.mod(increment, bits);
179
180                final int k = shape.getNumberOfHashFunctions();
181
182                if (k >= bits) {
183                    // the tetraheadral incrementer.  We need to ensure that this
184                    // number does not exceed bits-1 or we may end up with an index > bits.
185                    int tet = 1;
186                    for (int i = 1; i < k; i++) {
187                        // Update index and handle wrapping
188                        index -= inc;
189                        index = index < 0 ? index + bits : index;
190                        if (!consumer.test(index)) {
191                            return false;
192                        }
193
194                        // Incorporate the counter into the increment to create a
195                        // tetrahedral number additional term, and handle wrapping.
196                        inc -= tet;
197                        inc = inc < 0 ? inc + bits : inc;
198                        if (++tet == bits) {
199                            tet = 0;
200                        }
201                    }
202                } else {
203                    for (int i = 1; i < k; i++) {
204                        // Update index and handle wrapping
205                        index -= inc;
206                        index = index < 0 ? index + bits : index;
207                        if (!consumer.test(index)) {
208                            return false;
209                        }
210
211                        // Incorporate the counter into the increment to create a
212                        // tetrahedral number additional term, and handle wrapping.
213                        inc -= i;
214                        inc = inc < 0 ? inc + bits : inc;
215                    }
216
217                }
218                return true;
219            }
220        };
221    }
222}