SetOperations.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.collections4.bloomfilter;

  18. import java.util.function.LongBinaryOperator;

  19. /**
  20.  * Implementations of set operations on BitMapExtractors.
  21.  *
  22.  * @since 4.5.0-M1
  23.  */
  24. public final class SetOperations {

  25.     /**
  26.      * Calculates the cardinality of the logical {@code AND} of the bit maps for the two filters.
  27.      *
  28.      * @param first  the first BitMapExtractor.
  29.      * @param second the second BitMapExtractor
  30.      * @return the cardinality of the {@code AND} of the filters.
  31.      */
  32.     public static int andCardinality(final BitMapExtractor first, final BitMapExtractor second) {
  33.         return cardinality(first, second, (x, y) -> x & y);
  34.     }

  35.     /**
  36.      * Calculates the cardinality of a BitMapExtractor. By necessity this method will visit each bit map created by the bitMapExtractor.
  37.      *
  38.      * @param bitMapExtractor the extractor to calculate the cardinality for.
  39.      * @return the cardinality of the bit maps produced by the bitMapExtractor.
  40.      */
  41.     public static int cardinality(final BitMapExtractor bitMapExtractor) {
  42.         final int[] cardinality = new int[1];
  43.         bitMapExtractor.processBitMaps(l -> {
  44.             cardinality[0] += Long.bitCount(l);
  45.             return true;
  46.         });
  47.         return cardinality[0];
  48.     }

  49.     /**
  50.      * Calculates the cardinality of the result of a LongBinaryOperator using the {@code BitMapExtractor.makePredicate} method.
  51.      *
  52.      * @param first  the first BitMapExtractor
  53.      * @param second the second BitMapExtractor
  54.      * @param op     a long binary operation on where x = {@code first} and y = {@code second} bitmap extractors.
  55.      * @return the calculated cardinality.
  56.      */
  57.     private static int cardinality(final BitMapExtractor first, final BitMapExtractor second, final LongBinaryOperator op) {
  58.         final int[] cardinality = new int[1];

  59.         first.processBitMapPairs(second, (x, y) -> {
  60.             cardinality[0] += Long.bitCount(op.applyAsLong(x, y));
  61.             return true;
  62.         });
  63.         return cardinality[0];
  64.     }

  65.     /**
  66.      * Calculates the Cosine distance between two BitMapExtractor.
  67.      * <p>
  68.      * Cosine distance is defined as {@code 1 - Cosine similarity}
  69.      * </p>
  70.      *
  71.      * @param first  the first BitMapExtractor.
  72.      * @param second the second BitMapExtractor.
  73.      * @return the jaccard distance.
  74.      */
  75.     public static double cosineDistance(final BitMapExtractor first, final BitMapExtractor second) {
  76.         return 1.0 - cosineSimilarity(first, second);
  77.     }

  78.     /**
  79.      * Calculates the Cosine similarity between two BitMapExtractors.
  80.      * <p>
  81.      * Also known as Orchini similarity and the Tucker coefficient of congruence or Ochiai similarity.
  82.      * </p>
  83.      * <p>
  84.      * If either extractor is empty the result is 0 (zero)
  85.      * </p>
  86.      *
  87.      * @param first  the first BitMapExtractor.
  88.      * @param second the second BitMapExtractor.
  89.      * @return the Cosine similarity.
  90.      */
  91.     public static double cosineSimilarity(final BitMapExtractor first, final BitMapExtractor second) {
  92.         final int numerator = andCardinality(first, second);
  93.         // Given that the cardinality is an int then the product as a double will not
  94.         // overflow, we can use one sqrt:
  95.         return numerator == 0 ? 0 : numerator / Math.sqrt(cardinality(first) * cardinality(second));
  96.     }

  97.     /**
  98.      * Calculates the Cosine similarity between two Bloom filters.
  99.      * <p>
  100.      * Also known as Orchini similarity and the Tucker coefficient of congruence or Ochiai similarity.
  101.      * </p>
  102.      * <p>
  103.      * If either filter is empty (no enabled bits) the result is 0 (zero)
  104.      * </p>
  105.      * <p>
  106.      * This is a version of cosineSimilarity optimized for Bloom filters.
  107.      * </p>
  108.      *
  109.      * @param first  the first Bloom filter.
  110.      * @param second the second Bloom filter.
  111.      * @return the Cosine similarity.
  112.      */
  113.     public static double cosineSimilarity(final BloomFilter<?> first, final BloomFilter<?> second) {
  114.         final int numerator = andCardinality(first, second);
  115.         // Given that the cardinality is an int then the product as a double will not
  116.         // overflow, we can use one sqrt:
  117.         return numerator == 0 ? 0 : numerator / Math.sqrt(first.cardinality() * second.cardinality());
  118.     }

  119.     /**
  120.      * Calculates the Hamming distance between two BitMapExtractors.
  121.      *
  122.      * @param first  the first BitMapExtractor.
  123.      * @param second the second BitMapExtractor.
  124.      * @return the Hamming distance.
  125.      */
  126.     public static int hammingDistance(final BitMapExtractor first, final BitMapExtractor second) {
  127.         return xorCardinality(first, second);
  128.     }

  129.     /**
  130.      * Calculates the Jaccard distance between two BitMapExtractor.
  131.      * <p>
  132.      * Jaccard distance is defined as {@code 1 - Jaccard similarity}
  133.      * </p>
  134.      *
  135.      * @param first  the first BitMapExtractor.
  136.      * @param second the second BitMapExtractor.
  137.      * @return the Jaccard distance.
  138.      */
  139.     public static double jaccardDistance(final BitMapExtractor first, final BitMapExtractor second) {
  140.         return 1.0 - jaccardSimilarity(first, second);
  141.     }

  142.     /**
  143.      * Calculates the Jaccard similarity between two BitMapExtractor.
  144.      * <p>
  145.      * Also known as Jaccard index, Intersection over Union, and Jaccard similarity coefficient
  146.      * </p>
  147.      *
  148.      * @param first  the first BitMapExtractor.
  149.      * @param second the second BitMapExtractor.
  150.      * @return the Jaccard similarity.
  151.      */
  152.     public static double jaccardSimilarity(final BitMapExtractor first, final BitMapExtractor second) {
  153.         final int[] cardinality = new int[2];
  154.         first.processBitMapPairs(second, (x, y) -> {
  155.             cardinality[0] += Long.bitCount(x & y);
  156.             cardinality[1] += Long.bitCount(x | y);
  157.             return true;
  158.         });
  159.         final int intersection = cardinality[0];
  160.         return intersection == 0 ? 0 : intersection / (double) cardinality[1];
  161.     }

  162.     /**
  163.      * Calculates the cardinality of the logical {@code OR} of the bit maps for the two filters.
  164.      *
  165.      * @param first  the first BitMapExtractor.
  166.      * @param second the second BitMapExtractor
  167.      * @return the cardinality of the {@code OR} of the filters.
  168.      */
  169.     public static int orCardinality(final BitMapExtractor first, final BitMapExtractor second) {
  170.         return cardinality(first, second, (x, y) -> x | y);
  171.     }

  172.     /**
  173.      * Calculates the cardinality of the logical {@code XOR} of the bit maps for the two filters.
  174.      *
  175.      * @param first  the first BitMapExtractor.
  176.      * @param second the second BitMapExtractor
  177.      * @return the cardinality of the {@code XOR} of the filters.
  178.      */
  179.     public static int xorCardinality(final BitMapExtractor first, final BitMapExtractor second) {
  180.         return cardinality(first, second, (x, y) -> x ^ y);
  181.     }

  182.     /**
  183.      * Do not instantiate.
  184.      */
  185.     private SetOperations() {
  186.     }
  187. }