JaccardSimilarity.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.HashSet;
  19. import java.util.Set;

  20. /**
  21.  * Measures the Jaccard similarity (aka Jaccard index) of two sets of character sequence. Jaccard similarity is the size of the intersection divided by the size
  22.  * of the union of the two sets.
  23.  *
  24.  * <p>
  25.  * For further explanation about Jaccard Similarity, refer https://en.wikipedia.org/wiki/Jaccard_index
  26.  * </p>
  27.  *
  28.  * @since 1.0
  29.  */
  30. public class JaccardSimilarity implements SimilarityScore<Double> {

  31.     /**
  32.      * Singleton instance.
  33.      */
  34.     static final JaccardSimilarity INSTANCE = new JaccardSimilarity();

  35.     /**
  36.      * Creates a new instance.
  37.      */
  38.     public JaccardSimilarity() {
  39.         // empty
  40.     }

  41.     /**
  42.      * Computes the Jaccard Similarity of two set character sequence passed as input.
  43.      *
  44.      * @param left  first input sequence.
  45.      * @param right second input sequence.
  46.      * @return index.
  47.      * @throws IllegalArgumentException if either String input {@code null}.
  48.      */
  49.     @Override
  50.     public Double apply(final CharSequence left, final CharSequence right) {
  51.         return apply(SimilarityInput.input(left), SimilarityInput.input(right));
  52.     }

  53.     /**
  54.      * Computes the Jaccard Similarity of two character sequences passed as input. Does the calculation by identifying the union (characters in at least one of
  55.      * the two sets) of the two sets and intersection (characters which are present in set one which are present in set two)
  56.      *
  57.      * @param <E>   The type of similarity score unit.
  58.      * @param left  first input sequence.
  59.      * @param right second input sequence.
  60.      * @return index.
  61.      * @since 1.13.0
  62.      */
  63.     public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
  64.         if (left == null || right == null) {
  65.             throw new IllegalArgumentException("Input cannot be null");
  66.         }
  67.         final int leftLength = left.length();
  68.         final int rightLength = right.length();
  69.         if (leftLength == 0 && rightLength == 0) {
  70.             return 1d;
  71.         }
  72.         if (leftLength == 0 || rightLength == 0) {
  73.             return 0d;
  74.         }
  75.         final Set<E> leftSet = new HashSet<>();
  76.         for (int i = 0; i < leftLength; i++) {
  77.             leftSet.add(left.at(i));
  78.         }
  79.         final Set<E> rightSet = new HashSet<>();
  80.         for (int i = 0; i < rightLength; i++) {
  81.             rightSet.add(right.at(i));
  82.         }
  83.         final Set<E> unionSet = new HashSet<>(leftSet);
  84.         unionSet.addAll(rightSet);
  85.         final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size();
  86.         return 1.0d * intersectionSize / unionSet.size();
  87.     }
  88. }