JaccardSimilarity.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.HashSet;
  19. import java.util.Set;

  20. /**
  21.  * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
  22.  * sequence. Jaccard similarity is the size of the intersection divided by the
  23.  * size of the union of the two sets.
  24.  *
  25.  * <p>
  26.  * For further explanation about Jaccard Similarity, refer
  27.  * https://en.wikipedia.org/wiki/Jaccard_index
  28.  * </p>
  29.  *
  30.  * @since 1.0
  31.  */
  32. public class JaccardSimilarity implements SimilarityScore<Double> {

  33.     /**
  34.      * Calculates Jaccard Similarity of two set character sequence passed as
  35.      * input.
  36.      *
  37.      * @param left first character sequence
  38.      * @param right second character sequence
  39.      * @return index
  40.      * @throws IllegalArgumentException
  41.      *             if either String input {@code null}
  42.      */
  43.     @Override
  44.     public Double apply(CharSequence left, CharSequence right) {
  45.         if (left == null || right == null) {
  46.             throw new IllegalArgumentException("Input cannot be null");
  47.         }
  48.         return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
  49.     }

  50.     /**
  51.      * Calculates Jaccard Similarity of two character sequences passed as
  52.      * input. Does the calculation by identifying the union (characters in at
  53.      * least one of the two sets) of the two sets and intersection (characters
  54.      * which are present in set one which are present in set two)
  55.      *
  56.      * @param left first character sequence
  57.      * @param right second character sequence
  58.      * @return index
  59.      */
  60.     private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
  61.         Set<String> intersectionSet = new HashSet<String>();
  62.         Set<String> unionSet = new HashSet<String>();
  63.         boolean unionFilled = false;
  64.         int leftLength = left.length();
  65.         int rightLength = right.length();
  66.         if (leftLength == 0 || rightLength == 0) {
  67.             return 0d;
  68.         }

  69.         for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
  70.             unionSet.add(String.valueOf(left.charAt(leftIndex)));
  71.             for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
  72.                 if (!unionFilled) {
  73.                     unionSet.add(String.valueOf(right.charAt(rightIndex)));
  74.                 }
  75.                 if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
  76.                     intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
  77.                 }
  78.             }
  79.             unionFilled = true;
  80.         }
  81.         return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
  82.     }
  83. }