View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.HashSet;
20  import java.util.Set;
21  
22  /**
23   * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
24   * sequence. Jaccard similarity is the size of the intersection divided by the
25   * size of the union of the two sets.
26   *
27   * <p>
28   * For further explanation about Jaccard Similarity, refer
29   * https://en.wikipedia.org/wiki/Jaccard_index
30   * </p>
31   *
32   * @since 1.0
33   */
34  public class JaccardSimilarity implements SimilarityScore<Double> {
35  
36      /**
37       * Singleton instance.
38       */
39      static final JaccardSimilarity INSTANCE = new JaccardSimilarity();
40  
41      /**
42       * Calculates Jaccard Similarity of two set character sequence passed as
43       * input.
44       *
45       * @param left first character sequence
46       * @param right second character sequence
47       * @return index
48       * @throws IllegalArgumentException
49       *             if either String input {@code null}
50       */
51      @Override
52      public Double apply(final CharSequence left, final CharSequence right) {
53          if (left == null || right == null) {
54              throw new IllegalArgumentException("Input cannot be null");
55          }
56          return calculateJaccardSimilarity(left, right);
57      }
58  
59      /**
60       * Calculates Jaccard Similarity of two character sequences passed as
61       * input. Does the calculation by identifying the union (characters in at
62       * least one of the two sets) of the two sets and intersection (characters
63       * which are present in set one which are present in set two)
64       *
65       * @param left first character sequence
66       * @param right second character sequence
67       * @return index
68       */
69      private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) {
70          final int leftLength = left.length();
71          final int rightLength = right.length();
72          if (leftLength == 0 && rightLength == 0) {
73              return 1d;
74          }
75          if (leftLength == 0 || rightLength == 0) {
76              return 0d;
77          }
78          final Set<Character> leftSet = new HashSet<>();
79          for (int i = 0; i < leftLength; i++) {
80              leftSet.add(left.charAt(i));
81          }
82          final Set<Character> rightSet = new HashSet<>();
83          for (int i = 0; i < rightLength; i++) {
84              rightSet.add(right.charAt(i));
85          }
86          final Set<Character> unionSet = new HashSet<>(leftSet);
87          unionSet.addAll(rightSet);
88          final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size();
89          return 1.0d * intersectionSize / unionSet.size();
90      }
91  }