View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.HashSet;
20  import java.util.Set;
21  
22  /**
23   * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
24   * sequence. Jaccard similarity is the size of the intersection divided by the
25   * size of the union of the two sets.
26   *
27   * <p>
28   * For further explanation about Jaccard Similarity, refer
29   * https://en.wikipedia.org/wiki/Jaccard_index
30   * </p>
31   *
32   * @since 1.0
33   */
34  public class JaccardSimilarity implements SimilarityScore<Double> {
35  
36      /**
37       * Calculates Jaccard Similarity of two set character sequence passed as
38       * input.
39       * 
40       * @param left first character sequence
41       * @param right second character sequence
42       * @return index
43       * @throws IllegalArgumentException
44       *             if either String input {@code null}
45       */
46      @Override
47      public Double apply(CharSequence left, CharSequence right) {
48          if (left == null || right == null) {
49              throw new IllegalArgumentException("Input cannot be null");
50          }
51          return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
52      }
53  
54      /**
55       * Calculates Jaccard Similarity of two character sequences passed as
56       * input. Does the calculation by identifying the union (characters in at
57       * least one of the two sets) of the two sets and intersection (characters
58       * which are present in set one which are present in set two)
59       * 
60       * @param left first character sequence
61       * @param right second character sequence
62       * @return index
63       */
64      private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
65          Set<String> intersectionSet = new HashSet<String>();
66          Set<String> unionSet = new HashSet<String>();
67          boolean unionFilled = false;
68          int leftLength = left.length();
69          int rightLength = right.length();
70          if (leftLength == 0 || rightLength == 0) {
71              return 0d;
72          }
73  
74          for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
75              unionSet.add(String.valueOf(left.charAt(leftIndex)));
76              for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
77                  if (!unionFilled) {
78                      unionSet.add(String.valueOf(right.charAt(rightIndex)));
79                  }
80                  if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
81                      intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
82                  }
83              }
84              unionFilled = true;
85          }
86          return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
87      }
88  }