001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019import java.util.HashSet; 020import java.util.Set; 021 022/** 023 * Measures the Jaccard similarity (aka Jaccard index) of two sets of character 024 * sequence. Jaccard similarity is the size of the intersection divided by the 025 * size of the union of the two sets. 026 * 027 * <p> 028 * For further explanation about Jaccard Similarity, refer 029 * https://en.wikipedia.org/wiki/Jaccard_index 030 * </p> 031 * 032 * @since 1.0 033 */ 034public class JaccardSimilarity implements SimilarityScore<Double> { 035 036 /** 037 * Calculates Jaccard Similarity of two set character sequence passed as 038 * input. 039 * 040 * @param left first character sequence 041 * @param right second character sequence 042 * @return index 043 * @throws IllegalArgumentException 044 * if either String input {@code null} 045 */ 046 @Override 047 public Double apply(CharSequence left, CharSequence right) { 048 if (left == null || right == null) { 049 throw new IllegalArgumentException("Input cannot be null"); 050 } 051 return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d; 052 } 053 054 /** 055 * Calculates Jaccard Similarity of two character sequences passed as 056 * input. Does the calculation by identifying the union (characters in at 057 * least one of the two sets) of the two sets and intersection (characters 058 * which are present in set one which are present in set two) 059 * 060 * @param left first character sequence 061 * @param right second character sequence 062 * @return index 063 */ 064 private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) { 065 Set<String> intersectionSet = new HashSet<String>(); 066 Set<String> unionSet = new HashSet<String>(); 067 boolean unionFilled = false; 068 int leftLength = left.length(); 069 int rightLength = right.length(); 070 if (leftLength == 0 || rightLength == 0) { 071 return 0d; 072 } 073 074 for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) { 075 unionSet.add(String.valueOf(left.charAt(leftIndex))); 076 for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) { 077 if (!unionFilled) { 078 unionSet.add(String.valueOf(right.charAt(rightIndex))); 079 } 080 if (left.charAt(leftIndex) == right.charAt(rightIndex)) { 081 intersectionSet.add(String.valueOf(left.charAt(leftIndex))); 082 } 083 } 084 unionFilled = true; 085 } 086 return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size()); 087 } 088}