001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.HashSet;
020import java.util.Set;
021
022/**
023 * Measures the Jaccard similarity (aka Jaccard index) of two sets of character
024 * sequence. Jaccard similarity is the size of the intersection divided by the
025 * size of the union of the two sets.
026 *
027 * <p>
028 * For further explanation about Jaccard Similarity, refer
029 * https://en.wikipedia.org/wiki/Jaccard_index
030 * </p>
031 *
032 * @since 1.0
033 */
034public class JaccardSimilarity implements SimilarityScore<Double> {
035
036    /**
037     * Calculates Jaccard Similarity of two set character sequence passed as
038     * input.
039     * 
040     * @param left first character sequence
041     * @param right second character sequence
042     * @return index
043     * @throws IllegalArgumentException
044     *             if either String input {@code null}
045     */
046    @Override
047    public Double apply(CharSequence left, CharSequence right) {
048        if (left == null || right == null) {
049            throw new IllegalArgumentException("Input cannot be null");
050        }
051        return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
052    }
053
054    /**
055     * Calculates Jaccard Similarity of two character sequences passed as
056     * input. Does the calculation by identifying the union (characters in at
057     * least one of the two sets) of the two sets and intersection (characters
058     * which are present in set one which are present in set two)
059     * 
060     * @param left first character sequence
061     * @param right second character sequence
062     * @return index
063     */
064    private Double calculateJaccardSimilarity(CharSequence left, CharSequence right) {
065        Set<String> intersectionSet = new HashSet<String>();
066        Set<String> unionSet = new HashSet<String>();
067        boolean unionFilled = false;
068        int leftLength = left.length();
069        int rightLength = right.length();
070        if (leftLength == 0 || rightLength == 0) {
071            return 0d;
072        }
073
074        for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
075            unionSet.add(String.valueOf(left.charAt(leftIndex)));
076            for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
077                if (!unionFilled) {
078                    unionSet.add(String.valueOf(right.charAt(rightIndex)));
079                }
080                if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
081                    intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
082                }
083            }
084            unionFilled = true;
085        }
086        return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
087    }
088}