/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

import java.util.Map;

/**
 * Computes the cosine distance between two character sequences.
 *
 * <p>The distance is derived from {@link CosineSimilarity}: each character sequence is
 * first split into tokens by a simple regular-expression based tokenizer, the tokens are
 * counted to form a vector, and the resulting similarity is subtracted from {@code 1.0}.</p>
 *
 * <p>
 * Background on Cosine Similarity and Cosine Distance can be found at
 * https://en.wikipedia.org/wiki/Cosine_similarity.
 * </p>
 *
 * @since 1.0
 * @see CosineSimilarity
 */
public class CosineDistance implements EditDistance<Double> {

    /**
     * Creates a new instance.
     */
    public CosineDistance() {
        // empty
    }

    /**
     * Calculates the cosine distance between the two given character sequences.
     *
     * <p>Both inputs are tokenized with {@link RegexTokenizer#INSTANCE}, the tokens are
     * turned into occurrence-count vectors via {@code Counter.of}, and the result is
     * {@code 1.0} minus the value reported by {@link CosineSimilarity#INSTANCE} for
     * those two vectors.</p>
     *
     * @param left the first character sequence
     * @param right the second character sequence
     * @return {@code 1.0} minus the cosine similarity of the two token-count vectors
     */
    @Override
    public Double apply(final CharSequence left, final CharSequence right) {
        // Tokenize both inputs first, left before right, so any tokenizer failure
        // surfaces before any counting takes place.
        final CharSequence[] tokensOfLeft = RegexTokenizer.INSTANCE.apply(left);
        final CharSequence[] tokensOfRight = RegexTokenizer.INSTANCE.apply(right);

        // Token -> number of occurrences, one vector per input.
        final Map<CharSequence, Integer> countsOfLeft = Counter.of(tokensOfLeft);
        final Map<CharSequence, Integer> countsOfRight = Counter.of(tokensOfRight);

        // Distance is the complement of the similarity.
        final double distance =
                1.0 - CosineSimilarity.INSTANCE.cosineSimilarity(countsOfLeft, countsOfRight);
        return distance;
    }

}