1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.text.similarity; 18 19 import java.util.Map; 20 21 /** 22 * Measures the cosine distance between two character sequences. 23 * 24 * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences 25 * are converted into vectors through a simple tokenizer that works with a regular expression 26 * to split words in a sentence.</p> 27 * 28 * <p> 29 * For further explanation about Cosine Similarity and Cosine Distance, refer to 30 * https://en.wikipedia.org/wiki/Cosine_similarity. 31 * </p> 32 * 33 * @since 1.0 34 * @see CosineSimilarity 35 */ 36 public class CosineDistance implements EditDistance<Double> { 37 38 @Override 39 public Double apply(final CharSequence left, final CharSequence right) { 40 final CharSequence[] leftTokens = RegexTokenizer.INSTANCE.apply(left); 41 final CharSequence[] rightTokens = RegexTokenizer.INSTANCE.apply(right); 42 43 final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens); 44 final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens); 45 final double similarity = CosineSimilarity.INSTANCE.cosineSimilarity(leftVector, rightVector); 46 return 1.0 - similarity; 47 } 48 49 }