/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

import java.util.Map;

/**
 * Measures the cosine distance between two character sequences.
 *
 * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences
 * are converted into vectors through a simple tokenizer that works with a regular expression
 * to split words in a sentence.</p>
 *
 * <p>
 * For further explanation about Cosine Similarity and Cosine Distance, refer to
 * http://en.wikipedia.org/wiki/Cosine_similarity.
 * </p>
 *
 * @since 1.0
 * @see CosineSimilarity
 */
public class CosineDistance implements EditDistance<Double> {

    /**
     * Tokenizer used to convert the character sequence into a vector.
     */
    private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();

    /**
     * Cosine similarity.
     */
    private final CosineSimilarity cosineSimilarity = new CosineSimilarity();

    /**
     * Computes the cosine distance between two character sequences.
     *
     * <p>Both inputs are tokenized, each token array is turned into a token-to-count map
     * via {@code Counter.of}, and the two maps are passed to
     * {@code CosineSimilarity#cosineSimilarity}. The result is {@code 1.0 - similarity}.</p>
     *
     * <p>NOTE(review): handling of {@code null} or blank input is decided by the tokenizer
     * and is not visible from this class - confirm against {@code RegexTokenizer}.</p>
     *
     * @param left the first character sequence
     * @param right the second character sequence
     * @return one minus the cosine similarity of the two token-count vectors
     *         (presumably within {@code [0, 1]} for non-negative counts - confirm against
     *         {@code CosineSimilarity})
     */
    @Override
    public Double apply(final CharSequence left, final CharSequence right) {
        final CharSequence[] leftTokens = tokenizer.tokenize(left);
        final CharSequence[] rightTokens = tokenizer.tokenize(right);

        final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
        final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
        final double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
        return 1.0 - similarity;
    }

}