CosineDistance.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.Map;

  19. /**
  20.  * Measures the cosine distance between two character sequences.
  21.  *
  22.  * <p>It utilizes the {@link CosineSimilarity} to compute the distance. Character sequences
  23.  * are converted into vectors through a simple tokenizer that works with a regular expression
  24.  * to split words in a sentence.</p>
  25.  *
  26.  * <p>
  27.  * For further explanation about Cosine Similarity and Cosine Distance, refer to
  28.  * http://en.wikipedia.org/wiki/Cosine_similarity.
  29.  * </p>
  30.  *
  31.  * @since 1.0
  32.  * @see CosineSimilarity
  33.  */
  34. public class CosineDistance implements EditDistance<Double> {
  35.     /**
  36.      * Tokenizer used to convert the character sequence into a vector.
  37.      */
  38.     private final Tokenizer<CharSequence> tokenizer = new RegexTokenizer();
  39.     /**
  40.      * Cosine similarity.
  41.      */
  42.     private final CosineSimilarity cosineSimilarity = new CosineSimilarity();

  43.     @Override
  44.     public Double apply(final CharSequence left, final CharSequence right) {
  45.         final CharSequence[] leftTokens = tokenizer.tokenize(left);
  46.         final CharSequence[] rightTokens = tokenizer.tokenize(right);

  47.         final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens);
  48.         final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens);
  49.         final double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector);
  50.         return 1.0 - similarity;
  51.     }

  52. }