FuzzyScore.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.Locale;

  19. /**
  20.  * A matching algorithm that is similar to the searching algorithms implemented in editors such
  21.  * as Sublime Text, TextMate, Atom and others.
  22.  *
  23.  * <p>
  24.  * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
  25.  * indicates a higher similarity.
  26.  * </p>
  27.  *
  28.  * <p>
  29.  * This code has been adapted from Apache Commons Lang 3.3.
  30.  * </p>
  31.  *
  32.  * @since 1.0
  33.  */
  34. public class FuzzyScore {

  35.     /**
  36.      * Locale used to change the case of text.
  37.      */
  38.     private final Locale locale;

  39.     /**
  40.      * This returns a {@link Locale}-specific {@link FuzzyScore}.
  41.      *
  42.      * @param locale The string matching logic is case insensitive.
  43.                      A {@link Locale} is necessary to normalize both Strings to lower case.
  44.      * @throws IllegalArgumentException
  45.      *         This is thrown if the {@link Locale} parameter is {@code null}.
  46.      */
  47.     public FuzzyScore(final Locale locale) {
  48.         if (locale == null) {
  49.             throw new IllegalArgumentException("Locale must not be null");
  50.         }
  51.         this.locale = locale;
  52.     }

  53.     /**
  54.      * Find the Fuzzy Score which indicates the similarity score between two
  55.      * Strings.
  56.      *
  57.      * <pre>
  58.      * score.fuzzyScore(null, null)                          = IllegalArgumentException
  59.      * score.fuzzyScore("not null", null)                    = IllegalArgumentException
  60.      * score.fuzzyScore(null, "not null")                    = IllegalArgumentException
  61.      * score.fuzzyScore("", "")                              = 0
  62.      * score.fuzzyScore("Workshop", "b")                     = 0
  63.      * score.fuzzyScore("Room", "o")                         = 1
  64.      * score.fuzzyScore("Workshop", "w")                     = 1
  65.      * score.fuzzyScore("Workshop", "ws")                    = 2
  66.      * score.fuzzyScore("Workshop", "wo")                    = 4
  67.      * score.fuzzyScore("Apache Software Foundation", "asf") = 3
  68.      * </pre>
  69.      *
  70.      * @param term a full term that should be matched against, must not be null
  71.      * @param query the query that will be matched against a term, must not be
  72.      *            null
  73.      * @return result score
  74.      * @throws IllegalArgumentException if the term or query is {@code null}
  75.      */
  76.     public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
  77.         if (term == null || query == null) {
  78.             throw new IllegalArgumentException("CharSequences must not be null");
  79.         }

  80.         // fuzzy logic is case insensitive. We normalize the Strings to lower
  81.         // case right from the start. Turning characters to lower case
  82.         // via Character.toLowerCase(char) is unfortunately insufficient
  83.         // as it does not accept a locale.
  84.         final String termLowerCase = term.toString().toLowerCase(locale);
  85.         final String queryLowerCase = query.toString().toLowerCase(locale);

  86.         // the resulting score
  87.         int score = 0;

  88.         // the position in the term which will be scanned next for potential
  89.         // query character matches
  90.         int termIndex = 0;

  91.         // index of the previously matched character in the term
  92.         int previousMatchingCharacterIndex = Integer.MIN_VALUE;

  93.         for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
  94.             final char queryChar = queryLowerCase.charAt(queryIndex);

  95.             boolean termCharacterMatchFound = false;
  96.             for (; termIndex < termLowerCase.length()
  97.                     && !termCharacterMatchFound; termIndex++) {
  98.                 final char termChar = termLowerCase.charAt(termIndex);

  99.                 if (queryChar == termChar) {
  100.                     // simple character matches result in one point
  101.                     score++;

  102.                     // subsequent character matches further improve
  103.                     // the score.
  104.                     if (previousMatchingCharacterIndex + 1 == termIndex) {
  105.                         score += 2;
  106.                     }

  107.                     previousMatchingCharacterIndex = termIndex;

  108.                     // we can leave the nested loop. Every character in the
  109.                     // query can match at most one character in the term.
  110.                     termCharacterMatchFound = true;
  111.                 }
  112.             }
  113.         }

  114.         return score;
  115.     }

  116.     /**
  117.      * Gets the locale.
  118.      *
  119.      * @return The locale
  120.      */
  121.     public Locale getLocale() {
  122.         return locale;
  123.     }

  124. }