FuzzyScore.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text.similarity;

  18. import java.util.Locale;

  19. /**
  20.  * A matching algorithm that is similar to the searching algorithms implemented in editors such
  21.  * as Sublime Text, TextMate, Atom and others.
  22.  *
  23.  * <p>
  24.  * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
  25.  * indicates a higher similarity.
  26.  * </p>
  27.  *
  28.  * <p>
  29.  * This code has been adapted from Apache Commons Lang 3.3.
  30.  * </p>
  31.  *
  32.  * @since 1.0
  33.  */
  34. public class FuzzyScore {

  35.     /**
  36.      * Locale used to change the case of text.
  37.      */
  38.     private final Locale locale;


  39.     /**
  40.      * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
  41.      *
  42.      * @param locale The string matching logic is case insensitive.
  43.                      A {@link Locale} is necessary to normalize both Strings to lower case.
  44.      * @throws IllegalArgumentException
  45.      *         This is thrown if the {@link Locale} parameter is {@code null}.
  46.      */
  47.     public FuzzyScore(final Locale locale) {
  48.         if (locale == null) {
  49.             throw new IllegalArgumentException("Locale must not be null");
  50.         }
  51.         this.locale = locale;
  52.     }

  53.     /**
  54.      * <p>
  55.      * Find the Fuzzy Score which indicates the similarity score between two
  56.      * Strings.
  57.      * </p>
  58.      *
  59.      * <pre>
  60.      * score.fuzzyScore(null, null, null)                                    = IllegalArgumentException
  61.      * score.fuzzyScore("", "", Locale.ENGLISH)                              = 0
  62.      * score.fuzzyScore("Workshop", "b", Locale.ENGLISH)                     = 0
  63.      * score.fuzzyScore("Room", "o", Locale.ENGLISH)                         = 1
  64.      * score.fuzzyScore("Workshop", "w", Locale.ENGLISH)                     = 1
  65.      * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH)                    = 2
  66.      * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH)                    = 4
  67.      * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
  68.      * </pre>
  69.      *
  70.      * @param term a full term that should be matched against, must not be null
  71.      * @param query the query that will be matched against a term, must not be
  72.      *            null
  73.      * @return result score
  74.      * @throws IllegalArgumentException if either String input {@code null} or
  75.      *             Locale input {@code null}
  76.      */
  77.     public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
  78.         if (term == null || query == null) {
  79.             throw new IllegalArgumentException("Strings must not be null");
  80.         }

  81.         // fuzzy logic is case insensitive. We normalize the Strings to lower
  82.         // case right from the start. Turning characters to lower case
  83.         // via Character.toLowerCase(char) is unfortunately insufficient
  84.         // as it does not accept a locale.
  85.         final String termLowerCase = term.toString().toLowerCase(locale);
  86.         final String queryLowerCase = query.toString().toLowerCase(locale);

  87.         // the resulting score
  88.         int score = 0;

  89.         // the position in the term which will be scanned next for potential
  90.         // query character matches
  91.         int termIndex = 0;

  92.         // index of the previously matched character in the term
  93.         int previousMatchingCharacterIndex = Integer.MIN_VALUE;

  94.         for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
  95.             final char queryChar = queryLowerCase.charAt(queryIndex);

  96.             boolean termCharacterMatchFound = false;
  97.             for (; termIndex < termLowerCase.length()
  98.                     && !termCharacterMatchFound; termIndex++) {
  99.                 final char termChar = termLowerCase.charAt(termIndex);

  100.                 if (queryChar == termChar) {
  101.                     // simple character matches result in one point
  102.                     score++;

  103.                     // subsequent character matches further improve
  104.                     // the score.
  105.                     if (previousMatchingCharacterIndex + 1 == termIndex) {
  106.                         score += 2;
  107.                     }

  108.                     previousMatchingCharacterIndex = termIndex;

  109.                     // we can leave the nested loop. Every character in the
  110.                     // query can match at most one character in the term.
  111.                     termCharacterMatchFound = true;
  112.                 }
  113.             }
  114.         }

  115.         return score;
  116.     }

  117.     /**
  118.      * Gets the locale.
  119.      *
  120.      * @return the locale
  121.      */
  122.     public Locale getLocale() {
  123.         return locale;
  124.     }

  125. }