FuzzyScore.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.commons.text.similarity;
- import java.util.Locale;
- /**
- * A matching algorithm that is similar to the searching algorithms implemented in editors such
- * as Sublime Text, TextMate, Atom and others.
- *
- * <p>
- * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
- * indicates a higher similarity.
- * </p>
- *
- * <p>
- * This code has been adapted from Apache Commons Lang 3.3.
- * </p>
- *
- * @since 1.0
- */
- public class FuzzyScore {
- /**
- * Locale used to change the case of text.
- */
- private final Locale locale;
- /**
- * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
- *
- * @param locale The string matching logic is case insensitive.
- A {@link Locale} is necessary to normalize both Strings to lower case.
- * @throws IllegalArgumentException
- * This is thrown if the {@link Locale} parameter is {@code null}.
- */
- public FuzzyScore(final Locale locale) {
- if (locale == null) {
- throw new IllegalArgumentException("Locale must not be null");
- }
- this.locale = locale;
- }
- /**
- * <p>
- * Find the Fuzzy Score which indicates the similarity score between two
- * Strings.
- * </p>
- *
- * <pre>
- * score.fuzzyScore(null, null, null) = IllegalArgumentException
- * score.fuzzyScore("", "", Locale.ENGLISH) = 0
- * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0
- * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1
- * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1
- * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2
- * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4
- * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
- * </pre>
- *
- * @param term a full term that should be matched against, must not be null
- * @param query the query that will be matched against a term, must not be
- * null
- * @return result score
- * @throws IllegalArgumentException if either String input {@code null} or
- * Locale input {@code null}
- */
- public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
- if (term == null || query == null) {
- throw new IllegalArgumentException("Strings must not be null");
- }
- // fuzzy logic is case insensitive. We normalize the Strings to lower
- // case right from the start. Turning characters to lower case
- // via Character.toLowerCase(char) is unfortunately insufficient
- // as it does not accept a locale.
- final String termLowerCase = term.toString().toLowerCase(locale);
- final String queryLowerCase = query.toString().toLowerCase(locale);
- // the resulting score
- int score = 0;
- // the position in the term which will be scanned next for potential
- // query character matches
- int termIndex = 0;
- // index of the previously matched character in the term
- int previousMatchingCharacterIndex = Integer.MIN_VALUE;
- for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
- final char queryChar = queryLowerCase.charAt(queryIndex);
- boolean termCharacterMatchFound = false;
- for (; termIndex < termLowerCase.length()
- && !termCharacterMatchFound; termIndex++) {
- final char termChar = termLowerCase.charAt(termIndex);
- if (queryChar == termChar) {
- // simple character matches result in one point
- score++;
- // subsequent character matches further improve
- // the score.
- if (previousMatchingCharacterIndex + 1 == termIndex) {
- score += 2;
- }
- previousMatchingCharacterIndex = termIndex;
- // we can leave the nested loop. Every character in the
- // query can match at most one character in the term.
- termCharacterMatchFound = true;
- }
- }
- }
- return score;
- }
- /**
- * Gets the locale.
- *
- * @return the locale
- */
- public Locale getLocale() {
- return locale;
- }
- }