001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019import java.util.Locale; 020 021/** 022 * A matching algorithm that is similar to the searching algorithms implemented in editors such 023 * as Sublime Text, TextMate, Atom and others. 024 * 025 * <p> 026 * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score 027 * indicates a higher similarity. 028 * </p> 029 * 030 * <p> 031 * This code has been adapted from Apache Commons Lang 3.3. 032 * </p> 033 * 034 * @since 1.0 035 */ 036public class FuzzyScore { 037 038 /** 039 * Locale used to change the case of text. 040 */ 041 private final Locale locale; 042 043 044 /** 045 * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p> 046 * 047 * @param locale The string matching logic is case insensitive. 048 A {@link Locale} is necessary to normalize both Strings to lower case. 049 * @throws IllegalArgumentException 050 * This is thrown if the {@link Locale} parameter is {@code null}. 051 */ 052 public FuzzyScore(final Locale locale) { 053 if (locale == null) { 054 throw new IllegalArgumentException("Locale must not be null"); 055 } 056 this.locale = locale; 057 } 058 059 /** 060 * <p> 061 * Find the Fuzzy Score which indicates the similarity score between two 062 * Strings. 063 * </p> 064 * 065 * <pre> 066 * score.fuzzyScore(null, null) = IllegalArgumentException 067 * score.fuzzyScore("not null", null) = IllegalArgumentException 068 * score.fuzzyScore(null, "not null") = IllegalArgumentException 069 * score.fuzzyScore("", "") = 0 070 * score.fuzzyScore("Workshop", "b") = 0 071 * score.fuzzyScore("Room", "o") = 1 072 * score.fuzzyScore("Workshop", "w") = 1 073 * score.fuzzyScore("Workshop", "ws") = 2 074 * score.fuzzyScore("Workshop", "wo") = 4 075 * score.fuzzyScore("Apache Software Foundation", "asf") = 3 076 * </pre> 077 * 078 * @param term a full term that should be matched against, must not be null 079 * @param query the query that will be matched against a term, must not be 080 * null 081 * @return result score 082 * @throws IllegalArgumentException if the term or query is {@code null} 083 */ 084 public Integer fuzzyScore(final CharSequence term, final CharSequence query) { 085 if (term == null || query == null) { 086 throw new IllegalArgumentException("CharSequences must not be null"); 087 } 088 089 // fuzzy logic is case insensitive. We normalize the Strings to lower 090 // case right from the start. Turning characters to lower case 091 // via Character.toLowerCase(char) is unfortunately insufficient 092 // as it does not accept a locale. 093 final String termLowerCase = term.toString().toLowerCase(locale); 094 final String queryLowerCase = query.toString().toLowerCase(locale); 095 096 // the resulting score 097 int score = 0; 098 099 // the position in the term which will be scanned next for potential 100 // query character matches 101 int termIndex = 0; 102 103 // index of the previously matched character in the term 104 int previousMatchingCharacterIndex = Integer.MIN_VALUE; 105 106 for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) { 107 final char queryChar = queryLowerCase.charAt(queryIndex); 108 109 boolean termCharacterMatchFound = false; 110 for (; termIndex < termLowerCase.length() 111 && !termCharacterMatchFound; termIndex++) { 112 final char termChar = termLowerCase.charAt(termIndex); 113 114 if (queryChar == termChar) { 115 // simple character matches result in one point 116 score++; 117 118 // subsequent character matches further improve 119 // the score. 120 if (previousMatchingCharacterIndex + 1 == termIndex) { 121 score += 2; 122 } 123 124 previousMatchingCharacterIndex = termIndex; 125 126 // we can leave the nested loop. Every character in the 127 // query can match at most one character in the term. 128 termCharacterMatchFound = true; 129 } 130 } 131 } 132 133 return score; 134 } 135 136 /** 137 * Gets the locale. 138 * 139 * @return The locale 140 */ 141 public Locale getLocale() { 142 return locale; 143 } 144 145}