1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.text.similarity; 18 19 import java.util.Locale; 20 21 /** 22 * A matching algorithm that is similar to the searching algorithms implemented in editors such 23 * as Sublime Text, TextMate, Atom and others. 24 * 25 * <p> 26 * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score 27 * indicates a higher similarity. 28 * </p> 29 * 30 * <p> 31 * This code has been adapted from Apache Commons Lang 3.3. 32 * </p> 33 * 34 * @since 1.0 35 */ 36 public class FuzzyScore { 37 38 /** 39 * Locale used to change the case of text. 40 */ 41 private final Locale locale; 42 43 44 /** 45 * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p> 46 * 47 * @param locale The string matching logic is case insensitive. 48 A {@link Locale} is necessary to normalize both Strings to lower case. 49 * @throws IllegalArgumentException 50 * This is thrown if the {@link Locale} parameter is {@code null}. 51 */ 52 public FuzzyScore(final Locale locale) { 53 if (locale == null) { 54 throw new IllegalArgumentException("Locale must not be null"); 55 } 56 this.locale = locale; 57 } 58 59 /** 60 * <p> 61 * Find the Fuzzy Score which indicates the similarity score between two 62 * Strings. 63 * </p> 64 * 65 * <pre> 66 * score.fuzzyScore(null, null, null) = IllegalArgumentException 67 * score.fuzzyScore("", "", Locale.ENGLISH) = 0 68 * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0 69 * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1 70 * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1 71 * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2 72 * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4 73 * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3 74 * </pre> 75 * 76 * @param term a full term that should be matched against, must not be null 77 * @param query the query that will be matched against a term, must not be 78 * null 79 * @return result score 80 * @throws IllegalArgumentException if either String input {@code null} or 81 * Locale input {@code null} 82 */ 83 public Integer fuzzyScore(final CharSequence term, final CharSequence query) { 84 if (term == null || query == null) { 85 throw new IllegalArgumentException("Strings must not be null"); 86 } 87 88 // fuzzy logic is case insensitive. We normalize the Strings to lower 89 // case right from the start. Turning characters to lower case 90 // via Character.toLowerCase(char) is unfortunately insufficient 91 // as it does not accept a locale. 92 final String termLowerCase = term.toString().toLowerCase(locale); 93 final String queryLowerCase = query.toString().toLowerCase(locale); 94 95 // the resulting score 96 int score = 0; 97 98 // the position in the term which will be scanned next for potential 99 // query character matches 100 int termIndex = 0; 101 102 // index of the previously matched character in the term 103 int previousMatchingCharacterIndex = Integer.MIN_VALUE; 104 105 for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) { 106 final char queryChar = queryLowerCase.charAt(queryIndex); 107 108 boolean termCharacterMatchFound = false; 109 for (; termIndex < termLowerCase.length() 110 && !termCharacterMatchFound; termIndex++) { 111 final char termChar = termLowerCase.charAt(termIndex); 112 113 if (queryChar == termChar) { 114 // simple character matches result in one point 115 score++; 116 117 // subsequent character matches further improve 118 // the score. 119 if (previousMatchingCharacterIndex + 1 == termIndex) { 120 score += 2; 121 } 122 123 previousMatchingCharacterIndex = termIndex; 124 125 // we can leave the nested loop. Every character in the 126 // query can match at most one character in the term. 127 termCharacterMatchFound = true; 128 } 129 } 130 } 131 132 return score; 133 } 134 135 /** 136 * Gets the locale. 137 * 138 * @return the locale 139 */ 140 public Locale getLocale() { 141 return locale; 142 } 143 144 }