1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text.similarity;
18
19 import java.util.Locale;
20
21 /**
22 * A matching algorithm that is similar to the searching algorithms implemented in editors such
23 * as Sublime Text, TextMate, Atom and others.
24 *
25 * <p>
26 * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
27 * indicates a higher similarity.
28 * </p>
29 *
30 * <p>
31 * This code has been adapted from Apache Commons Lang 3.3.
32 * </p>
33 *
34 * @since 1.0
35 */
36 public class FuzzyScore {
37
38 /**
39 * Locale used to change the case of text.
40 */
41 private final Locale locale;
42
43
44 /**
45 * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
46 *
47 * @param locale The string matching logic is case insensitive.
48 A {@link Locale} is necessary to normalize both Strings to lower case.
49 * @throws IllegalArgumentException
50 * This is thrown if the {@link Locale} parameter is {@code null}.
51 */
52 public FuzzyScore(final Locale locale) {
53 if (locale == null) {
54 throw new IllegalArgumentException("Locale must not be null");
55 }
56 this.locale = locale;
57 }
58
59 /**
60 * <p>
61 * Find the Fuzzy Score which indicates the similarity score between two
62 * Strings.
63 * </p>
64 *
65 * <pre>
66 * score.fuzzyScore(null, null, null) = IllegalArgumentException
67 * score.fuzzyScore("", "", Locale.ENGLISH) = 0
68 * score.fuzzyScore("Workshop", "b", Locale.ENGLISH) = 0
69 * score.fuzzyScore("Room", "o", Locale.ENGLISH) = 1
70 * score.fuzzyScore("Workshop", "w", Locale.ENGLISH) = 1
71 * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH) = 2
72 * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH) = 4
73 * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
74 * </pre>
75 *
76 * @param term a full term that should be matched against, must not be null
77 * @param query the query that will be matched against a term, must not be
78 * null
79 * @return result score
80 * @throws IllegalArgumentException if either String input {@code null} or
81 * Locale input {@code null}
82 */
83 public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
84 if (term == null || query == null) {
85 throw new IllegalArgumentException("Strings must not be null");
86 }
87
88 // fuzzy logic is case insensitive. We normalize the Strings to lower
89 // case right from the start. Turning characters to lower case
90 // via Character.toLowerCase(char) is unfortunately insufficient
91 // as it does not accept a locale.
92 final String termLowerCase = term.toString().toLowerCase(locale);
93 final String queryLowerCase = query.toString().toLowerCase(locale);
94
95 // the resulting score
96 int score = 0;
97
98 // the position in the term which will be scanned next for potential
99 // query character matches
100 int termIndex = 0;
101
102 // index of the previously matched character in the term
103 int previousMatchingCharacterIndex = Integer.MIN_VALUE;
104
105 for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
106 final char queryChar = queryLowerCase.charAt(queryIndex);
107
108 boolean termCharacterMatchFound = false;
109 for (; termIndex < termLowerCase.length()
110 && !termCharacterMatchFound; termIndex++) {
111 final char termChar = termLowerCase.charAt(termIndex);
112
113 if (queryChar == termChar) {
114 // simple character matches result in one point
115 score++;
116
117 // subsequent character matches further improve
118 // the score.
119 if (previousMatchingCharacterIndex + 1 == termIndex) {
120 score += 2;
121 }
122
123 previousMatchingCharacterIndex = termIndex;
124
125 // we can leave the nested loop. Every character in the
126 // query can match at most one character in the term.
127 termCharacterMatchFound = true;
128 }
129 }
130 }
131
132 return score;
133 }
134
135 /**
136 * Gets the locale.
137 *
138 * @return the locale
139 */
140 public Locale getLocale() {
141 return locale;
142 }
143
144 }