View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Locale;
20  
21  /**
22   * A matching algorithm that is similar to the searching algorithms implemented in editors such
23   * as Sublime Text, TextMate, Atom and others.
24   *
25   * <p>
26   * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
27   * indicates a higher similarity.
28   * </p>
29   *
30   * <p>
31   * This code has been adapted from Apache Commons Lang 3.3.
32   * </p>
33   *
34   * @since 1.0
35   */
36  public class FuzzyScore {
37  
38      /**
39       * Locale used to change the case of text.
40       */
41      private final Locale locale;
42  
43  
44      /**
45       * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
46       *
47       * @param locale The string matching logic is case insensitive.
48                       A {@link Locale} is necessary to normalize both Strings to lower case.
49       * @throws IllegalArgumentException
50       *         This is thrown if the {@link Locale} parameter is {@code null}.
51       */
52      public FuzzyScore(final Locale locale) {
53          if (locale == null) {
54              throw new IllegalArgumentException("Locale must not be null");
55          }
56          this.locale = locale;
57      }
58  
59      /**
60       * <p>
61       * Find the Fuzzy Score which indicates the similarity score between two
62       * Strings.
63       * </p>
64       *
65       * <pre>
66       * score.fuzzyScore(null, null, null)                                    = IllegalArgumentException
67       * score.fuzzyScore("", "", Locale.ENGLISH)                              = 0
68       * score.fuzzyScore("Workshop", "b", Locale.ENGLISH)                     = 0
69       * score.fuzzyScore("Room", "o", Locale.ENGLISH)                         = 1
70       * score.fuzzyScore("Workshop", "w", Locale.ENGLISH)                     = 1
71       * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH)                    = 2
72       * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH)                    = 4
73       * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
74       * </pre>
75       *
76       * @param term a full term that should be matched against, must not be null
77       * @param query the query that will be matched against a term, must not be
78       *            null
79       * @return result score
80       * @throws IllegalArgumentException if either String input {@code null} or
81       *             Locale input {@code null}
82       */
83      public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
84          if (term == null || query == null) {
85              throw new IllegalArgumentException("Strings must not be null");
86          }
87  
88          // fuzzy logic is case insensitive. We normalize the Strings to lower
89          // case right from the start. Turning characters to lower case
90          // via Character.toLowerCase(char) is unfortunately insufficient
91          // as it does not accept a locale.
92          final String termLowerCase = term.toString().toLowerCase(locale);
93          final String queryLowerCase = query.toString().toLowerCase(locale);
94  
95          // the resulting score
96          int score = 0;
97  
98          // the position in the term which will be scanned next for potential
99          // query character matches
100         int termIndex = 0;
101 
102         // index of the previously matched character in the term
103         int previousMatchingCharacterIndex = Integer.MIN_VALUE;
104 
105         for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
106             final char queryChar = queryLowerCase.charAt(queryIndex);
107 
108             boolean termCharacterMatchFound = false;
109             for (; termIndex < termLowerCase.length()
110                     && !termCharacterMatchFound; termIndex++) {
111                 final char termChar = termLowerCase.charAt(termIndex);
112 
113                 if (queryChar == termChar) {
114                     // simple character matches result in one point
115                     score++;
116 
117                     // subsequent character matches further improve
118                     // the score.
119                     if (previousMatchingCharacterIndex + 1 == termIndex) {
120                         score += 2;
121                     }
122 
123                     previousMatchingCharacterIndex = termIndex;
124 
125                     // we can leave the nested loop. Every character in the
126                     // query can match at most one character in the term.
127                     termCharacterMatchFound = true;
128                 }
129             }
130         }
131 
132         return score;
133     }
134 
135     /**
136      * Gets the locale.
137      *
138      * @return the locale
139      */
140     public Locale getLocale() {
141         return locale;
142     }
143 
144 }