001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.Locale;
020
021/**
022 * A matching algorithm that is similar to the searching algorithms implemented in editors such
023 * as Sublime Text, TextMate, Atom and others.
024 *
025 * <p>
026 * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
027 * indicates a higher similarity.
028 * </p>
029 *
030 * <p>
031 * This code has been adapted from Apache Commons Lang 3.3.
032 * </p>
033 *
034 * @since 1.0
035 */
036public class FuzzyScore {
037
038    /**
039     * Locale used to change the case of text.
040     */
041    private final Locale locale;
042
043
044    /**
045     * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p>
046     *
047     * @param locale The string matching logic is case insensitive.
048                     A {@link Locale} is necessary to normalize both Strings to lower case.
049     * @throws IllegalArgumentException
050     *         This is thrown if the {@link Locale} parameter is {@code null}.
051     */
052    public FuzzyScore(final Locale locale) {
053        if (locale == null) {
054            throw new IllegalArgumentException("Locale must not be null");
055        }
056        this.locale = locale;
057    }
058
059    /**
060     * <p>
061     * Find the Fuzzy Score which indicates the similarity score between two
062     * Strings.
063     * </p>
064     *
065     * <pre>
066     * score.fuzzyScore(null, null, null)                                    = IllegalArgumentException
067     * score.fuzzyScore("", "", Locale.ENGLISH)                              = 0
068     * score.fuzzyScore("Workshop", "b", Locale.ENGLISH)                     = 0
069     * score.fuzzyScore("Room", "o", Locale.ENGLISH)                         = 1
070     * score.fuzzyScore("Workshop", "w", Locale.ENGLISH)                     = 1
071     * score.fuzzyScore("Workshop", "ws", Locale.ENGLISH)                    = 2
072     * score.fuzzyScore("Workshop", "wo", Locale.ENGLISH)                    = 4
073     * score.fuzzyScore("Apache Software Foundation", "asf", Locale.ENGLISH) = 3
074     * </pre>
075     *
076     * @param term a full term that should be matched against, must not be null
077     * @param query the query that will be matched against a term, must not be
078     *            null
079     * @return result score
080     * @throws IllegalArgumentException if either String input {@code null} or
081     *             Locale input {@code null}
082     */
083    public Integer fuzzyScore(final CharSequence term, final CharSequence query) {
084        if (term == null || query == null) {
085            throw new IllegalArgumentException("Strings must not be null");
086        }
087
088        // fuzzy logic is case insensitive. We normalize the Strings to lower
089        // case right from the start. Turning characters to lower case
090        // via Character.toLowerCase(char) is unfortunately insufficient
091        // as it does not accept a locale.
092        final String termLowerCase = term.toString().toLowerCase(locale);
093        final String queryLowerCase = query.toString().toLowerCase(locale);
094
095        // the resulting score
096        int score = 0;
097
098        // the position in the term which will be scanned next for potential
099        // query character matches
100        int termIndex = 0;
101
102        // index of the previously matched character in the term
103        int previousMatchingCharacterIndex = Integer.MIN_VALUE;
104
105        for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) {
106            final char queryChar = queryLowerCase.charAt(queryIndex);
107
108            boolean termCharacterMatchFound = false;
109            for (; termIndex < termLowerCase.length()
110                    && !termCharacterMatchFound; termIndex++) {
111                final char termChar = termLowerCase.charAt(termIndex);
112
113                if (queryChar == termChar) {
114                    // simple character matches result in one point
115                    score++;
116
117                    // subsequent character matches further improve
118                    // the score.
119                    if (previousMatchingCharacterIndex + 1 == termIndex) {
120                        score += 2;
121                    }
122
123                    previousMatchingCharacterIndex = termIndex;
124
125                    // we can leave the nested loop. Every character in the
126                    // query can match at most one character in the term.
127                    termCharacterMatchFound = true;
128                }
129            }
130        }
131
132        return score;
133    }
134
135    /**
136     * Gets the locale.
137     *
138     * @return the locale
139     */
140    public Locale getLocale() {
141        return locale;
142    }
143
144}