001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.codec.Resources;
033
034/**
035 * Language guessing utility.
036 * <p>
037 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
038 * done by reference to a whole series of rules distributed in resource files.
039 * </p>
040 * <p>
041 * Instances of this class are typically managed through the static factory method instance().
042 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
043 * </p>
044 * <p>
045 * This class is intended to be immutable and thread-safe.
046 * </p>
047 * <h2>Lang resources</h2>
048 * <p>
049 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
050 * They are systematically named following the pattern:
051 * </p>
 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_lang.txt</blockquote>
053 * <p>
054 * The format of these resources is the following:
055 * </p>
056 * <ul>
057 * <li><strong>Rules:</strong> whitespace separated strings.
058 * There should be 3 columns to each row, and these will be interpreted as:
059 * <ol>
060 * <li>pattern: a regular expression.</li>
061 * <li>languages: a '+'-separated list of languages.</li>
062 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
063 * </ol>
064 * </li>
065 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
066 * discarded as a comment.</li>
067 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
068 * This will skip all content until a line ending in '*' and '/' is found.</li>
069 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
070 * </ul>
071 * <p>
072 * Port of lang.php
073 * </p>
074 *
075 * @since 1.6
076 */
077public class Lang {
078    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
079    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
080    // encapsulate a particular language-guessing rule table and the language guessing itself.
081    //
082    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
083    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
084    // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.
085
086    private static final class LangRule {
087        private final boolean acceptOnMatch;
088        private final Set<String> languages;
089        private final Pattern pattern;
090
091        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
092            this.pattern = pattern;
093            this.languages = languages;
094            this.acceptOnMatch = acceptOnMatch;
095        }
096
097        public boolean matches(final String txt) {
098            return pattern.matcher(txt).find();
099        }
100    }
101
102    private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);
103
104    private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";
105
106    private static final Pattern PLUS = Pattern.compile("\\+");
107
108    static {
109        for (final NameType s : NameType.values()) {
110            LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
111        }
112    }
113
114    /**
115     * Gets a Lang instance for one of the supported NameTypes.
116     *
117     * @param nameType
118     *            the NameType to look up
119     * @return a Lang encapsulating the language guessing rules for that name type
120     */
121    public static Lang instance(final NameType nameType) {
122        return LANGS.get(nameType);
123    }
124
125    /**
126     * Loads language rules from a resource.
127     * <p>
128     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
129     * You will only need to call this yourself if you are developing custom language mapping rules.
130     * </p>
131     *
132     * @param languageRulesResourceName
133     *            the fully-qualified resource name to load
134     * @param languages
135     *            the languages that these rules will support
136     * @return a Lang encapsulating the loaded language-guessing rules.
137     */
138    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
139        final List<LangRule> rules = new ArrayList<>();
140        try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
141                ResourceConstants.ENCODING)) {
142            boolean inExtendedComment = false;
143            while (scanner.hasNextLine()) {
144                final String rawLine = scanner.nextLine();
145                String line = rawLine;
146                if (inExtendedComment) {
147                    // check for closing comment marker, otherwise discard doc comment line
148                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
149                        inExtendedComment = false;
150                    }
151                } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
152                    inExtendedComment = true;
153                } else {
154                    // discard comments
155                    final int cmtI = line.indexOf(ResourceConstants.CMT);
156                    if (cmtI >= 0) {
157                        line = line.substring(0, cmtI);
158                    }
159
160                    // trim leading-trailing whitespace
161                    line = line.trim();
162
163                    if (line.isEmpty()) {
164                        continue; // empty lines can be safely skipped
165                    }
166
167                    // split it up
168                    final String[] parts = ResourceConstants.SPACES.split(line);
169
170                    if (parts.length != 3) {
171                        throw new IllegalArgumentException("Malformed line '" + rawLine +
172                                "' in language resource '" + languageRulesResourceName + "'");
173                    }
174
175                    final Pattern pattern = Pattern.compile(parts[0]);
176                    final String[] langs = PLUS.split(parts[1]);
177                    final boolean accept = parts[2].equals("true");
178
179                    rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
180                }
181            }
182        }
183        return new Lang(rules, languages);
184    }
185
186    private final Languages languages;
187
188    private final List<LangRule> rules;
189
190    private Lang(final List<LangRule> rules, final Languages languages) {
191        this.rules = Collections.unmodifiableList(rules);
192        this.languages = languages;
193    }
194
195    /**
196     * Guesses the language of a word.
197     *
198     * @param text
199     *            the word
200     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
201     */
202    public String guessLanguage(final String text) {
203        final Languages.LanguageSet ls = guessLanguages(text);
204        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
205    }
206
207    /**
208     * Guesses the languages of a word.
209     *
210     * @param input
211     *            the word
212     * @return a Set of Strings of language names that are potential matches for the input word
213     */
214    public Languages.LanguageSet guessLanguages(final String input) {
215        final String text = input.toLowerCase(Locale.ENGLISH);
216        final Set<String> langs = new HashSet<>(languages.getLanguages());
217        rules.forEach(rule -> {
218            if (rule.matches(text)) {
219                if (rule.acceptOnMatch) {
220                    langs.retainAll(rule.languages);
221                } else {
222                    langs.removeAll(rule.languages);
223                }
224            }
225        });
226        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
227        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
228    }
229}