001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.EnumMap;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Locale;
027import java.util.Map;
028import java.util.Scanner;
029import java.util.Set;
030import java.util.regex.Pattern;
031
032import org.apache.commons.codec.Resources;
033
034/**
035 * Language guessing utility.
036 * <p>
037 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
038 * done by reference to a whole series of rules distributed in resource files.
039 * <p>
040 * Instances of this class are typically managed through the static factory method instance().
041 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
042 * <p>
043 * This class is intended to be immutable and thread-safe.
044 * <p>
045 * <b>Lang resources</b>
046 * <p>
047 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
048 * They are systematically named following the pattern:
049 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_lang.txt</blockquote>
050 * The format of these resources is the following:
051 * <ul>
052 * <li><b>Rules:</b> whitespace separated strings.
053 * There should be 3 columns to each row, and these will be interpreted as:
054 * <ol>
055 * <li>pattern: a regular expression.</li>
056 * <li>languages: a '+'-separated list of languages.</li>
057 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
058 * </ol>
059 * </li>
060 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
061 * discarded as a comment.</li>
062 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
063 * This will skip all content until a line ending in '*' and '/' is found.</li>
064 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
065 * </ul>
066 * <p>
067 * Port of lang.php
068 *
069 * @since 1.6
070 */
071public class Lang {
072    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
073    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
074    // encapsulate a particular language-guessing rule table and the language guessing itself.
075    //
076    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
077    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
078    // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
079
080    private static final class LangRule {
081        private final boolean acceptOnMatch;
082        private final Set<String> languages;
083        private final Pattern pattern;
084
085        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
086            this.pattern = pattern;
087            this.languages = languages;
088            this.acceptOnMatch = acceptOnMatch;
089        }
090
091        public boolean matches(final String txt) {
092            return this.pattern.matcher(txt).find();
093        }
094    }
095
096    private static final Map<NameType, Lang> Langs = new EnumMap<>(NameType.class);
097
098    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
099
100    static {
101        for (final NameType s : NameType.values()) {
102            Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
103        }
104    }
105
106    /**
107     * Gets a Lang instance for one of the supported NameTypes.
108     *
109     * @param nameType
110     *            the NameType to look up
111     * @return a Lang encapsulating the language guessing rules for that name type
112     */
113    public static Lang instance(final NameType nameType) {
114        return Langs.get(nameType);
115    }
116
117    /**
118     * Loads language rules from a resource.
119     * <p>
120     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121     * You will only need to call this yourself if you are developing custom language mapping rules.
122     *
123     * @param languageRulesResourceName
124     *            the fully-qualified resource name to load
125     * @param languages
126     *            the languages that these rules will support
127     * @return a Lang encapsulating the loaded language-guessing rules.
128     */
129    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130        final List<LangRule> rules = new ArrayList<>();
131        try (final Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
132                ResourceConstants.ENCODING)) {
133            boolean inExtendedComment = false;
134            while (scanner.hasNextLine()) {
135                final String rawLine = scanner.nextLine();
136                String line = rawLine;
137                if (inExtendedComment) {
138                    // check for closing comment marker, otherwise discard doc comment line
139                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
140                        inExtendedComment = false;
141                    }
142                } else {
143                    if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
144                        inExtendedComment = true;
145                    } else {
146                        // discard comments
147                        final int cmtI = line.indexOf(ResourceConstants.CMT);
148                        if (cmtI >= 0) {
149                            line = line.substring(0, cmtI);
150                        }
151
152                        // trim leading-trailing whitespace
153                        line = line.trim();
154
155                        if (line.length() == 0) {
156                            continue; // empty lines can be safely skipped
157                        }
158
159                        // split it up
160                        final String[] parts = line.split("\\s+");
161
162                        if (parts.length != 3) {
163                            throw new IllegalArgumentException("Malformed line '" + rawLine +
164                                    "' in language resource '" + languageRulesResourceName + "'");
165                        }
166
167                        final Pattern pattern = Pattern.compile(parts[0]);
168                        final String[] langs = parts[1].split("\\+");
169                        final boolean accept = parts[2].equals("true");
170
171                        rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
172                    }
173                }
174            }
175        }
176        return new Lang(rules, languages);
177    }
178
179    private final Languages languages;
180    private final List<LangRule> rules;
181
182    private Lang(final List<LangRule> rules, final Languages languages) {
183        this.rules = Collections.unmodifiableList(rules);
184        this.languages = languages;
185    }
186
187    /**
188     * Guesses the language of a word.
189     *
190     * @param text
191     *            the word
192     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
193     */
194    public String guessLanguage(final String text) {
195        final Languages.LanguageSet ls = guessLanguages(text);
196        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
197    }
198
199    /**
200     * Guesses the languages of a word.
201     *
202     * @param input
203     *            the word
204     * @return a Set of Strings of language names that are potential matches for the input word
205     */
206    public Languages.LanguageSet guessLanguages(final String input) {
207        final String text = input.toLowerCase(Locale.ENGLISH);
208
209        final Set<String> langs = new HashSet<>(this.languages.getLanguages());
210        for (final LangRule rule : this.rules) {
211            if (rule.matches(text)) {
212                if (rule.acceptOnMatch) {
213                    langs.retainAll(rule.languages);
214                } else {
215                    langs.removeAll(rule.languages);
216                }
217            }
218        }
219
220        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
221        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
222    }
223}