001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.io.InputStream;
021import java.util.ArrayList;
022import java.util.Arrays;
023import java.util.Collections;
024import java.util.EnumMap;
025import java.util.HashSet;
026import java.util.List;
027import java.util.Locale;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Pattern;
032
033/**
034 * Language guessing utility.
035 * <p>
036 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
037 * done by reference to a whole series of rules distributed in resource files.
038 * <p>
039 * Instances of this class are typically managed through the static factory method instance().
040 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
041 * <p>
042 * This class is intended to be immutable and thread-safe.
043 * <p>
044 * <b>Lang resources</b>
045 * <p>
046 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
047 * They are systematically named following the pattern:
048 * <blockquote>org/apache/commons/codec/language/bm/${nameType}_lang.txt</blockquote>
049 * The format of these resources is the following:
050 * <ul>
051 * <li><b>Rules:</b> whitespace separated strings.
052 * There should be 3 columns to each row, and these will be interpreted as:
053 * <ol>
054 * <li>pattern: a regular expression.</li>
055 * <li>languages: a '+'-separated list of languages.</li>
056 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
057 * </ol>
058 * </li>
059 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
060 * discarded as a comment.</li>
061 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
062 * This will skip all content until a line ending in '*' and '/' is found.</li>
063 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
064 * </ul>
065 * <p>
066 * Port of lang.php
067 *
068 * @since 1.6
069 * @version $Id$
070 */
071public class Lang {
072    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
073    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
074    // encapsulate a particular language-guessing rule table and the language guessing itself.
075    //
076    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
077    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
078    // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
079
080    private static final class LangRule {
081        private final boolean acceptOnMatch;
082        private final Set<String> languages;
083        private final Pattern pattern;
084
085        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
086            this.pattern = pattern;
087            this.languages = languages;
088            this.acceptOnMatch = acceptOnMatch;
089        }
090
091        public boolean matches(final String txt) {
092            return this.pattern.matcher(txt).find();
093        }
094    }
095
096    private static final Map<NameType, Lang> Langs = new EnumMap<>(NameType.class);
097
098    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
099
100    static {
101        for (final NameType s : NameType.values()) {
102            Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
103        }
104    }
105
106    /**
107     * Gets a Lang instance for one of the supported NameTypes.
108     *
109     * @param nameType
110     *            the NameType to look up
111     * @return a Lang encapsulating the language guessing rules for that name type
112     */
113    public static Lang instance(final NameType nameType) {
114        return Langs.get(nameType);
115    }
116
117    /**
118     * Loads language rules from a resource.
119     * <p>
120     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121     * You will only need to call this yourself if you are developing custom language mapping rules.
122     *
123     * @param languageRulesResourceName
124     *            the fully-qualified resource name to load
125     * @param languages
126     *            the languages that these rules will support
127     * @return a Lang encapsulating the loaded language-guessing rules.
128     */
129    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130        final List<LangRule> rules = new ArrayList<>();
131        final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132
133        if (lRulesIS == null) {
134            throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135        }
136
137        try (final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING)) {
138            boolean inExtendedComment = false;
139            while (scanner.hasNextLine()) {
140                final String rawLine = scanner.nextLine();
141                String line = rawLine;
142                if (inExtendedComment) {
143                    // check for closing comment marker, otherwise discard doc comment line
144                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
145                        inExtendedComment = false;
146                    }
147                } else {
148                    if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
149                        inExtendedComment = true;
150                    } else {
151                        // discard comments
152                        final int cmtI = line.indexOf(ResourceConstants.CMT);
153                        if (cmtI >= 0) {
154                            line = line.substring(0, cmtI);
155                        }
156
157                        // trim leading-trailing whitespace
158                        line = line.trim();
159
160                        if (line.length() == 0) {
161                            continue; // empty lines can be safely skipped
162                        }
163
164                        // split it up
165                        final String[] parts = line.split("\\s+");
166
167                        if (parts.length != 3) {
168                            throw new IllegalArgumentException("Malformed line '" + rawLine +
169                                    "' in language resource '" + languageRulesResourceName + "'");
170                        }
171
172                        final Pattern pattern = Pattern.compile(parts[0]);
173                        final String[] langs = parts[1].split("\\+");
174                        final boolean accept = parts[2].equals("true");
175
176                        rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
177                    }
178                }
179            }
180        }
181        return new Lang(rules, languages);
182    }
183
184    private final Languages languages;
185    private final List<LangRule> rules;
186
187    private Lang(final List<LangRule> rules, final Languages languages) {
188        this.rules = Collections.unmodifiableList(rules);
189        this.languages = languages;
190    }
191
192    /**
193     * Guesses the language of a word.
194     *
195     * @param text
196     *            the word
197     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
198     */
199    public String guessLanguage(final String text) {
200        final Languages.LanguageSet ls = guessLanguages(text);
201        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
202    }
203
204    /**
205     * Guesses the languages of a word.
206     *
207     * @param input
208     *            the word
209     * @return a Set of Strings of language names that are potential matches for the input word
210     */
211    public Languages.LanguageSet guessLanguages(final String input) {
212        final String text = input.toLowerCase(Locale.ENGLISH);
213
214        final Set<String> langs = new HashSet<>(this.languages.getLanguages());
215        for (final LangRule rule : this.rules) {
216            if (rule.matches(text)) {
217                if (rule.acceptOnMatch) {
218                    langs.retainAll(rule.languages);
219                } else {
220                    langs.removeAll(rule.languages);
221                }
222            }
223        }
224
225        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
226        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
227    }
228}