001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.io.InputStream;
021import java.util.ArrayList;
022import java.util.Arrays;
023import java.util.Collections;
024import java.util.EnumMap;
025import java.util.HashSet;
026import java.util.List;
027import java.util.Locale;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Pattern;
032
033/**
034 * Language guessing utility.
035 * <p>
036 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
037 * done by reference to a whole series of rules distributed in resource files.
038 * <p>
039 * Instances of this class are typically managed through the static factory method instance().
040 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
041 * <p>
042 * This class is intended to be immutable and thread-safe.
043 * <p>
044 * <b>Lang resources</b>
045 * <p>
046 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
047 * They are systematically named following the pattern:
 * <blockquote>org/apache/commons/codec/language/bm/${nameType}_lang.txt</blockquote>
 * where <code>${nameType}</code> is the value returned by {@link NameType#getName()}.
049 * The format of these resources is the following:
050 * <ul>
051 * <li><b>Rules:</b> whitespace separated strings.
052 * There should be 3 columns to each row, and these will be interpreted as:
053 * <ol>
054 * <li>pattern: a regular expression.</li>
055 * <li>languages: a '+'-separated list of languages.</li>
056 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
057 * </ol>
058 * </li>
059 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
060 * discarded as a comment.</li>
061 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
062 * This will skip all content until a line ending in '*' and '/' is found.</li>
063 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
064 * </ul>
065 * <p>
066 * Port of lang.php
067 *
068 * @since 1.6
069 * @version $Id: Lang.html 928559 2014-11-10 02:53:54Z ggregory $
070 */
071public class Lang {
072    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
073    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
074    // encapsulate a particular language-guessing rule table and the language guessing itself.
075    //
076    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
077    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
078    // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
079
080    private static final class LangRule {
081        private final boolean acceptOnMatch;
082        private final Set<String> languages;
083        private final Pattern pattern;
084
085        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
086            this.pattern = pattern;
087            this.languages = languages;
088            this.acceptOnMatch = acceptOnMatch;
089        }
090
091        public boolean matches(final String txt) {
092            return this.pattern.matcher(txt).find();
093        }
094    }
095
096    private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
097
098    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
099
100    static {
101        for (final NameType s : NameType.values()) {
102            Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
103        }
104    }
105
106    /**
107     * Gets a Lang instance for one of the supported NameTypes.
108     *
109     * @param nameType
110     *            the NameType to look up
111     * @return a Lang encapsulating the language guessing rules for that name type
112     */
113    public static Lang instance(final NameType nameType) {
114        return Langs.get(nameType);
115    }
116
117    /**
118     * Loads language rules from a resource.
119     * <p>
120     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121     * You will only need to call this yourself if you are developing custom language mapping rules.
122     *
123     * @param languageRulesResourceName
124     *            the fully-qualified resource name to load
125     * @param languages
126     *            the languages that these rules will support
127     * @return a Lang encapsulating the loaded language-guessing rules.
128     */
129    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130        final List<LangRule> rules = new ArrayList<LangRule>();
131        final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132
133        if (lRulesIS == null) {
134            throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135        }
136
137        final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
138        try {
139            boolean inExtendedComment = false;
140            while (scanner.hasNextLine()) {
141                final String rawLine = scanner.nextLine();
142                String line = rawLine;
143                if (inExtendedComment) {
144                    // check for closing comment marker, otherwise discard doc comment line
145                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
146                        inExtendedComment = false;
147                    }
148                } else {
149                    if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150                        inExtendedComment = true;
151                    } else {
152                        // discard comments
153                        final int cmtI = line.indexOf(ResourceConstants.CMT);
154                        if (cmtI >= 0) {
155                            line = line.substring(0, cmtI);
156                        }
157
158                        // trim leading-trailing whitespace
159                        line = line.trim();
160
161                        if (line.length() == 0) {
162                            continue; // empty lines can be safely skipped
163                        }
164
165                        // split it up
166                        final String[] parts = line.split("\\s+");
167
168                        if (parts.length != 3) {
169                            throw new IllegalArgumentException("Malformed line '" + rawLine +
170                                    "' in language resource '" + languageRulesResourceName + "'");
171                        }
172
173                        final Pattern pattern = Pattern.compile(parts[0]);
174                        final String[] langs = parts[1].split("\\+");
175                        final boolean accept = parts[2].equals("true");
176
177                        rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
178                    }
179                }
180            }
181        } finally {
182            scanner.close();
183        }
184        return new Lang(rules, languages);
185    }
186
187    private final Languages languages;
188    private final List<LangRule> rules;
189
190    private Lang(final List<LangRule> rules, final Languages languages) {
191        this.rules = Collections.unmodifiableList(rules);
192        this.languages = languages;
193    }
194
195    /**
196     * Guesses the language of a word.
197     *
198     * @param text
199     *            the word
200     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
201     */
202    public String guessLanguage(final String text) {
203        final Languages.LanguageSet ls = guessLanguages(text);
204        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
205    }
206
207    /**
208     * Guesses the languages of a word.
209     *
210     * @param input
211     *            the word
212     * @return a Set of Strings of language names that are potential matches for the input word
213     */
214    public Languages.LanguageSet guessLanguages(final String input) {
215        final String text = input.toLowerCase(Locale.ENGLISH);
216
217        final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
218        for (final LangRule rule : this.rules) {
219            if (rule.matches(text)) {
220                if (rule.acceptOnMatch) {
221                    langs.retainAll(rule.languages);
222                } else {
223                    langs.removeAll(rule.languages);
224                }
225            }
226        }
227
228        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
229        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
230    }
231}