001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language.bm;
019    
020    import java.io.InputStream;
021    import java.util.ArrayList;
022    import java.util.Arrays;
023    import java.util.Collections;
024    import java.util.EnumMap;
025    import java.util.HashSet;
026    import java.util.List;
027    import java.util.Locale;
028    import java.util.Map;
029    import java.util.Scanner;
030    import java.util.Set;
031    import java.util.regex.Pattern;
032    
033    /**
034     * Language guessing utility.
035     * <p>
036     * This class encapsulates rules used to guess the possible languages that a word originates from. This is
037     * done by reference to a whole series of rules distributed in resource files.
038     * <p>
039     * Instances of this class are typically managed through the static factory method instance().
040     * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
041     * <p>
042     * This class is intended to be immutable and thread-safe.
043     * <p>
044     * <b>Lang resources</b>
045     * <p>
046     * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
047     * They are systematically named following the pattern:
048     * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
049     * The format of these resources is the following:
050     * <ul>
051     * <li><b>Rules:</b> whitespace separated strings.
052     * There should be 3 columns to each row, and these will be interpreted as:
053     * <ol>
054     * <li>pattern: a regular expression.</li>
055     * <li>languages: a '+'-separated list of languages.</li>
056     * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
057     * </ol>
058     * </li>
059     * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
060     * discarded as a comment.</li>
061     * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
062     * This will skip all content until a line ending in '*' and '/' is found.</li>
063     * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
064     * </ul>
065     * <p>
066     * Port of lang.php
067     *
068     * @since 1.6
069     * @version $Id: Lang.html 889935 2013-12-11 05:05:13Z ggregory $
070     */
071    public class Lang {
072        // Implementation note: This class is divided into two sections. The first part is a static factory interface that
073        // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
074        // encapsulate a particular language-guessing rule table and the language guessing itself.
075        //
076        // It may make sense in the future to expose the private constructor to allow power users to build custom language-
077        // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
078        // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
079    
080        private static final class LangRule {
081            private final boolean acceptOnMatch;
082            private final Set<String> languages;
083            private final Pattern pattern;
084    
085            private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
086                this.pattern = pattern;
087                this.languages = languages;
088                this.acceptOnMatch = acceptOnMatch;
089            }
090    
091            public boolean matches(final String txt) {
092                return this.pattern.matcher(txt).find();
093            }
094        }
095    
096        private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
097    
098        private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
099    
100        static {
101            for (final NameType s : NameType.values()) {
102                Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
103            }
104        }
105    
106        /**
107         * Gets a Lang instance for one of the supported NameTypes.
108         *
109         * @param nameType
110         *            the NameType to look up
111         * @return a Lang encapsulating the language guessing rules for that name type
112         */
113        public static Lang instance(final NameType nameType) {
114            return Langs.get(nameType);
115        }
116    
117        /**
118         * Loads language rules from a resource.
119         * <p>
120         * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121         * You will only need to call this yourself if you are developing custom language mapping rules.
122         *
123         * @param languageRulesResourceName
124         *            the fully-qualified resource name to load
125         * @param languages
126         *            the languages that these rules will support
127         * @return a Lang encapsulating the loaded language-guessing rules.
128         */
129        public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130            final List<LangRule> rules = new ArrayList<LangRule>();
131            final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132    
133            if (lRulesIS == null) {
134                throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135            }
136    
137            final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
138            boolean inExtendedComment = false;
139            while (scanner.hasNextLine()) {
140                final String rawLine = scanner.nextLine();
141                String line = rawLine;
142    
143                if (inExtendedComment) {
144                    // check for closing comment marker, otherwise discard doc comment line
145                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
146                        inExtendedComment = false;
147                    }
148                } else {
149                    if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150                        inExtendedComment = true;
151                    } else {
152                        // discard comments
153                        final int cmtI = line.indexOf(ResourceConstants.CMT);
154                        if (cmtI >= 0) {
155                            line = line.substring(0, cmtI);
156                        }
157    
158                        // trim leading-trailing whitespace
159                        line = line.trim();
160    
161                        if (line.length() == 0) {
162                            continue; // empty lines can be safely skipped
163                        }
164    
165                        // split it up
166                        final String[] parts = line.split("\\s+");
167    
168                        if (parts.length != 3) {
169                            throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
170                                                               languageRulesResourceName + "'");
171                        }
172    
173                        final Pattern pattern = Pattern.compile(parts[0]);
174                        final String[] langs = parts[1].split("\\+");
175                        final boolean accept = parts[2].equals("true");
176    
177                        rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
178                    }
179                }
180            }
181    
182            return new Lang(rules, languages);
183        }
184    
185        private final Languages languages;
186        private final List<LangRule> rules;
187    
188        private Lang(final List<LangRule> rules, final Languages languages) {
189            this.rules = Collections.unmodifiableList(rules);
190            this.languages = languages;
191        }
192    
193        /**
194         * Guesses the language of a word.
195         *
196         * @param text
197         *            the word
198         * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
199         */
200        public String guessLanguage(final String text) {
201            final Languages.LanguageSet ls = guessLanguages(text);
202            return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203        }
204    
205        /**
206         * Guesses the languages of a word.
207         *
208         * @param input
209         *            the word
210         * @return a Set of Strings of language names that are potential matches for the input word
211         */
212        public Languages.LanguageSet guessLanguages(final String input) {
213            final String text = input.toLowerCase(Locale.ENGLISH);
214    
215            final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
216            for (final LangRule rule : this.rules) {
217                if (rule.matches(text)) {
218                    if (rule.acceptOnMatch) {
219                        langs.retainAll(rule.languages);
220                    } else {
221                        langs.removeAll(rule.languages);
222                    }
223                }
224            }
225    
226            final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
227            return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
228        }
229    }