1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.EnumMap;
25 import java.util.HashSet;
26 import java.util.List;
27 import java.util.Locale;
28 import java.util.Map;
29 import java.util.Scanner;
30 import java.util.Set;
31 import java.util.regex.Pattern;
32
33 /**
34 * Language guessing utility.
35 * <p>
36 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
37 * done by reference to a whole series of rules distributed in resource files.
38 * <p>
39 * Instances of this class are typically managed through the static factory method instance().
40 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
41 * <p>
42 * This class is intended to be immutable and thread-safe.
43 * <p>
44 * <b>Lang resources</b>
45 * <p>
46 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
47 * They are systematically named following the pattern:
48 * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
49 * The format of these resources is the following:
50 * <ul>
51 * <li><b>Rules:</b> whitespace separated strings.
52 * There should be 3 columns to each row, and these will be interpreted as:
53 * <ol>
54 * <li>pattern: a regular expression.</li>
55 * <li>languages: a '+'-separated list of languages.</li>
56 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
57 * </ol>
58 * </li>
59 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
60 * discarded as a comment.</li>
61 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
62 * This will skip all content until a line ending in '*' and '/' is found.</li>
63 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
64 * </ul>
65 * <p>
66 * Port of lang.php
67 *
68 * @since 1.6
69 * @version $Id: Lang.html 889935 2013-12-11 05:05:13Z ggregory $
70 */
71 public class Lang {
72 // Implementation note: This class is divided into two sections. The first part is a static factory interface that
73 // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
74 // encapsulate a particular language-guessing rule table and the language guessing itself.
75 //
76 // It may make sense in the future to expose the private constructor to allow power users to build custom language-
77 // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
78 // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
79
80 private static final class LangRule {
81 private final boolean acceptOnMatch;
82 private final Set<String> languages;
83 private final Pattern pattern;
84
85 private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
86 this.pattern = pattern;
87 this.languages = languages;
88 this.acceptOnMatch = acceptOnMatch;
89 }
90
91 public boolean matches(final String txt) {
92 return this.pattern.matcher(txt).find();
93 }
94 }
95
96 private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
97
98 private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
99
100 static {
101 for (final NameType s : NameType.values()) {
102 Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
103 }
104 }
105
106 /**
107 * Gets a Lang instance for one of the supported NameTypes.
108 *
109 * @param nameType
110 * the NameType to look up
111 * @return a Lang encapsulating the language guessing rules for that name type
112 */
113 public static Lang instance(final NameType nameType) {
114 return Langs.get(nameType);
115 }
116
117 /**
118 * Loads language rules from a resource.
119 * <p>
120 * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121 * You will only need to call this yourself if you are developing custom language mapping rules.
122 *
123 * @param languageRulesResourceName
124 * the fully-qualified resource name to load
125 * @param languages
126 * the languages that these rules will support
127 * @return a Lang encapsulating the loaded language-guessing rules.
128 */
129 public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130 final List<LangRule> rules = new ArrayList<LangRule>();
131 final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132
133 if (lRulesIS == null) {
134 throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135 }
136
137 final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
138 boolean inExtendedComment = false;
139 while (scanner.hasNextLine()) {
140 final String rawLine = scanner.nextLine();
141 String line = rawLine;
142
143 if (inExtendedComment) {
144 // check for closing comment marker, otherwise discard doc comment line
145 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
146 inExtendedComment = false;
147 }
148 } else {
149 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150 inExtendedComment = true;
151 } else {
152 // discard comments
153 final int cmtI = line.indexOf(ResourceConstants.CMT);
154 if (cmtI >= 0) {
155 line = line.substring(0, cmtI);
156 }
157
158 // trim leading-trailing whitespace
159 line = line.trim();
160
161 if (line.length() == 0) {
162 continue; // empty lines can be safely skipped
163 }
164
165 // split it up
166 final String[] parts = line.split("\\s+");
167
168 if (parts.length != 3) {
169 throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
170 languageRulesResourceName + "'");
171 }
172
173 final Pattern pattern = Pattern.compile(parts[0]);
174 final String[] langs = parts[1].split("\\+");
175 final boolean accept = parts[2].equals("true");
176
177 rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
178 }
179 }
180 }
181
182 return new Lang(rules, languages);
183 }
184
185 private final Languages languages;
186 private final List<LangRule> rules;
187
188 private Lang(final List<LangRule> rules, final Languages languages) {
189 this.rules = Collections.unmodifiableList(rules);
190 this.languages = languages;
191 }
192
193 /**
194 * Guesses the language of a word.
195 *
196 * @param text
197 * the word
198 * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
199 */
200 public String guessLanguage(final String text) {
201 final Languages.LanguageSet ls = guessLanguages(text);
202 return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203 }
204
205 /**
206 * Guesses the languages of a word.
207 *
208 * @param input
209 * the word
210 * @return a Set of Strings of language names that are potential matches for the input word
211 */
212 public Languages.LanguageSet guessLanguages(final String input) {
213 final String text = input.toLowerCase(Locale.ENGLISH);
214
215 final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
216 for (final LangRule rule : this.rules) {
217 if (rule.matches(text)) {
218 if (rule.acceptOnMatch) {
219 langs.retainAll(rule.languages);
220 } else {
221 langs.removeAll(rule.languages);
222 }
223 }
224 }
225
226 final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
227 return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
228 }
229 }