1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.EnumMap;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Locale;
27 import java.util.Map;
28 import java.util.Scanner;
29 import java.util.Set;
30 import java.util.regex.Pattern;
31
32 import org.apache.commons.codec.Resources;
33
34 /**
35 * Language guessing utility.
36 * <p>
37 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
38 * done by reference to a whole series of rules distributed in resource files.
39 * </p>
40 * <p>
41 * Instances of this class are typically managed through the static factory method instance().
42 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
43 * </p>
44 * <p>
45 * This class is intended to be immutable and thread-safe.
46 * </p>
47 * <h2>Lang resources</h2>
48 * <p>
 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
 * They are systematically named following the pattern below, where <em>nameType</em> is the value of
 * {@link NameType#getName()}:
 * </p>
 * <blockquote>org/apache/commons/codec/language/bm/<em>nameType</em>_lang.txt</blockquote>
53 * <p>
54 * The format of these resources is the following:
55 * </p>
56 * <ul>
57 * <li><strong>Rules:</strong> whitespace separated strings.
58 * There should be 3 columns to each row, and these will be interpreted as:
59 * <ol>
60 * <li>pattern: a regular expression.</li>
61 * <li>languages: a '+'-separated list of languages.</li>
62 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
63 * </ol>
64 * </li>
65 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
66 * discarded as a comment.</li>
67 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
68 * This will skip all content until a line ending in '*' and '/' is found.</li>
69 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
70 * </ul>
71 * <p>
72 * Port of lang.php
73 * </p>
74 *
75 * @since 1.6
76 */
77 public class Lang {
78 // Implementation note: This class is divided into two sections. The first part is a static factory interface that
79 // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
80 // encapsulate a particular language-guessing rule table and the language guessing itself.
81 //
82 // It may make sense in the future to expose the private constructor to allow power users to build custom language-
83 // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
84 // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.
85
86 private static final class LangRule {
87 private final boolean acceptOnMatch;
88 private final Set<String> languages;
89 private final Pattern pattern;
90
91 private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
92 this.pattern = pattern;
93 this.languages = languages;
94 this.acceptOnMatch = acceptOnMatch;
95 }
96
97 public boolean matches(final String txt) {
98 return pattern.matcher(txt).find();
99 }
100 }
101
102 private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);
103
104 private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";
105
106 private static final Pattern PLUS = Pattern.compile("\\+");
107
108 static {
109 for (final NameType s : NameType.values()) {
110 LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
111 }
112 }
113
114 /**
115 * Gets a Lang instance for one of the supported NameTypes.
116 *
117 * @param nameType
118 * the NameType to look up
119 * @return a Lang encapsulating the language guessing rules for that name type
120 */
121 public static Lang instance(final NameType nameType) {
122 return LANGS.get(nameType);
123 }
124
125 /**
126 * Loads language rules from a resource.
127 * <p>
128 * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
129 * You will only need to call this yourself if you are developing custom language mapping rules.
130 * </p>
131 *
132 * @param languageRulesResourceName
133 * the fully-qualified resource name to load
134 * @param languages
135 * the languages that these rules will support
136 * @return a Lang encapsulating the loaded language-guessing rules.
137 */
138 public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
139 final List<LangRule> rules = new ArrayList<>();
140 try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
141 ResourceConstants.ENCODING)) {
142 boolean inExtendedComment = false;
143 while (scanner.hasNextLine()) {
144 final String rawLine = scanner.nextLine();
145 String line = rawLine;
146 if (inExtendedComment) {
147 // check for closing comment marker, otherwise discard doc comment line
148 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
149 inExtendedComment = false;
150 }
151 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
152 inExtendedComment = true;
153 } else {
154 // discard comments
155 final int cmtI = line.indexOf(ResourceConstants.CMT);
156 if (cmtI >= 0) {
157 line = line.substring(0, cmtI);
158 }
159
160 // trim leading-trailing whitespace
161 line = line.trim();
162
163 if (line.isEmpty()) {
164 continue; // empty lines can be safely skipped
165 }
166
167 // split it up
168 final String[] parts = ResourceConstants.SPACES.split(line);
169
170 if (parts.length != 3) {
171 throw new IllegalArgumentException("Malformed line '" + rawLine +
172 "' in language resource '" + languageRulesResourceName + "'");
173 }
174
175 final Pattern pattern = Pattern.compile(parts[0]);
176 final String[] langs = PLUS.split(parts[1]);
177 final boolean accept = parts[2].equals("true");
178
179 rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
180 }
181 }
182 }
183 return new Lang(rules, languages);
184 }
185
186 private final Languages languages;
187
188 private final List<LangRule> rules;
189
190 private Lang(final List<LangRule> rules, final Languages languages) {
191 this.rules = Collections.unmodifiableList(rules);
192 this.languages = languages;
193 }
194
195 /**
196 * Guesses the language of a word.
197 *
198 * @param text
199 * the word
200 * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
201 */
202 public String guessLanguage(final String text) {
203 final Languages.LanguageSet ls = guessLanguages(text);
204 return ls.isSingleton() ? ls.getAny() : Languages.ANY;
205 }
206
207 /**
208 * Guesses the languages of a word.
209 *
210 * @param input
211 * the word
212 * @return a Set of Strings of language names that are potential matches for the input word
213 */
214 public Languages.LanguageSet guessLanguages(final String input) {
215 final String text = input.toLowerCase(Locale.ENGLISH);
216 final Set<String> langs = new HashSet<>(languages.getLanguages());
217 rules.forEach(rule -> {
218 if (rule.matches(text)) {
219 if (rule.acceptOnMatch) {
220 langs.retainAll(rule.languages);
221 } else {
222 langs.removeAll(rule.languages);
223 }
224 }
225 });
226 final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
227 return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
228 }
229 }