1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.EnumMap;
25 import java.util.HashSet;
26 import java.util.List;
27 import java.util.Locale;
28 import java.util.Map;
29 import java.util.Scanner;
30 import java.util.Set;
31 import java.util.regex.Pattern;
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71 public class Lang {
72
73
74
75
76
77
78
79
80 private static final class LangRule {
81 private final boolean acceptOnMatch;
82 private final Set<String> languages;
83 private final Pattern pattern;
84
85 private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
86 this.pattern = pattern;
87 this.languages = languages;
88 this.acceptOnMatch = acceptOnMatch;
89 }
90
91 public boolean matches(final String txt) {
92 return this.pattern.matcher(txt).find();
93 }
94 }
95
96 private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
97
98 private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
99
100 static {
101 for (final NameType s : NameType.values()) {
102 Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
103 }
104 }
105
106
107
108
109
110
111
112
113 public static Lang instance(final NameType nameType) {
114 return Langs.get(nameType);
115 }
116
117
118
119
120
121
122
123
124
125
126
127
128
129 public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130 final List<LangRule> rules = new ArrayList<LangRule>();
131 final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132
133 if (lRulesIS == null) {
134 throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135 }
136
137 final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
138 boolean inExtendedComment = false;
139 while (scanner.hasNextLine()) {
140 final String rawLine = scanner.nextLine();
141 String line = rawLine;
142
143 if (inExtendedComment) {
144
145 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
146 inExtendedComment = false;
147 }
148 } else {
149 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150 inExtendedComment = true;
151 } else {
152
153 final int cmtI = line.indexOf(ResourceConstants.CMT);
154 if (cmtI >= 0) {
155 line = line.substring(0, cmtI);
156 }
157
158
159 line = line.trim();
160
161 if (line.length() == 0) {
162 continue;
163 }
164
165
166 final String[] parts = line.split("\\s+");
167
168 if (parts.length != 3) {
169 throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
170 languageRulesResourceName + "'");
171 }
172
173 final Pattern pattern = Pattern.compile(parts[0]);
174 final String[] langs = parts[1].split("\\+");
175 final boolean accept = parts[2].equals("true");
176
177 rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
178 }
179 }
180 }
181
182 return new Lang(rules, languages);
183 }
184
185 private final Languages languages;
186 private final List<LangRule> rules;
187
188 private Lang(final List<LangRule> rules, final Languages languages) {
189 this.rules = Collections.unmodifiableList(rules);
190 this.languages = languages;
191 }
192
193
194
195
196
197
198
199
200 public String guessLanguage(final String text) {
201 final Languages.LanguageSet ls = guessLanguages(text);
202 return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203 }
204
205
206
207
208
209
210
211
212 public Languages.LanguageSet guessLanguages(final String input) {
213 final String text = input.toLowerCase(Locale.ENGLISH);
214
215 final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
216 for (final LangRule rule : this.rules) {
217 if (rule.matches(text)) {
218 if (rule.acceptOnMatch) {
219 langs.retainAll(rule.languages);
220 } else {
221 langs.removeAll(rule.languages);
222 }
223 }
224 }
225
226 final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
227 return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
228 }
229 }