1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.EnumMap;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Scanner;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80 public class Rule {
81
82 public static final class Phoneme implements PhonemeExpr {
83 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
84 @Override
85 public int compare(Phoneme o1, Phoneme o2) {
86 for (int i = 0; i < o1.phonemeText.length(); i++) {
87 if (i >= o2.phonemeText.length()) {
88 return +1;
89 }
90 int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
91 if (c != 0) {
92 return c;
93 }
94 }
95
96 if (o1.phonemeText.length() < o2.phonemeText.length()) {
97 return -1;
98 }
99
100 return 0;
101 }
102 };
103
104 private final CharSequence phonemeText;
105 private final Languages.LanguageSet languages;
106
107 public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) {
108 this.phonemeText = phonemeText;
109 this.languages = languages;
110 }
111
112 public Phoneme append(CharSequence str) {
113 return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114 }
115
116 public Languages.LanguageSet getLanguages() {
117 return this.languages;
118 }
119
120 @Override
121 public Iterable<Phoneme> getPhonemes() {
122 return Collections.singleton(this);
123 }
124
125 public CharSequence getPhonemeText() {
126 return this.phonemeText;
127 }
128
129 public Phoneme join(Phoneme right) {
130 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131 this.languages.restrictTo(right.languages));
132 }
133 }
134
135 public interface PhonemeExpr {
136 Iterable<Phoneme> getPhonemes();
137 }
138
139 public static final class PhonemeList implements PhonemeExpr {
140 private final List<Phoneme> phonemes;
141
142 public PhonemeList(List<Phoneme> phonemes) {
143 this.phonemes = phonemes;
144 }
145
146 @Override
147 public List<Phoneme> getPhonemes() {
148 return this.phonemes;
149 }
150 }
151
152
153
154
155 public static interface RPattern {
156 boolean isMatch(CharSequence input);
157 }
158
159 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160 @Override
161 public boolean isMatch(CharSequence input) {
162 return true;
163 }
164 };
165
166 public static final String ALL = "ALL";
167
168 private static final String DOUBLE_QUOTE = "\"";
169
170 private static final String HASH_INCLUDE = "#include";
171
172 private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173 new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174
175 static {
176 for (NameType s : NameType.values()) {
177 Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
178
179 for (RuleType rt : RuleType.values()) {
180 Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
181
182 Languages ls = Languages.getInstance(s);
183 for (String l : ls.getLanguages()) {
184 try {
185 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
186 } catch (IllegalStateException e) {
187 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
188 }
189 }
190 if (!rt.equals(RuleType.RULES)) {
191 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
192 }
193
194 rts.put(rt, Collections.unmodifiableMap(rs));
195 }
196
197 RULES.put(s, Collections.unmodifiableMap(rts));
198 }
199 }
200
201 private static boolean contains(CharSequence chars, char input) {
202 for (int i = 0; i < chars.length(); i++) {
203 if (chars.charAt(i) == input) {
204 return true;
205 }
206 }
207 return false;
208 }
209
210 private static String createResourceName(NameType nameType, RuleType rt, String lang) {
211 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
212 nameType.getName(), rt.getName(), lang);
213 }
214
215 private static Scanner createScanner(NameType nameType, RuleType rt, String lang) {
216 String resName = createResourceName(nameType, rt, lang);
217 InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
218
219 if (rulesIS == null) {
220 throw new IllegalArgumentException("Unable to load resource: " + resName);
221 }
222
223 return new Scanner(rulesIS, ResourceConstants.ENCODING);
224 }
225
226 private static Scanner createScanner(String lang) {
227 String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
228 InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
229
230 if (rulesIS == null) {
231 throw new IllegalArgumentException("Unable to load resource: " + resName);
232 }
233
234 return new Scanner(rulesIS, ResourceConstants.ENCODING);
235 }
236
237 private static boolean endsWith(CharSequence input, CharSequence suffix) {
238 if (suffix.length() > input.length()) {
239 return false;
240 }
241 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
242 if (input.charAt(i) != suffix.charAt(j)) {
243 return false;
244 }
245 }
246 return true;
247 }
248
249
250
251
252
253
254
255
256
257
258
259
260 public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) {
261 return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
262 getInstance(nameType, rt, Languages.ANY);
263 }
264
265
266
267
268
269
270
271
272
273
274
275
276 public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) {
277 List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
278
279 if (rules == null) {
280 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
281 nameType.getName(), rt.getName(), lang));
282 }
283
284 return rules;
285 }
286
287 private static Phoneme parsePhoneme(String ph) {
288 int open = ph.indexOf("[");
289 if (open >= 0) {
290 if (!ph.endsWith("]")) {
291 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
292 }
293 String before = ph.substring(0, open);
294 String in = ph.substring(open + 1, ph.length() - 1);
295 Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
296
297 return new Phoneme(before, Languages.LanguageSet.from(langs));
298 } else {
299 return new Phoneme(ph, Languages.ANY_LANGUAGE);
300 }
301 }
302
303 private static PhonemeExpr parsePhonemeExpr(String ph) {
304 if (ph.startsWith("(")) {
305 if (!ph.endsWith(")")) {
306 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
307 }
308
309 List<Phoneme> phs = new ArrayList<Phoneme>();
310 String body = ph.substring(1, ph.length() - 1);
311 for (String part : body.split("[|]")) {
312 phs.add(parsePhoneme(part));
313 }
314 if (body.startsWith("|") || body.endsWith("|")) {
315 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
316 }
317
318 return new PhonemeList(phs);
319 } else {
320 return parsePhoneme(ph);
321 }
322 }
323
324 private static List<Rule> parseRules(final Scanner scanner, final String location) {
325 List<Rule> lines = new ArrayList<Rule>();
326 int currentLine = 0;
327
328 boolean inMultilineComment = false;
329 while (scanner.hasNextLine()) {
330 currentLine++;
331 String rawLine = scanner.nextLine();
332 String line = rawLine;
333
334 if (inMultilineComment) {
335 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
336 inMultilineComment = false;
337 }
338 } else {
339 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
340 inMultilineComment = true;
341 } else {
342
343 int cmtI = line.indexOf(ResourceConstants.CMT);
344 if (cmtI >= 0) {
345 line = line.substring(0, cmtI);
346 }
347
348
349 line = line.trim();
350
351 if (line.length() == 0) {
352 continue;
353 }
354
355 if (line.startsWith(HASH_INCLUDE)) {
356
357 String incl = line.substring(HASH_INCLUDE.length()).trim();
358 if (incl.contains(" ")) {
359 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
360 location);
361 } else {
362 lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
363 }
364 } else {
365
366 String[] parts = line.split("\\s+");
367 if (parts.length != 4) {
368 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
369 " parts: " + rawLine + " in " + location);
370 } else {
371 try {
372 String pat = stripQuotes(parts[0]);
373 String lCon = stripQuotes(parts[1]);
374 String rCon = stripQuotes(parts[2]);
375 PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
376 final int cLine = currentLine;
377 Rule r = new Rule(pat, lCon, rCon, ph) {
378 private final int myLine = cLine;
379 private final String loc = location;
380
381 @Override
382 public String toString() {
383 final StringBuilder sb = new StringBuilder();
384 sb.append("Rule");
385 sb.append("{line=").append(myLine);
386 sb.append(", loc='").append(loc).append('\'');
387 sb.append('}');
388 return sb.toString();
389 }
390 };
391 lines.add(r);
392 } catch (IllegalArgumentException e) {
393 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
394 location, e);
395 }
396 }
397 }
398 }
399 }
400 }
401
402 return lines;
403 }
404
405
406
407
408
409
410
411
412 private static RPattern pattern(final String regex) {
413 boolean startsWith = regex.startsWith("^");
414 boolean endsWith = regex.endsWith("$");
415 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
416 boolean boxes = content.contains("[");
417
418 if (!boxes) {
419 if (startsWith && endsWith) {
420
421 if (content.length() == 0) {
422
423 return new RPattern() {
424 @Override
425 public boolean isMatch(CharSequence input) {
426 return input.length() == 0;
427 }
428 };
429 } else {
430 return new RPattern() {
431 @Override
432 public boolean isMatch(CharSequence input) {
433 return input.equals(content);
434 }
435 };
436 }
437 } else if ((startsWith || endsWith) && content.length() == 0) {
438
439 return ALL_STRINGS_RMATCHER;
440 } else if (startsWith) {
441
442 return new RPattern() {
443 @Override
444 public boolean isMatch(CharSequence input) {
445 return startsWith(input, content);
446 }
447 };
448 } else if (endsWith) {
449
450 return new RPattern() {
451 @Override
452 public boolean isMatch(CharSequence input) {
453 return endsWith(input, content);
454 }
455 };
456 }
457 } else {
458 boolean startsWithBox = content.startsWith("[");
459 boolean endsWithBox = content.endsWith("]");
460
461 if (startsWithBox && endsWithBox) {
462 String boxContent = content.substring(1, content.length() - 1);
463 if (!boxContent.contains("[")) {
464
465 boolean negate = boxContent.startsWith("^");
466 if (negate) {
467 boxContent = boxContent.substring(1);
468 }
469 final String bContent = boxContent;
470 final boolean shouldMatch = !negate;
471
472 if (startsWith && endsWith) {
473
474 return new RPattern() {
475 @Override
476 public boolean isMatch(CharSequence input) {
477 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
478 }
479 };
480 } else if (startsWith) {
481
482 return new RPattern() {
483 @Override
484 public boolean isMatch(CharSequence input) {
485 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
486 }
487 };
488 } else if (endsWith) {
489
490 return new RPattern() {
491 @Override
492 public boolean isMatch(CharSequence input) {
493 return input.length() > 0 &&
494 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
495 }
496 };
497 }
498 }
499 }
500 }
501
502 return new RPattern() {
503 Pattern pattern = Pattern.compile(regex);
504
505 @Override
506 public boolean isMatch(CharSequence input) {
507 Matcher matcher = pattern.matcher(input);
508 return matcher.find();
509 }
510 };
511 }
512
513 private static boolean startsWith(CharSequence input, CharSequence prefix) {
514 if (prefix.length() > input.length()) {
515 return false;
516 }
517 for (int i = 0; i < prefix.length(); i++) {
518 if (input.charAt(i) != prefix.charAt(i)) {
519 return false;
520 }
521 }
522 return true;
523 }
524
525 private static String stripQuotes(String str) {
526 if (str.startsWith(DOUBLE_QUOTE)) {
527 str = str.substring(1);
528 }
529
530 if (str.endsWith(DOUBLE_QUOTE)) {
531 str = str.substring(0, str.length() - 1);
532 }
533
534 return str;
535 }
536
537 private final RPattern lContext;
538
539 private final String pattern;
540
541 private final PhonemeExpr phoneme;
542
543 private final RPattern rContext;
544
545
546
547
548
549
550
551
552
553
554
555
556
557 public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) {
558 this.pattern = pattern;
559 this.lContext = pattern(lContext + "$");
560 this.rContext = pattern("^" + rContext);
561 this.phoneme = phoneme;
562 }
563
564
565
566
567
568
569 public RPattern getLContext() {
570 return this.lContext;
571 }
572
573
574
575
576
577
578 public String getPattern() {
579 return this.pattern;
580 }
581
582
583
584
585
586
587 public PhonemeExpr getPhoneme() {
588 return this.phoneme;
589 }
590
591
592
593
594
595
596 public RPattern getRContext() {
597 return this.rContext;
598 }
599
600
601
602
603
604
605
606
607
608
609
610
611 public boolean patternAndContextMatches(CharSequence input, int i) {
612 if (i < 0) {
613 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
614 }
615
616 int patternLength = this.pattern.length();
617 int ipl = i + patternLength;
618
619 if (ipl > input.length()) {
620
621 return false;
622 }
623
624
625
626 if (!input.subSequence(i, ipl).equals(this.pattern)) {
627 return false;
628 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
629 return false;
630 }
631 return this.lContext.isMatch(input.subSequence(0, i));
632 }
633 }