1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.EnumMap;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Scanner;
30 import java.util.Set;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33
34 import org.apache.commons.codec.Resources;
35 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81 public class Rule {
82
83
84
85
86 public static final class Phoneme implements PhonemeExpr {
87
88
89
90
91 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
92 final int o1Length = o1.phonemeText.length();
93 final int o2Length = o2.phonemeText.length();
94 for (int i = 0; i < o1Length; i++) {
95 if (i >= o2Length) {
96 return +1;
97 }
98 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
99 if (c != 0) {
100 return c;
101 }
102 }
103 if (o1Length < o2Length) {
104 return -1;
105 }
106 return 0;
107 };
108 private final StringBuilder phonemeText;
109 private final Languages.LanguageSet languages;
110
111
112
113
114
115
116
117 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
118 this.phonemeText = new StringBuilder(phonemeText);
119 this.languages = languages;
120 }
121
122
123
124
125
126
127
128 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
129 this(phonemeLeft.phonemeText, phonemeLeft.languages);
130 this.phonemeText.append(phonemeRight.phonemeText);
131 }
132
133
134
135
136
137
138
139
140 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
141 this(phonemeLeft.phonemeText, languages);
142 this.phonemeText.append(phonemeRight.phonemeText);
143 }
144
145
146
147
148
149
150
151 public Phoneme append(final CharSequence sequence) {
152 this.phonemeText.append(sequence);
153 return this;
154 }
155
156
157
158
159
160
161 public Languages.LanguageSet getLanguages() {
162 return this.languages;
163 }
164
165 @Override
166 public Iterable<Phoneme> getPhonemes() {
167 return Collections.singleton(this);
168 }
169
170
171
172
173
174
175 public CharSequence getPhonemeText() {
176 return this.phonemeText;
177 }
178
179
180
181
182
183
184
185
186 @Deprecated
187 public Phoneme join(final Phoneme right) {
188 return new Phoneme(phonemeText.toString() + right.phonemeText.toString(), languages.restrictTo(right.languages));
189 }
190
191
192
193
194
195
196
197 public Phoneme mergeWithLanguage(final LanguageSet lang) {
198 return new Phoneme(phonemeText.toString(), languages.merge(lang));
199 }
200
201 @Override
202 public int size() {
203 return 1;
204 }
205
206 @Override
207 public String toString() {
208 return phonemeText.toString() + "[" + languages + "]";
209 }
210 }
211
212
213
214
215 public interface PhonemeExpr {
216
217
218
219
220
221
222 Iterable<Phoneme> getPhonemes();
223
224
225
226
227
228
229
230 default int size() {
231
232 return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
233 }
234 }
235
236
237
238
239 public static final class PhonemeList implements PhonemeExpr {
240
241 private final List<Phoneme> phonemeList;
242
243
244
245
246
247
248 public PhonemeList(final List<Phoneme> phonemes) {
249 this.phonemeList = phonemes;
250 }
251
252 @Override
253 public List<Phoneme> getPhonemes() {
254 return phonemeList;
255 }
256
257 @Override
258 public int size() {
259 return phonemeList.size();
260 }
261 }
262
263
264
265
/**
 * A minimal pattern abstraction: decides whether a character sequence matches.
 * <p>
 * This interface has exactly one abstract method and is implemented with lambdas throughout {@code Rule}, so it is
 * marked as a functional interface to let the compiler enforce that shape.
 * </p>
 */
@FunctionalInterface
public interface RPattern {

    /**
     * Tests whether the input matches this pattern.
     *
     * @param input the character sequence to test.
     * @return {@code true} if the input matches, {@code false} otherwise.
     */
    boolean isMatch(CharSequence input);
}
276
277 private static final String PIPE = "|";
278
279
280
281
282 public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
283
284
285
286
287
288
289 @Deprecated
290 public static final String ALL = "ALL";
291
292 private static final String DOUBLE_QUOTE = "\"";
293 private static final String HASH_INCLUDE = "#include";
294 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
295 private static final Pattern AROUND_PLUS = Pattern.compile("[+]");
296 private static final Pattern AROUND_PIPE = Pattern.compile("[|]");
297 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = new EnumMap<>(NameType.class);
298
299
300
301
// Loads every rule resource once, at class-initialization time: one resource per (name type, rule type, language),
// plus a "common" resource for every rule type other than RuleType.RULES.
static {
    for (final NameType nameType : NameType.values()) {
        final Map<RuleType, Map<String, Map<String, List<Rule>>>> byRuleType = new EnumMap<>(RuleType.class);
        for (final RuleType ruleType : RuleType.values()) {
            final Map<String, Map<String, List<Rule>>> byLanguage = new HashMap<>();
            final Languages languages = Languages.getInstance(nameType);
            for (final String language : languages.getLanguages()) {
                final String resourceName = createResourceName(nameType, ruleType, language);
                try (Scanner scanner = createScanner(nameType, ruleType, language)) {
                    byLanguage.put(language, parseRules(scanner, resourceName));
                } catch (final IllegalStateException e) {
                    // Re-throw with the resource name so a broken rule file can be identified.
                    throw new IllegalStateException("Problem processing " + resourceName, e);
                }
            }
            if (ruleType != RuleType.RULES) {
                try (Scanner scanner = createScanner(nameType, ruleType, "common")) {
                    byLanguage.put("common", parseRules(scanner, createResourceName(nameType, ruleType, "common")));
                }
            }
            byRuleType.put(ruleType, Collections.unmodifiableMap(byLanguage));
        }
        RULES.put(nameType, Collections.unmodifiableMap(byRuleType));
    }
}
325
326 private static boolean contains(final CharSequence chars, final char input) {
327 return chars.chars().anyMatch(c -> c == input);
328 }
329
330 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
331 return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt", nameType.getName(), rt.getName(), lang);
332 }
333
334 @SuppressWarnings("resource")
335 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
336 final String resName = createResourceName(nameType, rt, lang);
337 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
338 }
339
340 @SuppressWarnings("resource")
341 private static Scanner createScanner(final String lang) {
342 final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang);
343 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
344 }
345
346 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
347 final int suffixLength = suffix.length();
348 final int inputLength = input.length();
349 if (suffixLength > inputLength) {
350 return false;
351 }
352 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
353 if (input.charAt(i) != suffix.charAt(j)) {
354 return false;
355 }
356 }
357 return true;
358 }
359
360
361
362
363
364
365
366
367
368 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) {
369 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
370 final List<Rule> allRules = new ArrayList<>();
371 ruleMap.values().forEach(rules -> allRules.addAll(rules));
372 return allRules;
373 }
374
375
376
377
378
379
380
381
382
383 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
384 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
385 }
386
387
388
389
390
391
392
393
394
395
396 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) {
397 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : getInstanceMap(nameType, rt, Languages.ANY);
398 }
399
400
401
402
403
404
405
406
407
408
409 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final String lang) {
410 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
411 if (rules == null) {
412 throw new IllegalArgumentException(String.format("No rules found for %s, %s, '%s'.", nameType.getName(), rt.getName(), lang));
413 }
414 return rules;
415 }
416
417 private static Phoneme parsePhoneme(final String ph) {
418 final int open = ph.indexOf("[");
419 if (open >= 0) {
420 if (!ph.endsWith("]")) {
421 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
422 }
423 final String before = ph.substring(0, open);
424 final String in = ph.substring(open + 1, ph.length() - 1);
425 final Set<String> langs = new HashSet<>(Arrays.asList(AROUND_PLUS.split(in)));
426 return new Phoneme(before, Languages.LanguageSet.from(langs));
427 }
428 return new Phoneme(ph, Languages.ANY_LANGUAGE);
429 }
430
431
432
433
434 static PhonemeExpr parsePhonemeExpr(final String ph) {
435 if (ph.startsWith("(")) {
436
437 if (!ph.endsWith(")")) {
438 throw new IllegalArgumentException("Phoneme starting with '(' must end with ')'");
439 }
440 final List<Phoneme> phs = new ArrayList<>();
441 final String body = ph.substring(1, ph.length() - 1);
442 final String[] split = AROUND_PIPE.split(body);
443 for (final String part : split) {
444 phs.add(parsePhoneme(part));
445 }
446 if (split.length > 1 && split[0].length() != 0 && body.startsWith(PIPE) || split[split.length - 1].length() != 0 && body.endsWith(PIPE)) {
447 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
448 }
449 return new PhonemeList(phs);
450 }
451 return parsePhoneme(ph);
452 }
453
454 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
455 final Map<String, List<Rule>> lines = new HashMap<>();
456 int currentLine = 0;
457 boolean inMultilineComment = false;
458 while (scanner.hasNextLine()) {
459 currentLine++;
460 final String rawLine = scanner.nextLine();
461 String line = rawLine;
462 if (inMultilineComment) {
463 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
464 inMultilineComment = false;
465 }
466 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
467 inMultilineComment = true;
468 } else {
469
470 final int cmtI = line.indexOf(ResourceConstants.CMT);
471 if (cmtI >= 0) {
472 line = line.substring(0, cmtI);
473 }
474
475 line = line.trim();
476 if (line.isEmpty()) {
477 continue;
478 }
479 if (line.startsWith(HASH_INCLUDE)) {
480
481 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
482 if (incl.contains(" ")) {
483 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + location);
484 }
485 try (Scanner hashIncludeScanner = createScanner(incl)) {
486 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
487 }
488 } else {
489
490 final String[] parts = ResourceConstants.SPACES.split(line);
491 if (parts.length != 4) {
492 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
493 }
494 try {
495 final String pat = stripQuotes(parts[0]);
496 final String lCon = stripQuotes(parts[1]);
497 final String rCon = stripQuotes(parts[2]);
498 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
499 final int cLine = currentLine;
500 final Rule r = new Rule(pat, lCon, rCon, ph) {
501
502 private final int myLine = cLine;
503 private final String loc = location;
504
505 @Override
506 public String toString() {
507 final StringBuilder sb = new StringBuilder();
508 sb.append("Rule");
509 sb.append("{line=").append(myLine);
510 sb.append(", loc='").append(loc).append('\'');
511 sb.append(", pat='").append(pat).append('\'');
512 sb.append(", lcon='").append(lCon).append('\'');
513 sb.append(", rcon='").append(rCon).append('\'');
514 sb.append('}');
515 return sb.toString();
516 }
517 };
518 final String patternKey = r.pattern.substring(0, 1);
519 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
520 rules.add(r);
521 } catch (final IllegalArgumentException e) {
522 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
523 }
524 }
525 }
526 }
527 return lines;
528 }
529
530
531
532
533
534
535
536 private static RPattern pattern(final String regex) {
537 final boolean startsWith = regex.startsWith("^");
538 final boolean endsWith = regex.endsWith("$");
539 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
540 final boolean boxes = content.contains("[");
541 if (!boxes) {
542 if (startsWith && endsWith) {
543
544 if (content.isEmpty()) {
545
546 return input -> input.length() == 0;
547 }
548 return input -> input.equals(content);
549 }
550 if ((startsWith || endsWith) && content.isEmpty()) {
551
552 return ALL_STRINGS_RMATCHER;
553 }
554 if (startsWith) {
555
556 return input -> startsWith(input, content);
557 }
558 if (endsWith) {
559
560 return input -> endsWith(input, content);
561 }
562 } else {
563 final boolean startsWithBox = content.startsWith("[");
564 final boolean endsWithBox = content.endsWith("]");
565 if (startsWithBox && endsWithBox) {
566 String boxContent = content.substring(1, content.length() - 1);
567 if (!boxContent.contains("[")) {
568
569 final boolean negate = boxContent.startsWith("^");
570 if (negate) {
571 boxContent = boxContent.substring(1);
572 }
573 final String bContent = boxContent;
574 final boolean shouldMatch = !negate;
575 if (startsWith && endsWith) {
576
577 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
578 }
579 if (startsWith) {
580
581 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
582 }
583 if (endsWith) {
584
585 return input -> input.length() > 0 && contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
586 }
587 }
588 }
589 }
590 return new RPattern() {
591
592 final Pattern pattern = Pattern.compile(regex);
593
594 @Override
595 public boolean isMatch(final CharSequence input) {
596 final Matcher matcher = pattern.matcher(input);
597 return matcher.find();
598 }
599 };
600 }
601
602 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
603 if (prefix.length() > input.length()) {
604 return false;
605 }
606 for (int i = 0; i < prefix.length(); i++) {
607 if (input.charAt(i) != prefix.charAt(i)) {
608 return false;
609 }
610 }
611 return true;
612 }
613
614 private static String stripQuotes(String str) {
615 if (str.startsWith(DOUBLE_QUOTE)) {
616 str = str.substring(1);
617 }
618 if (str.endsWith(DOUBLE_QUOTE)) {
619 str = str.substring(0, str.length() - 1);
620 }
621 return str;
622 }
623
624 private final RPattern lContext;
625 private final String pattern;
626 private final PhonemeExpr phoneme;
627 private final RPattern rContext;
628
629
630
631
632
633
634
635
636
637 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
638 this.pattern = pattern;
639 this.lContext = pattern(lContext + "$");
640 this.rContext = pattern("^" + rContext);
641 this.phoneme = phoneme;
642 }
643
644
645
646
647
648
649 public RPattern getLContext() {
650 return lContext;
651 }
652
653
654
655
656
657
658 public String getPattern() {
659 return pattern;
660 }
661
662
663
664
665
666
667 public PhonemeExpr getPhoneme() {
668 return phoneme;
669 }
670
671
672
673
674
675
676 public RPattern getRContext() {
677 return rContext;
678 }
679
680
681
682
683
684
685
686
687
688 public boolean patternAndContextMatches(final CharSequence input, final int i) {
689 if (i < 0) {
690 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
691 }
692 final int patternLength = pattern.length();
693 final int ipl = i + patternLength;
694 if (ipl > input.length()) {
695
696 return false;
697 }
698
699
700 if (!input.subSequence(i, ipl).equals(pattern)) {
701 return false;
702 }
703 if (!rContext.isMatch(input.subSequence(ipl, input.length()))) {
704 return false;
705 }
706 return lContext.isMatch(input.subSequence(0, i));
707 }
708 }