1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.EnumMap;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Scanner;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82 public class Rule {
83
84 public static final class Phoneme implements PhonemeExpr {
85 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
86 @Override
87 public int compare(final Phoneme o1, final Phoneme o2) {
88 for (int i = 0; i < o1.phonemeText.length(); i++) {
89 if (i >= o2.phonemeText.length()) {
90 return +1;
91 }
92 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
93 if (c != 0) {
94 return c;
95 }
96 }
97
98 if (o1.phonemeText.length() < o2.phonemeText.length()) {
99 return -1;
100 }
101
102 return 0;
103 }
104 };
105
106 private final StringBuilder phonemeText;
107 private final Languages.LanguageSet languages;
108
109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110 this.phonemeText = new StringBuilder(phonemeText);
111 this.languages = languages;
112 }
113
114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115 this(phonemeLeft.phonemeText, phonemeLeft.languages);
116 this.phonemeText.append(phonemeRight.phonemeText);
117 }
118
119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120 this(phonemeLeft.phonemeText, languages);
121 this.phonemeText.append(phonemeRight.phonemeText);
122 }
123
124 public Phoneme append(final CharSequence str) {
125 this.phonemeText.append(str);
126 return this;
127 }
128
129 public Languages.LanguageSet getLanguages() {
130 return this.languages;
131 }
132
133 @Override
134 public Iterable<Phoneme> getPhonemes() {
135 return Collections.singleton(this);
136 }
137
138 public CharSequence getPhonemeText() {
139 return this.phonemeText;
140 }
141
142
143
144
145
146
147
148
149 @Deprecated
150 public Phoneme join(final Phoneme right) {
151 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
152 this.languages.restrictTo(right.languages));
153 }
154
155
156
157
158
159
160
161
162 public Phoneme mergeWithLanguage(final LanguageSet lang) {
163 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
164 }
165
166 @Override
167 public String toString() {
168 return phonemeText.toString() + "[" + languages + "]";
169 }
170 }
171
172 public interface PhonemeExpr {
173 Iterable<Phoneme> getPhonemes();
174 }
175
176 public static final class PhonemeList implements PhonemeExpr {
177 private final List<Phoneme> phonemes;
178
179 public PhonemeList(final List<Phoneme> phonemes) {
180 this.phonemes = phonemes;
181 }
182
183 @Override
184 public List<Phoneme> getPhonemes() {
185 return this.phonemes;
186 }
187 }
188
189
190
191
/**
 * A minimal pattern abstraction: decides whether a character sequence matches.
 */
public interface RPattern {
    /**
     * @param input the text to test
     * @return {@code true} if the input matches this pattern
     */
    boolean isMatch(CharSequence input);
}
195
196 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
197 @Override
198 public boolean isMatch(final CharSequence input) {
199 return true;
200 }
201 };
202
203 public static final String ALL = "ALL";
204
205 private static final String DOUBLE_QUOTE = "\"";
206
207 private static final String HASH_INCLUDE = "#include";
208
209 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
210 new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
211
// Eagerly loads every rule resource for every NameType / RuleType / language combination
// into RULES. Each Scanner is closed as soon as its resource has been parsed, so the
// underlying class-path streams are not left open.
static {
    for (final NameType s : NameType.values()) {
        final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
                new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);

        for (final RuleType rt : RuleType.values()) {
            final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();

            final Languages ls = Languages.getInstance(s);
            for (final String l : ls.getLanguages()) {
                final Scanner scanner = createScanner(s, rt, l);
                try {
                    rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
                } catch (final IllegalStateException e) {
                    // Re-wrap so the failing top-level resource is named in the message.
                    throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
                } finally {
                    scanner.close();
                }
            }

            // Every rule type except RULES additionally has a "common" resource.
            if (!rt.equals(RuleType.RULES)) {
                final Scanner scanner = createScanner(s, rt, "common");
                try {
                    rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
                } finally {
                    scanner.close();
                }
            }

            rts.put(rt, Collections.unmodifiableMap(rs));
        }

        RULES.put(s, Collections.unmodifiableMap(rts));
    }
}
238
239 private static boolean contains(final CharSequence chars, final char input) {
240 for (int i = 0; i < chars.length(); i++) {
241 if (chars.charAt(i) == input) {
242 return true;
243 }
244 }
245 return false;
246 }
247
248 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
249 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
250 nameType.getName(), rt.getName(), lang);
251 }
252
253 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
254 final String resName = createResourceName(nameType, rt, lang);
255 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
256
257 if (rulesIS == null) {
258 throw new IllegalArgumentException("Unable to load resource: " + resName);
259 }
260
261 return new Scanner(rulesIS, ResourceConstants.ENCODING);
262 }
263
264 private static Scanner createScanner(final String lang) {
265 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
266 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
267
268 if (rulesIS == null) {
269 throw new IllegalArgumentException("Unable to load resource: " + resName);
270 }
271
272 return new Scanner(rulesIS, ResourceConstants.ENCODING);
273 }
274
275 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
276 if (suffix.length() > input.length()) {
277 return false;
278 }
279 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
280 if (input.charAt(i) != suffix.charAt(j)) {
281 return false;
282 }
283 }
284 return true;
285 }
286
287
288
289
290
291
292
293
294
295
296
297
298 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
299 final Languages.LanguageSet langs) {
300 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
301 final List<Rule> allRules = new ArrayList<Rule>();
302 for (final List<Rule> rules : ruleMap.values()) {
303 allRules.addAll(rules);
304 }
305 return allRules;
306 }
307
308
309
310
311
312
313
314
315
316
317
318
319 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
320 return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
321 }
322
323
324
325
326
327
328
329
330
331
332
333
334
335 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
336 final Languages.LanguageSet langs) {
337 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
338 getInstanceMap(nameType, rt, Languages.ANY);
339 }
340
341
342
343
344
345
346
347
348
349
350
351
352
353 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
354 final String lang) {
355 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
356
357 if (rules == null) {
358 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
359 nameType.getName(), rt.getName(), lang));
360 }
361
362 return rules;
363 }
364
365 private static Phoneme parsePhoneme(final String ph) {
366 final int open = ph.indexOf("[");
367 if (open >= 0) {
368 if (!ph.endsWith("]")) {
369 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
370 }
371 final String before = ph.substring(0, open);
372 final String in = ph.substring(open + 1, ph.length() - 1);
373 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
374
375 return new Phoneme(before, Languages.LanguageSet.from(langs));
376 } else {
377 return new Phoneme(ph, Languages.ANY_LANGUAGE);
378 }
379 }
380
381 private static PhonemeExpr parsePhonemeExpr(final String ph) {
382 if (ph.startsWith("(")) {
383 if (!ph.endsWith(")")) {
384 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
385 }
386
387 final List<Phoneme> phs = new ArrayList<Phoneme>();
388 final String body = ph.substring(1, ph.length() - 1);
389 for (final String part : body.split("[|]")) {
390 phs.add(parsePhoneme(part));
391 }
392 if (body.startsWith("|") || body.endsWith("|")) {
393 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
394 }
395
396 return new PhonemeList(phs);
397 } else {
398 return parsePhoneme(ph);
399 }
400 }
401
402 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
403 final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
404 int currentLine = 0;
405
406 boolean inMultilineComment = false;
407 while (scanner.hasNextLine()) {
408 currentLine++;
409 final String rawLine = scanner.nextLine();
410 String line = rawLine;
411
412 if (inMultilineComment) {
413 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
414 inMultilineComment = false;
415 }
416 } else {
417 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
418 inMultilineComment = true;
419 } else {
420
421 final int cmtI = line.indexOf(ResourceConstants.CMT);
422 if (cmtI >= 0) {
423 line = line.substring(0, cmtI);
424 }
425
426
427 line = line.trim();
428
429 if (line.length() == 0) {
430 continue;
431 }
432
433 if (line.startsWith(HASH_INCLUDE)) {
434
435 final String incl = line.substring(HASH_INCLUDE.length()).trim();
436 if (incl.contains(" ")) {
437 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
438 location);
439 } else {
440 lines.putAll(parseRules(createScanner(incl), location + "->" + incl));
441 }
442 } else {
443
444 final String[] parts = line.split("\\s+");
445 if (parts.length != 4) {
446 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
447 " parts: " + rawLine + " in " + location);
448 } else {
449 try {
450 final String pat = stripQuotes(parts[0]);
451 final String lCon = stripQuotes(parts[1]);
452 final String rCon = stripQuotes(parts[2]);
453 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
454 final int cLine = currentLine;
455 final Rule r = new Rule(pat, lCon, rCon, ph) {
456 private final int myLine = cLine;
457 private final String loc = location;
458
459 @Override
460 public String toString() {
461 final StringBuilder sb = new StringBuilder();
462 sb.append("Rule");
463 sb.append("{line=").append(myLine);
464 sb.append(", loc='").append(loc).append('\'');
465 sb.append(", pat='").append(pat).append('\'');
466 sb.append(", lcon='").append(lCon).append('\'');
467 sb.append(", rcon='").append(rCon).append('\'');
468 sb.append('}');
469 return sb.toString();
470 }
471 };
472 final String patternKey = r.pattern.substring(0,1);
473 List<Rule> rules = lines.get(patternKey);
474 if (rules == null) {
475 rules = new ArrayList<Rule>();
476 lines.put(patternKey, rules);
477 }
478 rules.add(r);
479 } catch (final IllegalArgumentException e) {
480 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
481 location, e);
482 }
483 }
484 }
485 }
486 }
487 }
488
489 return lines;
490 }
491
492
493
494
495
496
497
498
499 private static RPattern pattern(final String regex) {
500 final boolean startsWith = regex.startsWith("^");
501 final boolean endsWith = regex.endsWith("$");
502 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
503 final boolean boxes = content.contains("[");
504
505 if (!boxes) {
506 if (startsWith && endsWith) {
507
508 if (content.length() == 0) {
509
510 return new RPattern() {
511 @Override
512 public boolean isMatch(final CharSequence input) {
513 return input.length() == 0;
514 }
515 };
516 } else {
517 return new RPattern() {
518 @Override
519 public boolean isMatch(final CharSequence input) {
520 return input.equals(content);
521 }
522 };
523 }
524 } else if ((startsWith || endsWith) && content.length() == 0) {
525
526 return ALL_STRINGS_RMATCHER;
527 } else if (startsWith) {
528
529 return new RPattern() {
530 @Override
531 public boolean isMatch(final CharSequence input) {
532 return startsWith(input, content);
533 }
534 };
535 } else if (endsWith) {
536
537 return new RPattern() {
538 @Override
539 public boolean isMatch(final CharSequence input) {
540 return endsWith(input, content);
541 }
542 };
543 }
544 } else {
545 final boolean startsWithBox = content.startsWith("[");
546 final boolean endsWithBox = content.endsWith("]");
547
548 if (startsWithBox && endsWithBox) {
549 String boxContent = content.substring(1, content.length() - 1);
550 if (!boxContent.contains("[")) {
551
552 final boolean negate = boxContent.startsWith("^");
553 if (negate) {
554 boxContent = boxContent.substring(1);
555 }
556 final String bContent = boxContent;
557 final boolean shouldMatch = !negate;
558
559 if (startsWith && endsWith) {
560
561 return new RPattern() {
562 @Override
563 public boolean isMatch(final CharSequence input) {
564 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
565 }
566 };
567 } else if (startsWith) {
568
569 return new RPattern() {
570 @Override
571 public boolean isMatch(final CharSequence input) {
572 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
573 }
574 };
575 } else if (endsWith) {
576
577 return new RPattern() {
578 @Override
579 public boolean isMatch(final CharSequence input) {
580 return input.length() > 0 &&
581 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
582 }
583 };
584 }
585 }
586 }
587 }
588
589 return new RPattern() {
590 Pattern pattern = Pattern.compile(regex);
591
592 @Override
593 public boolean isMatch(final CharSequence input) {
594 final Matcher matcher = pattern.matcher(input);
595 return matcher.find();
596 }
597 };
598 }
599
600 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
601 if (prefix.length() > input.length()) {
602 return false;
603 }
604 for (int i = 0; i < prefix.length(); i++) {
605 if (input.charAt(i) != prefix.charAt(i)) {
606 return false;
607 }
608 }
609 return true;
610 }
611
612 private static String stripQuotes(String str) {
613 if (str.startsWith(DOUBLE_QUOTE)) {
614 str = str.substring(1);
615 }
616
617 if (str.endsWith(DOUBLE_QUOTE)) {
618 str = str.substring(0, str.length() - 1);
619 }
620
621 return str;
622 }
623
624 private final RPattern lContext;
625
626 private final String pattern;
627
628 private final PhonemeExpr phoneme;
629
630 private final RPattern rContext;
631
632
633
634
635
636
637
638
639
640
641
642
643
644 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
645 this.pattern = pattern;
646 this.lContext = pattern(lContext + "$");
647 this.rContext = pattern("^" + rContext);
648 this.phoneme = phoneme;
649 }
650
651
652
653
654
655
656 public RPattern getLContext() {
657 return this.lContext;
658 }
659
660
661
662
663
664
665 public String getPattern() {
666 return this.pattern;
667 }
668
669
670
671
672
673
674 public PhonemeExpr getPhoneme() {
675 return this.phoneme;
676 }
677
678
679
680
681
682
683 public RPattern getRContext() {
684 return this.rContext;
685 }
686
687
688
689
690
691
692
693
694
695
696
697
698 public boolean patternAndContextMatches(final CharSequence input, final int i) {
699 if (i < 0) {
700 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
701 }
702
703 final int patternLength = this.pattern.length();
704 final int ipl = i + patternLength;
705
706 if (ipl > input.length()) {
707
708 return false;
709 }
710
711
712
713 if (!input.subSequence(i, ipl).equals(this.pattern)) {
714 return false;
715 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
716 return false;
717 }
718 return this.lContext.isMatch(input.subSequence(0, i));
719 }
720 }