1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.EnumMap;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Scanner;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82 public class Rule {
83
84 public static final class Phoneme implements PhonemeExpr {
85 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
86 @Override
87 public int compare(final Phoneme o1, final Phoneme o2) {
88 for (int i = 0; i < o1.phonemeText.length(); i++) {
89 if (i >= o2.phonemeText.length()) {
90 return +1;
91 }
92 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
93 if (c != 0) {
94 return c;
95 }
96 }
97
98 if (o1.phonemeText.length() < o2.phonemeText.length()) {
99 return -1;
100 }
101
102 return 0;
103 }
104 };
105
106 private final StringBuilder phonemeText;
107 private final Languages.LanguageSet languages;
108
109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110 this.phonemeText = new StringBuilder(phonemeText);
111 this.languages = languages;
112 }
113
114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115 this(phonemeLeft.phonemeText, phonemeLeft.languages);
116 this.phonemeText.append(phonemeRight.phonemeText);
117 }
118
119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120 this(phonemeLeft.phonemeText, languages);
121 this.phonemeText.append(phonemeRight.phonemeText);
122 }
123
124 public Phoneme append(final CharSequence str) {
125 this.phonemeText.append(str);
126 return this;
127 }
128
129 public Languages.LanguageSet getLanguages() {
130 return this.languages;
131 }
132
133 @Override
134 public Iterable<Phoneme> getPhonemes() {
135 return Collections.singleton(this);
136 }
137
138 public CharSequence getPhonemeText() {
139 return this.phonemeText;
140 }
141
142
143
144
145
146
147
148
149 @Deprecated
150 public Phoneme join(final Phoneme right) {
151 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
152 this.languages.restrictTo(right.languages));
153 }
154
155
156
157
158
159
160
161
162 public Phoneme mergeWithLanguage(final LanguageSet lang) {
163 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
164 }
165
166 @Override
167 public String toString() {
168 return phonemeText.toString() + "[" + languages + "]";
169 }
170 }
171
172 public interface PhonemeExpr {
173 Iterable<Phoneme> getPhonemes();
174 }
175
176 public static final class PhonemeList implements PhonemeExpr {
177 private final List<Phoneme> phonemes;
178
179 public PhonemeList(final List<Phoneme> phonemes) {
180 this.phonemes = phonemes;
181 }
182
183 @Override
184 public List<Phoneme> getPhonemes() {
185 return this.phonemes;
186 }
187 }
188
189
190
191
192 public interface RPattern {
193 boolean isMatch(CharSequence input);
194 }
195
196 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
197 @Override
198 public boolean isMatch(final CharSequence input) {
199 return true;
200 }
201 };
202
203 public static final String ALL = "ALL";
204
205 private static final String DOUBLE_QUOTE = "\"";
206
207 private static final String HASH_INCLUDE = "#include";
208
209 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
210 new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
211
212 static {
213 for (final NameType s : NameType.values()) {
214 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
215 new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);
216
217 for (final RuleType rt : RuleType.values()) {
218 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();
219
220 final Languages ls = Languages.getInstance(s);
221 for (final String l : ls.getLanguages()) {
222 final Scanner scanner = createScanner(s, rt, l);
223 try {
224 rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
225 } catch (final IllegalStateException e) {
226 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
227 } finally {
228 scanner.close();
229 }
230 }
231 if (!rt.equals(RuleType.RULES)) {
232 final Scanner scanner = createScanner(s, rt, "common");
233 try {
234 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
235 } finally {
236 scanner.close();
237 }
238 }
239
240 rts.put(rt, Collections.unmodifiableMap(rs));
241 }
242
243 RULES.put(s, Collections.unmodifiableMap(rts));
244 }
245 }
246
247 private static boolean contains(final CharSequence chars, final char input) {
248 for (int i = 0; i < chars.length(); i++) {
249 if (chars.charAt(i) == input) {
250 return true;
251 }
252 }
253 return false;
254 }
255
256 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
257 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
258 nameType.getName(), rt.getName(), lang);
259 }
260
261 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
262 final String resName = createResourceName(nameType, rt, lang);
263 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
264
265 if (rulesIS == null) {
266 throw new IllegalArgumentException("Unable to load resource: " + resName);
267 }
268
269 return new Scanner(rulesIS, ResourceConstants.ENCODING);
270 }
271
272 private static Scanner createScanner(final String lang) {
273 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
274 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
275
276 if (rulesIS == null) {
277 throw new IllegalArgumentException("Unable to load resource: " + resName);
278 }
279
280 return new Scanner(rulesIS, ResourceConstants.ENCODING);
281 }
282
283 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
284 if (suffix.length() > input.length()) {
285 return false;
286 }
287 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
288 if (input.charAt(i) != suffix.charAt(j)) {
289 return false;
290 }
291 }
292 return true;
293 }
294
295
296
297
298
299
300
301
302
303
304
305
306 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
307 final Languages.LanguageSet langs) {
308 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
309 final List<Rule> allRules = new ArrayList<Rule>();
310 for (final List<Rule> rules : ruleMap.values()) {
311 allRules.addAll(rules);
312 }
313 return allRules;
314 }
315
316
317
318
319
320
321
322
323
324
325
326
327 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
328 return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
329 }
330
331
332
333
334
335
336
337
338
339
340
341
342
343 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
344 final Languages.LanguageSet langs) {
345 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
346 getInstanceMap(nameType, rt, Languages.ANY);
347 }
348
349
350
351
352
353
354
355
356
357
358
359
360
361 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
362 final String lang) {
363 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
364
365 if (rules == null) {
366 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
367 nameType.getName(), rt.getName(), lang));
368 }
369
370 return rules;
371 }
372
373 private static Phoneme parsePhoneme(final String ph) {
374 final int open = ph.indexOf("[");
375 if (open >= 0) {
376 if (!ph.endsWith("]")) {
377 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
378 }
379 final String before = ph.substring(0, open);
380 final String in = ph.substring(open + 1, ph.length() - 1);
381 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
382
383 return new Phoneme(before, Languages.LanguageSet.from(langs));
384 }
385 return new Phoneme(ph, Languages.ANY_LANGUAGE);
386 }
387
388 private static PhonemeExpr parsePhonemeExpr(final String ph) {
389 if (ph.startsWith("(")) {
390 if (!ph.endsWith(")")) {
391 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
392 }
393
394 final List<Phoneme> phs = new ArrayList<Phoneme>();
395 final String body = ph.substring(1, ph.length() - 1);
396 for (final String part : body.split("[|]")) {
397 phs.add(parsePhoneme(part));
398 }
399 if (body.startsWith("|") || body.endsWith("|")) {
400 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
401 }
402
403 return new PhonemeList(phs);
404 }
405 return parsePhoneme(ph);
406 }
407
408 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
409 final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
410 int currentLine = 0;
411
412 boolean inMultilineComment = false;
413 while (scanner.hasNextLine()) {
414 currentLine++;
415 final String rawLine = scanner.nextLine();
416 String line = rawLine;
417
418 if (inMultilineComment) {
419 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
420 inMultilineComment = false;
421 }
422 } else {
423 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
424 inMultilineComment = true;
425 } else {
426
427 final int cmtI = line.indexOf(ResourceConstants.CMT);
428 if (cmtI >= 0) {
429 line = line.substring(0, cmtI);
430 }
431
432
433 line = line.trim();
434
435 if (line.length() == 0) {
436 continue;
437 }
438
439 if (line.startsWith(HASH_INCLUDE)) {
440
441 final String incl = line.substring(HASH_INCLUDE.length()).trim();
442 if (incl.contains(" ")) {
443 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
444 location);
445 }
446 final Scanner hashIncludeScanner = createScanner(incl);
447 try {
448 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
449 } finally {
450 hashIncludeScanner.close();
451 }
452 } else {
453
454 final String[] parts = line.split("\\s+");
455 if (parts.length != 4) {
456 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
457 " parts: " + rawLine + " in " + location);
458 }
459 try {
460 final String pat = stripQuotes(parts[0]);
461 final String lCon = stripQuotes(parts[1]);
462 final String rCon = stripQuotes(parts[2]);
463 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
464 final int cLine = currentLine;
465 final Rule r = new Rule(pat, lCon, rCon, ph) {
466 private final int myLine = cLine;
467 private final String loc = location;
468
469 @Override
470 public String toString() {
471 final StringBuilder sb = new StringBuilder();
472 sb.append("Rule");
473 sb.append("{line=").append(myLine);
474 sb.append(", loc='").append(loc).append('\'');
475 sb.append(", pat='").append(pat).append('\'');
476 sb.append(", lcon='").append(lCon).append('\'');
477 sb.append(", rcon='").append(rCon).append('\'');
478 sb.append('}');
479 return sb.toString();
480 }
481 };
482 final String patternKey = r.pattern.substring(0,1);
483 List<Rule> rules = lines.get(patternKey);
484 if (rules == null) {
485 rules = new ArrayList<Rule>();
486 lines.put(patternKey, rules);
487 }
488 rules.add(r);
489 } catch (final IllegalArgumentException e) {
490 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
491 location, e);
492 }
493 }
494 }
495 }
496 }
497
498 return lines;
499 }
500
501
502
503
504
505
506
507
508 private static RPattern pattern(final String regex) {
509 final boolean startsWith = regex.startsWith("^");
510 final boolean endsWith = regex.endsWith("$");
511 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
512 final boolean boxes = content.contains("[");
513
514 if (!boxes) {
515 if (startsWith && endsWith) {
516
517 if (content.length() == 0) {
518
519 return new RPattern() {
520 @Override
521 public boolean isMatch(final CharSequence input) {
522 return input.length() == 0;
523 }
524 };
525 }
526 return new RPattern() {
527 @Override
528 public boolean isMatch(final CharSequence input) {
529 return input.equals(content);
530 }
531 };
532 } else if ((startsWith || endsWith) && content.length() == 0) {
533
534 return ALL_STRINGS_RMATCHER;
535 } else if (startsWith) {
536
537 return new RPattern() {
538 @Override
539 public boolean isMatch(final CharSequence input) {
540 return startsWith(input, content);
541 }
542 };
543 } else if (endsWith) {
544
545 return new RPattern() {
546 @Override
547 public boolean isMatch(final CharSequence input) {
548 return endsWith(input, content);
549 }
550 };
551 }
552 } else {
553 final boolean startsWithBox = content.startsWith("[");
554 final boolean endsWithBox = content.endsWith("]");
555
556 if (startsWithBox && endsWithBox) {
557 String boxContent = content.substring(1, content.length() - 1);
558 if (!boxContent.contains("[")) {
559
560 final boolean negate = boxContent.startsWith("^");
561 if (negate) {
562 boxContent = boxContent.substring(1);
563 }
564 final String bContent = boxContent;
565 final boolean shouldMatch = !negate;
566
567 if (startsWith && endsWith) {
568
569 return new RPattern() {
570 @Override
571 public boolean isMatch(final CharSequence input) {
572 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
573 }
574 };
575 } else if (startsWith) {
576
577 return new RPattern() {
578 @Override
579 public boolean isMatch(final CharSequence input) {
580 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
581 }
582 };
583 } else if (endsWith) {
584
585 return new RPattern() {
586 @Override
587 public boolean isMatch(final CharSequence input) {
588 return input.length() > 0 &&
589 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
590 }
591 };
592 }
593 }
594 }
595 }
596
597 return new RPattern() {
598 Pattern pattern = Pattern.compile(regex);
599
600 @Override
601 public boolean isMatch(final CharSequence input) {
602 final Matcher matcher = pattern.matcher(input);
603 return matcher.find();
604 }
605 };
606 }
607
608 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
609 if (prefix.length() > input.length()) {
610 return false;
611 }
612 for (int i = 0; i < prefix.length(); i++) {
613 if (input.charAt(i) != prefix.charAt(i)) {
614 return false;
615 }
616 }
617 return true;
618 }
619
620 private static String stripQuotes(String str) {
621 if (str.startsWith(DOUBLE_QUOTE)) {
622 str = str.substring(1);
623 }
624
625 if (str.endsWith(DOUBLE_QUOTE)) {
626 str = str.substring(0, str.length() - 1);
627 }
628
629 return str;
630 }
631
632 private final RPattern lContext;
633
634 private final String pattern;
635
636 private final PhonemeExpr phoneme;
637
638 private final RPattern rContext;
639
640
641
642
643
644
645
646
647
648
649
650
651
652 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
653 this.pattern = pattern;
654 this.lContext = pattern(lContext + "$");
655 this.rContext = pattern("^" + rContext);
656 this.phoneme = phoneme;
657 }
658
659
660
661
662
663
664 public RPattern getLContext() {
665 return this.lContext;
666 }
667
668
669
670
671
672
673 public String getPattern() {
674 return this.pattern;
675 }
676
677
678
679
680
681
682 public PhonemeExpr getPhoneme() {
683 return this.phoneme;
684 }
685
686
687
688
689
690
691 public RPattern getRContext() {
692 return this.rContext;
693 }
694
695
696
697
698
699
700
701
702
703
704
705
706 public boolean patternAndContextMatches(final CharSequence input, final int i) {
707 if (i < 0) {
708 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
709 }
710
711 final int patternLength = this.pattern.length();
712 final int ipl = i + patternLength;
713
714 if (ipl > input.length()) {
715
716 return false;
717 }
718
719
720
721 if (!input.subSequence(i, ipl).equals(this.pattern)) {
722 return false;
723 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
724 return false;
725 }
726 return this.lContext.isMatch(input.subSequence(0, i));
727 }
728 }