1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.EnumMap;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Scanner;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82 public class Rule {
83
84 public static final class Phoneme implements PhonemeExpr {
85 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
86 @Override
87 public int compare(final Phoneme o1, final Phoneme o2) {
88 for (int i = 0; i < o1.phonemeText.length(); i++) {
89 if (i >= o2.phonemeText.length()) {
90 return +1;
91 }
92 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
93 if (c != 0) {
94 return c;
95 }
96 }
97
98 if (o1.phonemeText.length() < o2.phonemeText.length()) {
99 return -1;
100 }
101
102 return 0;
103 }
104 };
105
106 private final StringBuilder phonemeText;
107 private final Languages.LanguageSet languages;
108
109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110 this.phonemeText = new StringBuilder(phonemeText);
111 this.languages = languages;
112 }
113
114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115 this(phonemeLeft.phonemeText, phonemeLeft.languages);
116 this.phonemeText.append(phonemeRight.phonemeText);
117 }
118
119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120 this(phonemeLeft.phonemeText, languages);
121 this.phonemeText.append(phonemeRight.phonemeText);
122 }
123
124 public Phoneme append(final CharSequence str) {
125 this.phonemeText.append(str);
126 return this;
127 }
128
129 public Languages.LanguageSet getLanguages() {
130 return this.languages;
131 }
132
133 @Override
134 public Iterable<Phoneme> getPhonemes() {
135 return Collections.singleton(this);
136 }
137
138 public CharSequence getPhonemeText() {
139 return this.phonemeText;
140 }
141
142
143
144
145 @Deprecated
146 public Phoneme join(final Phoneme right) {
147 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
148 this.languages.restrictTo(right.languages));
149 }
150 }
151
152 public interface PhonemeExpr {
153 Iterable<Phoneme> getPhonemes();
154 }
155
156 public static final class PhonemeList implements PhonemeExpr {
157 private final List<Phoneme> phonemes;
158
159 public PhonemeList(final List<Phoneme> phonemes) {
160 this.phonemes = phonemes;
161 }
162
163 @Override
164 public List<Phoneme> getPhonemes() {
165 return this.phonemes;
166 }
167 }
168
169
170
171
172 public interface RPattern {
173 boolean isMatch(CharSequence input);
174 }
175
176 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
177 @Override
178 public boolean isMatch(final CharSequence input) {
179 return true;
180 }
181 };
182
183 public static final String ALL = "ALL";
184
185 private static final String DOUBLE_QUOTE = "\"";
186
187 private static final String HASH_INCLUDE = "#include";
188
189 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
190 new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
191
192 static {
193 for (final NameType s : NameType.values()) {
194 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
195 new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);
196
197 for (final RuleType rt : RuleType.values()) {
198 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();
199
200 final Languages ls = Languages.getInstance(s);
201 for (final String l : ls.getLanguages()) {
202 try {
203 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
204 } catch (final IllegalStateException e) {
205 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
206 }
207 }
208 if (!rt.equals(RuleType.RULES)) {
209 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
210 }
211
212 rts.put(rt, Collections.unmodifiableMap(rs));
213 }
214
215 RULES.put(s, Collections.unmodifiableMap(rts));
216 }
217 }
218
219 private static boolean contains(final CharSequence chars, final char input) {
220 for (int i = 0; i < chars.length(); i++) {
221 if (chars.charAt(i) == input) {
222 return true;
223 }
224 }
225 return false;
226 }
227
228 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
229 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
230 nameType.getName(), rt.getName(), lang);
231 }
232
233 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
234 final String resName = createResourceName(nameType, rt, lang);
235 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
236
237 if (rulesIS == null) {
238 throw new IllegalArgumentException("Unable to load resource: " + resName);
239 }
240
241 return new Scanner(rulesIS, ResourceConstants.ENCODING);
242 }
243
244 private static Scanner createScanner(final String lang) {
245 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
246 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
247
248 if (rulesIS == null) {
249 throw new IllegalArgumentException("Unable to load resource: " + resName);
250 }
251
252 return new Scanner(rulesIS, ResourceConstants.ENCODING);
253 }
254
255 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
256 if (suffix.length() > input.length()) {
257 return false;
258 }
259 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
260 if (input.charAt(i) != suffix.charAt(j)) {
261 return false;
262 }
263 }
264 return true;
265 }
266
267
268
269
270
271
272
273
274
275
276
277
278 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
279 final Languages.LanguageSet langs) {
280 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
281 final List<Rule> allRules = new ArrayList<Rule>();
282 for (final List<Rule> rules : ruleMap.values()) {
283 allRules.addAll(rules);
284 }
285 return allRules;
286 }
287
288
289
290
291
292
293
294
295
296
297
298
299 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
300 return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
301 }
302
303
304
305
306
307
308
309
310
311
312
313
314
315 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
316 final Languages.LanguageSet langs) {
317 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
318 getInstanceMap(nameType, rt, Languages.ANY);
319 }
320
321
322
323
324
325
326
327
328
329
330
331
332
333 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
334 final String lang) {
335 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
336
337 if (rules == null) {
338 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
339 nameType.getName(), rt.getName(), lang));
340 }
341
342 return rules;
343 }
344
345 private static Phoneme parsePhoneme(final String ph) {
346 final int open = ph.indexOf("[");
347 if (open >= 0) {
348 if (!ph.endsWith("]")) {
349 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
350 }
351 final String before = ph.substring(0, open);
352 final String in = ph.substring(open + 1, ph.length() - 1);
353 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
354
355 return new Phoneme(before, Languages.LanguageSet.from(langs));
356 } else {
357 return new Phoneme(ph, Languages.ANY_LANGUAGE);
358 }
359 }
360
361 private static PhonemeExpr parsePhonemeExpr(final String ph) {
362 if (ph.startsWith("(")) {
363 if (!ph.endsWith(")")) {
364 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
365 }
366
367 final List<Phoneme> phs = new ArrayList<Phoneme>();
368 final String body = ph.substring(1, ph.length() - 1);
369 for (final String part : body.split("[|]")) {
370 phs.add(parsePhoneme(part));
371 }
372 if (body.startsWith("|") || body.endsWith("|")) {
373 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
374 }
375
376 return new PhonemeList(phs);
377 } else {
378 return parsePhoneme(ph);
379 }
380 }
381
382 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
383 final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
384 int currentLine = 0;
385
386 boolean inMultilineComment = false;
387 while (scanner.hasNextLine()) {
388 currentLine++;
389 final String rawLine = scanner.nextLine();
390 String line = rawLine;
391
392 if (inMultilineComment) {
393 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
394 inMultilineComment = false;
395 }
396 } else {
397 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
398 inMultilineComment = true;
399 } else {
400
401 final int cmtI = line.indexOf(ResourceConstants.CMT);
402 if (cmtI >= 0) {
403 line = line.substring(0, cmtI);
404 }
405
406
407 line = line.trim();
408
409 if (line.length() == 0) {
410 continue;
411 }
412
413 if (line.startsWith(HASH_INCLUDE)) {
414
415 final String incl = line.substring(HASH_INCLUDE.length()).trim();
416 if (incl.contains(" ")) {
417 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
418 location);
419 } else {
420 lines.putAll(parseRules(createScanner(incl), location + "->" + incl));
421 }
422 } else {
423
424 final String[] parts = line.split("\\s+");
425 if (parts.length != 4) {
426 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
427 " parts: " + rawLine + " in " + location);
428 } else {
429 try {
430 final String pat = stripQuotes(parts[0]);
431 final String lCon = stripQuotes(parts[1]);
432 final String rCon = stripQuotes(parts[2]);
433 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
434 final int cLine = currentLine;
435 final Rule r = new Rule(pat, lCon, rCon, ph) {
436 private final int myLine = cLine;
437 private final String loc = location;
438
439 @Override
440 public String toString() {
441 final StringBuilder sb = new StringBuilder();
442 sb.append("Rule");
443 sb.append("{line=").append(myLine);
444 sb.append(", loc='").append(loc).append('\'');
445 sb.append('}');
446 return sb.toString();
447 }
448 };
449 final String patternKey = r.pattern.substring(0,1);
450 List<Rule> rules = lines.get(patternKey);
451 if (rules == null) {
452 rules = new ArrayList<Rule>();
453 lines.put(patternKey, rules);
454 }
455 rules.add(r);
456 } catch (final IllegalArgumentException e) {
457 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
458 location, e);
459 }
460 }
461 }
462 }
463 }
464 }
465
466 return lines;
467 }
468
469
470
471
472
473
474
475
476 private static RPattern pattern(final String regex) {
477 final boolean startsWith = regex.startsWith("^");
478 final boolean endsWith = regex.endsWith("$");
479 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
480 final boolean boxes = content.contains("[");
481
482 if (!boxes) {
483 if (startsWith && endsWith) {
484
485 if (content.length() == 0) {
486
487 return new RPattern() {
488 @Override
489 public boolean isMatch(final CharSequence input) {
490 return input.length() == 0;
491 }
492 };
493 } else {
494 return new RPattern() {
495 @Override
496 public boolean isMatch(final CharSequence input) {
497 return input.equals(content);
498 }
499 };
500 }
501 } else if ((startsWith || endsWith) && content.length() == 0) {
502
503 return ALL_STRINGS_RMATCHER;
504 } else if (startsWith) {
505
506 return new RPattern() {
507 @Override
508 public boolean isMatch(final CharSequence input) {
509 return startsWith(input, content);
510 }
511 };
512 } else if (endsWith) {
513
514 return new RPattern() {
515 @Override
516 public boolean isMatch(final CharSequence input) {
517 return endsWith(input, content);
518 }
519 };
520 }
521 } else {
522 final boolean startsWithBox = content.startsWith("[");
523 final boolean endsWithBox = content.endsWith("]");
524
525 if (startsWithBox && endsWithBox) {
526 String boxContent = content.substring(1, content.length() - 1);
527 if (!boxContent.contains("[")) {
528
529 final boolean negate = boxContent.startsWith("^");
530 if (negate) {
531 boxContent = boxContent.substring(1);
532 }
533 final String bContent = boxContent;
534 final boolean shouldMatch = !negate;
535
536 if (startsWith && endsWith) {
537
538 return new RPattern() {
539 @Override
540 public boolean isMatch(final CharSequence input) {
541 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
542 }
543 };
544 } else if (startsWith) {
545
546 return new RPattern() {
547 @Override
548 public boolean isMatch(final CharSequence input) {
549 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
550 }
551 };
552 } else if (endsWith) {
553
554 return new RPattern() {
555 @Override
556 public boolean isMatch(final CharSequence input) {
557 return input.length() > 0 &&
558 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
559 }
560 };
561 }
562 }
563 }
564 }
565
566 return new RPattern() {
567 Pattern pattern = Pattern.compile(regex);
568
569 @Override
570 public boolean isMatch(final CharSequence input) {
571 final Matcher matcher = pattern.matcher(input);
572 return matcher.find();
573 }
574 };
575 }
576
577 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
578 if (prefix.length() > input.length()) {
579 return false;
580 }
581 for (int i = 0; i < prefix.length(); i++) {
582 if (input.charAt(i) != prefix.charAt(i)) {
583 return false;
584 }
585 }
586 return true;
587 }
588
589 private static String stripQuotes(String str) {
590 if (str.startsWith(DOUBLE_QUOTE)) {
591 str = str.substring(1);
592 }
593
594 if (str.endsWith(DOUBLE_QUOTE)) {
595 str = str.substring(0, str.length() - 1);
596 }
597
598 return str;
599 }
600
601 private final RPattern lContext;
602
603 private final String pattern;
604
605 private final PhonemeExpr phoneme;
606
607 private final RPattern rContext;
608
609
610
611
612
613
614
615
616
617
618
619
620
621 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
622 this.pattern = pattern;
623 this.lContext = pattern(lContext + "$");
624 this.rContext = pattern("^" + rContext);
625 this.phoneme = phoneme;
626 }
627
628
629
630
631
632
633 public RPattern getLContext() {
634 return this.lContext;
635 }
636
637
638
639
640
641
642 public String getPattern() {
643 return this.pattern;
644 }
645
646
647
648
649
650
651 public PhonemeExpr getPhoneme() {
652 return this.phoneme;
653 }
654
655
656
657
658
659
660 public RPattern getRContext() {
661 return this.rContext;
662 }
663
664
665
666
667
668
669
670
671
672
673
674
675 public boolean patternAndContextMatches(final CharSequence input, final int i) {
676 if (i < 0) {
677 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
678 }
679
680 final int patternLength = this.pattern.length();
681 final int ipl = i + patternLength;
682
683 if (ipl > input.length()) {
684
685 return false;
686 }
687
688
689
690 if (!input.subSequence(i, ipl).equals(this.pattern)) {
691 return false;
692 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
693 return false;
694 }
695 return this.lContext.isMatch(input.subSequence(0, i));
696 }
697 }