1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language.bm;
19
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.EnumMap;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Scanner;
31 import java.util.Set;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80 public class Rule {
81
82 public static final class Phoneme implements PhonemeExpr {
83 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
84 @Override
85 public int compare(final Phoneme o1, final Phoneme o2) {
86 for (int i = 0; i < o1.phonemeText.length(); i++) {
87 if (i >= o2.phonemeText.length()) {
88 return +1;
89 }
90 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
91 if (c != 0) {
92 return c;
93 }
94 }
95
96 if (o1.phonemeText.length() < o2.phonemeText.length()) {
97 return -1;
98 }
99
100 return 0;
101 }
102 };
103
104 private final CharSequence phonemeText;
105 private final Languages.LanguageSet languages;
106
107 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
108 this.phonemeText = phonemeText;
109 this.languages = languages;
110 }
111
112 public Phoneme append(final CharSequence str) {
113 return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114 }
115
116 public Languages.LanguageSet getLanguages() {
117 return this.languages;
118 }
119
120 @Override
121 public Iterable<Phoneme> getPhonemes() {
122 return Collections.singleton(this);
123 }
124
125 public CharSequence getPhonemeText() {
126 return this.phonemeText;
127 }
128
129 public Phoneme join(final Phoneme right) {
130 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131 this.languages.restrictTo(right.languages));
132 }
133 }
134
135 public interface PhonemeExpr {
136 Iterable<Phoneme> getPhonemes();
137 }
138
139 public static final class PhonemeList implements PhonemeExpr {
140 private final List<Phoneme> phonemes;
141
142 public PhonemeList(final List<Phoneme> phonemes) {
143 this.phonemes = phonemes;
144 }
145
146 @Override
147 public List<Phoneme> getPhonemes() {
148 return this.phonemes;
149 }
150 }
151
152
153
154
155 public static interface RPattern {
156 boolean isMatch(CharSequence input);
157 }
158
159 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160 @Override
161 public boolean isMatch(final CharSequence input) {
162 return true;
163 }
164 };
165
166 public static final String ALL = "ALL";
167
168 private static final String DOUBLE_QUOTE = "\"";
169
170 private static final String HASH_INCLUDE = "#include";
171
172 private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173 new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174
175 static {
176 for (final NameType s : NameType.values()) {
177 final Map<RuleType, Map<String, List<Rule>>> rts =
178 new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
179
180 for (final RuleType rt : RuleType.values()) {
181 final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
182
183 final Languages ls = Languages.getInstance(s);
184 for (final String l : ls.getLanguages()) {
185 try {
186 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
187 } catch (final IllegalStateException e) {
188 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
189 }
190 }
191 if (!rt.equals(RuleType.RULES)) {
192 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
193 }
194
195 rts.put(rt, Collections.unmodifiableMap(rs));
196 }
197
198 RULES.put(s, Collections.unmodifiableMap(rts));
199 }
200 }
201
202 private static boolean contains(final CharSequence chars, final char input) {
203 for (int i = 0; i < chars.length(); i++) {
204 if (chars.charAt(i) == input) {
205 return true;
206 }
207 }
208 return false;
209 }
210
211 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
212 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
213 nameType.getName(), rt.getName(), lang);
214 }
215
216 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
217 final String resName = createResourceName(nameType, rt, lang);
218 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
219
220 if (rulesIS == null) {
221 throw new IllegalArgumentException("Unable to load resource: " + resName);
222 }
223
224 return new Scanner(rulesIS, ResourceConstants.ENCODING);
225 }
226
227 private static Scanner createScanner(final String lang) {
228 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
229 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
230
231 if (rulesIS == null) {
232 throw new IllegalArgumentException("Unable to load resource: " + resName);
233 }
234
235 return new Scanner(rulesIS, ResourceConstants.ENCODING);
236 }
237
238 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
239 if (suffix.length() > input.length()) {
240 return false;
241 }
242 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
243 if (input.charAt(i) != suffix.charAt(j)) {
244 return false;
245 }
246 }
247 return true;
248 }
249
250
251
252
253
254
255
256
257
258
259
260
261 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
262 final Languages.LanguageSet langs) {
263 return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
264 getInstance(nameType, rt, Languages.ANY);
265 }
266
267
268
269
270
271
272
273
274
275
276
277
278 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
279 final List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
280
281 if (rules == null) {
282 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
283 nameType.getName(), rt.getName(), lang));
284 }
285
286 return rules;
287 }
288
289 private static Phoneme parsePhoneme(final String ph) {
290 final int open = ph.indexOf("[");
291 if (open >= 0) {
292 if (!ph.endsWith("]")) {
293 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
294 }
295 final String before = ph.substring(0, open);
296 final String in = ph.substring(open + 1, ph.length() - 1);
297 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
298
299 return new Phoneme(before, Languages.LanguageSet.from(langs));
300 } else {
301 return new Phoneme(ph, Languages.ANY_LANGUAGE);
302 }
303 }
304
305 private static PhonemeExpr parsePhonemeExpr(final String ph) {
306 if (ph.startsWith("(")) {
307 if (!ph.endsWith(")")) {
308 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
309 }
310
311 final List<Phoneme> phs = new ArrayList<Phoneme>();
312 final String body = ph.substring(1, ph.length() - 1);
313 for (final String part : body.split("[|]")) {
314 phs.add(parsePhoneme(part));
315 }
316 if (body.startsWith("|") || body.endsWith("|")) {
317 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
318 }
319
320 return new PhonemeList(phs);
321 } else {
322 return parsePhoneme(ph);
323 }
324 }
325
326 private static List<Rule> parseRules(final Scanner scanner, final String location) {
327 final List<Rule> lines = new ArrayList<Rule>();
328 int currentLine = 0;
329
330 boolean inMultilineComment = false;
331 while (scanner.hasNextLine()) {
332 currentLine++;
333 final String rawLine = scanner.nextLine();
334 String line = rawLine;
335
336 if (inMultilineComment) {
337 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
338 inMultilineComment = false;
339 }
340 } else {
341 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
342 inMultilineComment = true;
343 } else {
344
345 final int cmtI = line.indexOf(ResourceConstants.CMT);
346 if (cmtI >= 0) {
347 line = line.substring(0, cmtI);
348 }
349
350
351 line = line.trim();
352
353 if (line.length() == 0) {
354 continue;
355 }
356
357 if (line.startsWith(HASH_INCLUDE)) {
358
359 final String incl = line.substring(HASH_INCLUDE.length()).trim();
360 if (incl.contains(" ")) {
361 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
362 location);
363 } else {
364 lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
365 }
366 } else {
367
368 final String[] parts = line.split("\\s+");
369 if (parts.length != 4) {
370 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
371 " parts: " + rawLine + " in " + location);
372 } else {
373 try {
374 final String pat = stripQuotes(parts[0]);
375 final String lCon = stripQuotes(parts[1]);
376 final String rCon = stripQuotes(parts[2]);
377 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
378 final int cLine = currentLine;
379 final Rule r = new Rule(pat, lCon, rCon, ph) {
380 private final int myLine = cLine;
381 private final String loc = location;
382
383 @Override
384 public String toString() {
385 final StringBuilder sb = new StringBuilder();
386 sb.append("Rule");
387 sb.append("{line=").append(myLine);
388 sb.append(", loc='").append(loc).append('\'');
389 sb.append('}');
390 return sb.toString();
391 }
392 };
393 lines.add(r);
394 } catch (final IllegalArgumentException e) {
395 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
396 location, e);
397 }
398 }
399 }
400 }
401 }
402 }
403
404 return lines;
405 }
406
407
408
409
410
411
412
413
414 private static RPattern pattern(final String regex) {
415 final boolean startsWith = regex.startsWith("^");
416 final boolean endsWith = regex.endsWith("$");
417 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
418 final boolean boxes = content.contains("[");
419
420 if (!boxes) {
421 if (startsWith && endsWith) {
422
423 if (content.length() == 0) {
424
425 return new RPattern() {
426 @Override
427 public boolean isMatch(final CharSequence input) {
428 return input.length() == 0;
429 }
430 };
431 } else {
432 return new RPattern() {
433 @Override
434 public boolean isMatch(final CharSequence input) {
435 return input.equals(content);
436 }
437 };
438 }
439 } else if ((startsWith || endsWith) && content.length() == 0) {
440
441 return ALL_STRINGS_RMATCHER;
442 } else if (startsWith) {
443
444 return new RPattern() {
445 @Override
446 public boolean isMatch(final CharSequence input) {
447 return startsWith(input, content);
448 }
449 };
450 } else if (endsWith) {
451
452 return new RPattern() {
453 @Override
454 public boolean isMatch(final CharSequence input) {
455 return endsWith(input, content);
456 }
457 };
458 }
459 } else {
460 final boolean startsWithBox = content.startsWith("[");
461 final boolean endsWithBox = content.endsWith("]");
462
463 if (startsWithBox && endsWithBox) {
464 String boxContent = content.substring(1, content.length() - 1);
465 if (!boxContent.contains("[")) {
466
467 final boolean negate = boxContent.startsWith("^");
468 if (negate) {
469 boxContent = boxContent.substring(1);
470 }
471 final String bContent = boxContent;
472 final boolean shouldMatch = !negate;
473
474 if (startsWith && endsWith) {
475
476 return new RPattern() {
477 @Override
478 public boolean isMatch(final CharSequence input) {
479 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
480 }
481 };
482 } else if (startsWith) {
483
484 return new RPattern() {
485 @Override
486 public boolean isMatch(final CharSequence input) {
487 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
488 }
489 };
490 } else if (endsWith) {
491
492 return new RPattern() {
493 @Override
494 public boolean isMatch(final CharSequence input) {
495 return input.length() > 0 &&
496 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
497 }
498 };
499 }
500 }
501 }
502 }
503
504 return new RPattern() {
505 Pattern pattern = Pattern.compile(regex);
506
507 @Override
508 public boolean isMatch(final CharSequence input) {
509 final Matcher matcher = pattern.matcher(input);
510 return matcher.find();
511 }
512 };
513 }
514
515 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
516 if (prefix.length() > input.length()) {
517 return false;
518 }
519 for (int i = 0; i < prefix.length(); i++) {
520 if (input.charAt(i) != prefix.charAt(i)) {
521 return false;
522 }
523 }
524 return true;
525 }
526
527 private static String stripQuotes(String str) {
528 if (str.startsWith(DOUBLE_QUOTE)) {
529 str = str.substring(1);
530 }
531
532 if (str.endsWith(DOUBLE_QUOTE)) {
533 str = str.substring(0, str.length() - 1);
534 }
535
536 return str;
537 }
538
539 private final RPattern lContext;
540
541 private final String pattern;
542
543 private final PhonemeExpr phoneme;
544
545 private final RPattern rContext;
546
547
548
549
550
551
552
553
554
555
556
557
558
559 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
560 this.pattern = pattern;
561 this.lContext = pattern(lContext + "$");
562 this.rContext = pattern("^" + rContext);
563 this.phoneme = phoneme;
564 }
565
566
567
568
569
570
571 public RPattern getLContext() {
572 return this.lContext;
573 }
574
575
576
577
578
579
580 public String getPattern() {
581 return this.pattern;
582 }
583
584
585
586
587
588
589 public PhonemeExpr getPhoneme() {
590 return this.phoneme;
591 }
592
593
594
595
596
597
598 public RPattern getRContext() {
599 return this.rContext;
600 }
601
602
603
604
605
606
607
608
609
610
611
612
613 public boolean patternAndContextMatches(final CharSequence input, final int i) {
614 if (i < 0) {
615 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
616 }
617
618 final int patternLength = this.pattern.length();
619 final int ipl = i + patternLength;
620
621 if (ipl > input.length()) {
622
623 return false;
624 }
625
626
627
628 if (!input.subSequence(i, ipl).equals(this.pattern)) {
629 return false;
630 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
631 return false;
632 }
633 return this.lContext.isMatch(input.subSequence(0, i));
634 }
635 }