001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.EnumMap; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Scanner; 030import java.util.Set; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.apache.commons.codec.Resources; 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * <ul> 043 * <li>the pattern matches at the current position</li> 044 * <li>the string up until the beginning of the pattern matches the left context</li> 045 * <li>the string from the end of the pattern matches the right context</li> 046 * <li>logical is ALL and all languages are in scope; or</li> 047 * <li>logical is any other value and at least one language is in scope</li> 048 * </ul> 049 * <p> 050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 051 * to explicitly construct their own. 052 * <p> 053 * Rules are immutable and thread-safe. 054 * <p> 055 * <b>Rules resources</b> 056 * <p> 057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 058 * named following the pattern: 059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 060 * <p> 061 * The format of these resources is the following: 062 * <ul> 063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 064 * will be interpreted as: 065 * <ol> 066 * <li>pattern</li> 067 * <li>left context</li> 068 * <li>right context</li> 069 * <li>phoneme</li> 070 * </ol> 071 * </li> 072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 073 * as a comment.</li> 074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 075 * all content until a line ending in '*' and '/' is found.</li> 076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 077 * </ul> 078 * 079 * @since 1.6 080 */ 081public class Rule { 082 083 public static final class Phoneme implements PhonemeExpr { 084 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 085 @Override 086 public int compare(final Phoneme o1, final Phoneme o2) { 087 for (int i = 0; i < o1.phonemeText.length(); i++) { 088 if (i >= o2.phonemeText.length()) { 089 return +1; 090 } 091 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 092 if (c != 0) { 093 return c; 094 } 095 } 096 097 if (o1.phonemeText.length() < o2.phonemeText.length()) { 098 return -1; 099 } 100 101 return 0; 102 } 103 }; 104 105 private final StringBuilder phonemeText; 106 private final Languages.LanguageSet languages; 107 108 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 109 this.phonemeText = new StringBuilder(phonemeText); 110 this.languages = languages; 111 } 112 113 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 114 this(phonemeLeft.phonemeText, phonemeLeft.languages); 115 this.phonemeText.append(phonemeRight.phonemeText); 116 } 117 118 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 119 this(phonemeLeft.phonemeText, languages); 120 this.phonemeText.append(phonemeRight.phonemeText); 121 } 122 123 public Phoneme append(final CharSequence str) { 124 this.phonemeText.append(str); 125 return this; 126 } 127 128 public Languages.LanguageSet getLanguages() { 129 return this.languages; 130 } 131 132 @Override 133 public Iterable<Phoneme> getPhonemes() { 134 return Collections.singleton(this); 135 } 136 137 public CharSequence getPhonemeText() { 138 return this.phonemeText; 139 } 140 141 /** 142 * Deprecated since 1.9. 143 * 144 * @param right the Phoneme to join 145 * @return a new Phoneme 146 * @deprecated since 1.9 147 */ 148 @Deprecated 149 public Phoneme join(final Phoneme right) { 150 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 151 this.languages.restrictTo(right.languages)); 152 } 153 154 /** 155 * Returns a new Phoneme with the same text but a union of its 156 * current language set and the given one. 157 * 158 * @param lang the language set to merge 159 * @return a new Phoneme 160 */ 161 public Phoneme mergeWithLanguage(final LanguageSet lang) { 162 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 163 } 164 165 @Override 166 public String toString() { 167 return phonemeText.toString() + "[" + languages + "]"; 168 } 169 } 170 171 public interface PhonemeExpr { 172 Iterable<Phoneme> getPhonemes(); 173 } 174 175 public static final class PhonemeList implements PhonemeExpr { 176 private final List<Phoneme> phonemes; 177 178 public PhonemeList(final List<Phoneme> phonemes) { 179 this.phonemes = phonemes; 180 } 181 182 @Override 183 public List<Phoneme> getPhonemes() { 184 return this.phonemes; 185 } 186 } 187 188 /** 189 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 190 */ 191 public interface RPattern { 192 boolean isMatch(CharSequence input); 193 } 194 195 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 196 @Override 197 public boolean isMatch(final CharSequence input) { 198 return true; 199 } 200 }; 201 202 public static final String ALL = "ALL"; 203 204 private static final String DOUBLE_QUOTE = "\""; 205 206 private static final String HASH_INCLUDE = "#include"; 207 208 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 209 new EnumMap<>(NameType.class); 210 211 static { 212 for (final NameType s : NameType.values()) { 213 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 214 new EnumMap<>(RuleType.class); 215 216 for (final RuleType rt : RuleType.values()) { 217 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>(); 218 219 final Languages ls = Languages.getInstance(s); 220 for (final String l : ls.getLanguages()) { 221 try (final Scanner scanner = createScanner(s, rt, l)) { 222 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 223 } catch (final IllegalStateException e) { 224 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 225 } 226 } 227 if (!rt.equals(RuleType.RULES)) { 228 try (final Scanner scanner = createScanner(s, rt, "common")) { 229 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 230 } 231 } 232 233 rts.put(rt, Collections.unmodifiableMap(rs)); 234 } 235 236 RULES.put(s, Collections.unmodifiableMap(rts)); 237 } 238 } 239 240 private static boolean contains(final CharSequence chars, final char input) { 241 for (int i = 0; i < chars.length(); i++) { 242 if (chars.charAt(i) == input) { 243 return true; 244 } 245 } 246 return false; 247 } 248 249 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 250 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 251 nameType.getName(), rt.getName(), lang); 252 } 253 254 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 255 final String resName = createResourceName(nameType, rt, lang); 256 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 257 } 258 259 private static Scanner createScanner(final String lang) { 260 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 261 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 262 } 263 264 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 265 if (suffix.length() > input.length()) { 266 return false; 267 } 268 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 269 if (input.charAt(i) != suffix.charAt(j)) { 270 return false; 271 } 272 } 273 return true; 274 } 275 276 /** 277 * Gets rules for a combination of name type, rule type and languages. 278 * 279 * @param nameType 280 * the NameType to consider 281 * @param rt 282 * the RuleType to consider 283 * @param langs 284 * the set of languages to consider 285 * @return a list of Rules that apply 286 */ 287 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 288 final Languages.LanguageSet langs) { 289 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 290 final List<Rule> allRules = new ArrayList<>(); 291 for (final List<Rule> rules : ruleMap.values()) { 292 allRules.addAll(rules); 293 } 294 return allRules; 295 } 296 297 /** 298 * Gets rules for a combination of name type, rule type and a single language. 299 * 300 * @param nameType 301 * the NameType to consider 302 * @param rt 303 * the RuleType to consider 304 * @param lang 305 * the language to consider 306 * @return a list of Rules that apply 307 */ 308 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 309 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 310 } 311 312 /** 313 * Gets rules for a combination of name type, rule type and languages. 314 * 315 * @param nameType 316 * the NameType to consider 317 * @param rt 318 * the RuleType to consider 319 * @param langs 320 * the set of languages to consider 321 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 322 * @since 1.9 323 */ 324 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 325 final Languages.LanguageSet langs) { 326 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 327 getInstanceMap(nameType, rt, Languages.ANY); 328 } 329 330 /** 331 * Gets rules for a combination of name type, rule type and a single language. 332 * 333 * @param nameType 334 * the NameType to consider 335 * @param rt 336 * the RuleType to consider 337 * @param lang 338 * the language to consider 339 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 340 * @since 1.9 341 */ 342 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 343 final String lang) { 344 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 345 346 if (rules == null) { 347 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 348 nameType.getName(), rt.getName(), lang)); 349 } 350 351 return rules; 352 } 353 354 private static Phoneme parsePhoneme(final String ph) { 355 final int open = ph.indexOf("["); 356 if (open >= 0) { 357 if (!ph.endsWith("]")) { 358 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 359 } 360 final String before = ph.substring(0, open); 361 final String in = ph.substring(open + 1, ph.length() - 1); 362 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); 363 364 return new Phoneme(before, Languages.LanguageSet.from(langs)); 365 } 366 return new Phoneme(ph, Languages.ANY_LANGUAGE); 367 } 368 369 private static PhonemeExpr parsePhonemeExpr(final String ph) { 370 if (ph.startsWith("(")) { // we have a bracketed list of options 371 if (!ph.endsWith(")")) { 372 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 373 } 374 375 final List<Phoneme> phs = new ArrayList<>(); 376 final String body = ph.substring(1, ph.length() - 1); 377 for (final String part : body.split("[|]")) { 378 phs.add(parsePhoneme(part)); 379 } 380 if (body.startsWith("|") || body.endsWith("|")) { 381 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 382 } 383 384 return new PhonemeList(phs); 385 } 386 return parsePhoneme(ph); 387 } 388 389 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 390 final Map<String, List<Rule>> lines = new HashMap<>(); 391 int currentLine = 0; 392 393 boolean inMultilineComment = false; 394 while (scanner.hasNextLine()) { 395 currentLine++; 396 final String rawLine = scanner.nextLine(); 397 String line = rawLine; 398 399 if (inMultilineComment) { 400 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 401 inMultilineComment = false; 402 } 403 } else { 404 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 405 inMultilineComment = true; 406 } else { 407 // discard comments 408 final int cmtI = line.indexOf(ResourceConstants.CMT); 409 if (cmtI >= 0) { 410 line = line.substring(0, cmtI); 411 } 412 413 // trim leading-trailing whitespace 414 line = line.trim(); 415 416 if (line.length() == 0) { 417 continue; // empty lines can be safely skipped 418 } 419 420 if (line.startsWith(HASH_INCLUDE)) { 421 // include statement 422 final String incl = line.substring(HASH_INCLUDE.length()).trim(); 423 if (incl.contains(" ")) { 424 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 425 location); 426 } 427 try (final Scanner hashIncludeScanner = createScanner(incl)) { 428 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 429 } 430 } else { 431 // rule 432 final String[] parts = line.split("\\s+"); 433 if (parts.length != 4) { 434 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 435 " parts: " + rawLine + " in " + location); 436 } 437 try { 438 final String pat = stripQuotes(parts[0]); 439 final String lCon = stripQuotes(parts[1]); 440 final String rCon = stripQuotes(parts[2]); 441 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 442 final int cLine = currentLine; 443 final Rule r = new Rule(pat, lCon, rCon, ph) { 444 private final int myLine = cLine; 445 private final String loc = location; 446 447 @Override 448 public String toString() { 449 final StringBuilder sb = new StringBuilder(); 450 sb.append("Rule"); 451 sb.append("{line=").append(myLine); 452 sb.append(", loc='").append(loc).append('\''); 453 sb.append(", pat='").append(pat).append('\''); 454 sb.append(", lcon='").append(lCon).append('\''); 455 sb.append(", rcon='").append(rCon).append('\''); 456 sb.append('}'); 457 return sb.toString(); 458 } 459 }; 460 final String patternKey = r.pattern.substring(0,1); 461 List<Rule> rules = lines.get(patternKey); 462 if (rules == null) { 463 rules = new ArrayList<>(); 464 lines.put(patternKey, rules); 465 } 466 rules.add(r); 467 } catch (final IllegalArgumentException e) { 468 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 469 location, e); 470 } 471 } 472 } 473 } 474 } 475 476 return lines; 477 } 478 479 /** 480 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 481 * 482 * @param regex 483 * the regular expression to compile 484 * @return an RPattern that will match this regex 485 */ 486 private static RPattern pattern(final String regex) { 487 final boolean startsWith = regex.startsWith("^"); 488 final boolean endsWith = regex.endsWith("$"); 489 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 490 final boolean boxes = content.contains("["); 491 492 if (!boxes) { 493 if (startsWith && endsWith) { 494 // exact match 495 if (content.length() == 0) { 496 // empty 497 return new RPattern() { 498 @Override 499 public boolean isMatch(final CharSequence input) { 500 return input.length() == 0; 501 } 502 }; 503 } 504 return new RPattern() { 505 @Override 506 public boolean isMatch(final CharSequence input) { 507 return input.equals(content); 508 } 509 }; 510 } else if ((startsWith || endsWith) && content.length() == 0) { 511 // matches every string 512 return ALL_STRINGS_RMATCHER; 513 } else if (startsWith) { 514 // matches from start 515 return new RPattern() { 516 @Override 517 public boolean isMatch(final CharSequence input) { 518 return startsWith(input, content); 519 } 520 }; 521 } else if (endsWith) { 522 // matches from start 523 return new RPattern() { 524 @Override 525 public boolean isMatch(final CharSequence input) { 526 return endsWith(input, content); 527 } 528 }; 529 } 530 } else { 531 final boolean startsWithBox = content.startsWith("["); 532 final boolean endsWithBox = content.endsWith("]"); 533 534 if (startsWithBox && endsWithBox) { 535 String boxContent = content.substring(1, content.length() - 1); 536 if (!boxContent.contains("[")) { 537 // box containing alternatives 538 final boolean negate = boxContent.startsWith("^"); 539 if (negate) { 540 boxContent = boxContent.substring(1); 541 } 542 final String bContent = boxContent; 543 final boolean shouldMatch = !negate; 544 545 if (startsWith && endsWith) { 546 // exact match 547 return new RPattern() { 548 @Override 549 public boolean isMatch(final CharSequence input) { 550 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 551 } 552 }; 553 } else if (startsWith) { 554 // first char 555 return new RPattern() { 556 @Override 557 public boolean isMatch(final CharSequence input) { 558 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 559 } 560 }; 561 } else if (endsWith) { 562 // last char 563 return new RPattern() { 564 @Override 565 public boolean isMatch(final CharSequence input) { 566 return input.length() > 0 && 567 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 568 } 569 }; 570 } 571 } 572 } 573 } 574 575 return new RPattern() { 576 Pattern pattern = Pattern.compile(regex); 577 578 @Override 579 public boolean isMatch(final CharSequence input) { 580 final Matcher matcher = pattern.matcher(input); 581 return matcher.find(); 582 } 583 }; 584 } 585 586 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 587 if (prefix.length() > input.length()) { 588 return false; 589 } 590 for (int i = 0; i < prefix.length(); i++) { 591 if (input.charAt(i) != prefix.charAt(i)) { 592 return false; 593 } 594 } 595 return true; 596 } 597 598 private static String stripQuotes(String str) { 599 if (str.startsWith(DOUBLE_QUOTE)) { 600 str = str.substring(1); 601 } 602 603 if (str.endsWith(DOUBLE_QUOTE)) { 604 str = str.substring(0, str.length() - 1); 605 } 606 607 return str; 608 } 609 610 private final RPattern lContext; 611 612 private final String pattern; 613 614 private final PhonemeExpr phoneme; 615 616 private final RPattern rContext; 617 618 /** 619 * Creates a new rule. 620 * 621 * @param pattern 622 * the pattern 623 * @param lContext 624 * the left context 625 * @param rContext 626 * the right context 627 * @param phoneme 628 * the resulting phoneme 629 */ 630 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 631 this.pattern = pattern; 632 this.lContext = pattern(lContext + "$"); 633 this.rContext = pattern("^" + rContext); 634 this.phoneme = phoneme; 635 } 636 637 /** 638 * Gets the left context. This is a regular expression that must match to the left of the pattern. 639 * 640 * @return the left context Pattern 641 */ 642 public RPattern getLContext() { 643 return this.lContext; 644 } 645 646 /** 647 * Gets the pattern. This is a string-literal that must exactly match. 648 * 649 * @return the pattern 650 */ 651 public String getPattern() { 652 return this.pattern; 653 } 654 655 /** 656 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 657 * 658 * @return the phoneme 659 */ 660 public PhonemeExpr getPhoneme() { 661 return this.phoneme; 662 } 663 664 /** 665 * Gets the right context. This is a regular expression that must match to the right of the pattern. 666 * 667 * @return the right context Pattern 668 */ 669 public RPattern getRContext() { 670 return this.rContext; 671 } 672 673 /** 674 * Decides if the pattern and context match the input starting at a position. It is a match if the 675 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 676 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 677 * 678 * @param input 679 * the input String 680 * @param i 681 * the int position within the input 682 * @return true if the pattern and left/right context match, false otherwise 683 */ 684 public boolean patternAndContextMatches(final CharSequence input, final int i) { 685 if (i < 0) { 686 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 687 } 688 689 final int patternLength = this.pattern.length(); 690 final int ipl = i + patternLength; 691 692 if (ipl > input.length()) { 693 // not enough room for the pattern to match 694 return false; 695 } 696 697 // evaluate the pattern, left context and right context 698 // fail early if any of the evaluations is not successful 699 if (!input.subSequence(i, ipl).equals(this.pattern)) { 700 return false; 701 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 702 return false; 703 } 704 return this.lContext.isMatch(input.subSequence(0, i)); 705 } 706}