001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.EnumMap; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Scanner; 030import java.util.Set; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.apache.commons.codec.Resources; 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply and a logical flag indicating if all languages must 041 * be in play. A rule matches if: 042 * </p> 043 * <ul> 044 * <li>the pattern matches at the current position</li> 045 * <li>the string up until the beginning of the pattern matches the left context</li> 046 * <li>the string from the end of the pattern matches the right context</li> 047 * <li>logical is ALL and all languages are in scope; or</li> 048 * <li>logical is any other value and at least one language is in scope</li> 049 * </ul> 050 * <p> 051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user to explicitly construct their own. 052 * </p> 053 * <p> 054 * Rules are immutable and thread-safe. 055 * </p> 056 * <h2>Rules resources</h2> 057 * <p> 058 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named following the pattern: 059 * </p> 060 * <blockquote>/org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 061 * <p> 062 * The format of these resources is the following: 063 * </p> 064 * <ul> 065 * <li><strong>Rules:</strong> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these will be interpreted as: 066 * <ol> 067 * <li>pattern</li> 068 * <li>left context</li> 069 * <li>right context</li> 070 * <li>phoneme</li> 071 * </ol> 072 * </li> 073 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</li> 074 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a line ending 075 * in '*' and '/' is found.</li> 076 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li> 077 * </ul> 078 * 079 * @since 1.6 080 */ 081public class Rule { 082 083 /** 084 * A phoneme. 085 */ 086 public static final class Phoneme implements PhonemeExpr { 087 088 /** 089 * The Phoneme Comparator. 090 */ 091 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> { 092 final int o1Length = o1.phonemeText.length(); 093 final int o2Length = o2.phonemeText.length(); 094 for (int i = 0; i < o1Length; i++) { 095 if (i >= o2Length) { 096 return +1; 097 } 098 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 099 if (c != 0) { 100 return c; 101 } 102 } 103 if (o1Length < o2Length) { 104 return -1; 105 } 106 return 0; 107 }; 108 private final StringBuilder phonemeText; 109 private final Languages.LanguageSet languages; 110 111 /** 112 * Constructs a new instance. 113 * 114 * @param phonemeText The phoneme text. 115 * @param languages A language set. 116 */ 117 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 118 this.phonemeText = new StringBuilder(phonemeText); 119 this.languages = languages; 120 } 121 122 /** 123 * Constructs a new instance. 124 * 125 * @param phonemeLeft The left phoneme text. 126 * @param phonemeRight The right phoneme text. 127 */ 128 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 129 this(phonemeLeft.phonemeText, phonemeLeft.languages); 130 this.phonemeText.append(phonemeRight.phonemeText); 131 } 132 133 /** 134 * Constructs a new instance. 135 * 136 * @param phonemeLeft The left phoneme text. 137 * @param phonemeRight The right phoneme text. 138 * @param languages A language set. 139 */ 140 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 141 this(phonemeLeft.phonemeText, languages); 142 this.phonemeText.append(phonemeRight.phonemeText); 143 } 144 145 /** 146 * Appends the sequence to the phone text. 147 * 148 * @param sequence The sequence to append. 149 * @return this instance. 150 */ 151 public Phoneme append(final CharSequence sequence) { 152 this.phonemeText.append(sequence); 153 return this; 154 } 155 156 /** 157 * Gets the language set. 158 * 159 * @return the language set. 160 */ 161 public Languages.LanguageSet getLanguages() { 162 return this.languages; 163 } 164 165 @Override 166 public Iterable<Phoneme> getPhonemes() { 167 return Collections.singleton(this); 168 } 169 170 /** 171 * Gets the phoneme text sequence. 172 * 173 * @return the phoneme text sequence. 174 */ 175 public CharSequence getPhonemeText() { 176 return this.phonemeText; 177 } 178 179 /** 180 * Deprecated since 1.9. 181 * 182 * @param right the Phoneme to join 183 * @return a new Phoneme 184 * @deprecated since 1.9 185 */ 186 @Deprecated 187 public Phoneme join(final Phoneme right) { 188 return new Phoneme(phonemeText.toString() + right.phonemeText.toString(), languages.restrictTo(right.languages)); 189 } 190 191 /** 192 * Returns a new Phoneme with the same text but a union of its current language set and the given one. 193 * 194 * @param lang the language set to merge 195 * @return a new Phoneme 196 */ 197 public Phoneme mergeWithLanguage(final LanguageSet lang) { 198 return new Phoneme(phonemeText.toString(), languages.merge(lang)); 199 } 200 201 @Override 202 public int size() { 203 return 1; 204 } 205 206 @Override 207 public String toString() { 208 return phonemeText.toString() + "[" + languages + "]"; 209 } 210 } 211 212 /** 213 * A phoneme expression. 214 */ 215 public interface PhonemeExpr { 216 217 /** 218 * Gets an iteration of phonemes. 219 * 220 * @return an iteration of phonemes. 221 */ 222 Iterable<Phoneme> getPhonemes(); 223 224 /** 225 * Gets the expression size in phonemes. 226 * 227 * @return the expression size in phonemes. 228 * @since 1.17.0 229 */ 230 default int size() { 231 // All implementations are int-bound. 232 return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE); 233 } 234 } 235 236 /** 237 * A list of phonemes. 238 */ 239 public static final class PhonemeList implements PhonemeExpr { 240 241 private final List<Phoneme> phonemeList; 242 243 /** 244 * Constructs a new instance. 245 * 246 * @param phonemes the phoneme list. 247 */ 248 public PhonemeList(final List<Phoneme> phonemes) { 249 this.phonemeList = phonemes; 250 } 251 252 @Override 253 public List<Phoneme> getPhonemes() { 254 return phonemeList; 255 } 256 257 @Override 258 public int size() { 259 return phonemeList.size(); 260 } 261 } 262 263 /** 264 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 265 */ 266 public interface RPattern { 267 268 /** 269 * Tests whether the given input matches this instance. 270 * 271 * @param input the input to test. 272 * @return whether the given input matches this instance. 273 */ 274 boolean isMatch(CharSequence input); 275 } 276 277 private static final String PIPE = "|"; 278 279 /** 280 * Always matches. 281 */ 282 public static final RPattern ALL_STRINGS_RMATCHER = input -> true; 283 284 /** 285 * Unused. 286 * 287 * @deprecated This is unused. 288 */ 289 @Deprecated 290 public static final String ALL = "ALL"; 291 292 private static final String DOUBLE_QUOTE = "\""; 293 private static final String HASH_INCLUDE = "#include"; 294 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); 295 private static final Pattern AROUND_PLUS = Pattern.compile("[+]"); 296 private static final Pattern AROUND_PIPE = Pattern.compile("[|]"); 297 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = new EnumMap<>(NameType.class); 298 299 /** 300 * Initializes {@code RULES}. 301 */ 302 static { 303 for (final NameType nameType : NameType.values()) { 304 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rtsMap = new EnumMap<>(RuleType.class); 305 for (final RuleType ruleType : RuleType.values()) { 306 final Map<String, Map<String, List<Rule>>> rsMap = new HashMap<>(); 307 final Languages languages = Languages.getInstance(nameType); 308 languages.getLanguages().forEach(l -> { 309 try (Scanner scanner = createScanner(nameType, ruleType, l)) { 310 rsMap.put(l, parseRules(scanner, createResourceName(nameType, ruleType, l))); 311 } catch (final IllegalStateException e) { 312 throw new IllegalStateException("Problem processing " + createResourceName(nameType, ruleType, l), e); 313 } 314 }); 315 if (!ruleType.equals(RuleType.RULES)) { 316 try (Scanner scanner = createScanner(nameType, ruleType, "common")) { 317 rsMap.put("common", parseRules(scanner, createResourceName(nameType, ruleType, "common"))); 318 } 319 } 320 rtsMap.put(ruleType, Collections.unmodifiableMap(rsMap)); 321 } 322 RULES.put(nameType, Collections.unmodifiableMap(rtsMap)); 323 } 324 } 325 326 private static boolean contains(final CharSequence chars, final char input) { 327 return chars.chars().anyMatch(c -> c == input); 328 } 329 330 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 331 return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt", nameType.getName(), rt.getName(), lang); 332 } 333 334 @SuppressWarnings("resource") // Closing the Scanner closes the resource 335 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 336 final String resName = createResourceName(nameType, rt, lang); 337 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 338 } 339 340 @SuppressWarnings("resource") // Closing the Scanner closes the resource 341 private static Scanner createScanner(final String lang) { 342 final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang); 343 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 344 } 345 346 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 347 final int suffixLength = suffix.length(); 348 final int inputLength = input.length(); 349 if (suffixLength > inputLength) { 350 return false; 351 } 352 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) { 353 if (input.charAt(i) != suffix.charAt(j)) { 354 return false; 355 } 356 } 357 return true; 358 } 359 360 /** 361 * Gets rules for a combination of name type, rule type and languages. 362 * 363 * @param nameType the NameType to consider 364 * @param rt the RuleType to consider 365 * @param langs the set of languages to consider 366 * @return a list of Rules that apply 367 */ 368 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) { 369 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 370 final List<Rule> allRules = new ArrayList<>(); 371 ruleMap.values().forEach(rules -> allRules.addAll(rules)); 372 return allRules; 373 } 374 375 /** 376 * Gets rules for a combination of name type, rule type and a single language. 377 * 378 * @param nameType the NameType to consider 379 * @param rt the RuleType to consider 380 * @param lang the language to consider 381 * @return a list of Rules that apply 382 */ 383 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 384 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 385 } 386 387 /** 388 * Gets rules for a combination of name type, rule type and languages. 389 * 390 * @param nameType the NameType to consider 391 * @param rt the RuleType to consider 392 * @param langs the set of languages to consider 393 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 394 * @since 1.9 395 */ 396 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) { 397 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : getInstanceMap(nameType, rt, Languages.ANY); 398 } 399 400 /** 401 * Gets rules for a combination of name type, rule type and a single language. 402 * 403 * @param nameType the NameType to consider 404 * @param rt the RuleType to consider 405 * @param lang the language to consider 406 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 407 * @since 1.9 408 */ 409 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final String lang) { 410 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 411 if (rules == null) { 412 throw new IllegalArgumentException(String.format("No rules found for %s, %s, '%s'.", nameType.getName(), rt.getName(), lang)); 413 } 414 return rules; 415 } 416 417 private static Phoneme parsePhoneme(final String ph) { 418 final int open = ph.indexOf("["); 419 if (open >= 0) { 420 if (!ph.endsWith("]")) { 421 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 422 } 423 final String before = ph.substring(0, open); 424 final String in = ph.substring(open + 1, ph.length() - 1); 425 final Set<String> langs = new HashSet<>(Arrays.asList(AROUND_PLUS.split(in))); 426 return new Phoneme(before, Languages.LanguageSet.from(langs)); 427 } 428 return new Phoneme(ph, Languages.ANY_LANGUAGE); 429 } 430 431 /* 432 * Package-private for testing only. 433 */ 434 static PhonemeExpr parsePhonemeExpr(final String ph) { 435 if (ph.startsWith("(")) { 436 // we have a bracketed list of options 437 if (!ph.endsWith(")")) { 438 throw new IllegalArgumentException("Phoneme starting with '(' must end with ')'"); 439 } 440 final List<Phoneme> phs = new ArrayList<>(); 441 final String body = ph.substring(1, ph.length() - 1); 442 final String[] split = AROUND_PIPE.split(body); 443 for (final String part : split) { 444 phs.add(parsePhoneme(part)); 445 } 446 if (split.length > 1 && split[0].length() != 0 && body.startsWith(PIPE) || split[split.length - 1].length() != 0 && body.endsWith(PIPE)) { 447 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 448 } 449 return new PhonemeList(phs); 450 } 451 return parsePhoneme(ph); 452 } 453 454 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 455 final Map<String, List<Rule>> lines = new HashMap<>(); 456 int currentLine = 0; 457 boolean inMultilineComment = false; 458 while (scanner.hasNextLine()) { 459 currentLine++; 460 final String rawLine = scanner.nextLine(); 461 String line = rawLine; 462 if (inMultilineComment) { 463 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 464 inMultilineComment = false; 465 } 466 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 467 inMultilineComment = true; 468 } else { 469 // discard comments 470 final int cmtI = line.indexOf(ResourceConstants.CMT); 471 if (cmtI >= 0) { 472 line = line.substring(0, cmtI); 473 } 474 // trim leading-trailing whitespace 475 line = line.trim(); 476 if (line.isEmpty()) { 477 continue; // empty lines can be safely skipped 478 } 479 if (line.startsWith(HASH_INCLUDE)) { 480 // include statement 481 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim(); 482 if (incl.contains(" ")) { 483 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + location); 484 } 485 try (Scanner hashIncludeScanner = createScanner(incl)) { 486 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 487 } 488 } else { 489 // rule 490 final String[] parts = ResourceConstants.SPACES.split(line); 491 if (parts.length != 4) { 492 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location); 493 } 494 try { 495 final String pat = stripQuotes(parts[0]); 496 final String lCon = stripQuotes(parts[1]); 497 final String rCon = stripQuotes(parts[2]); 498 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 499 final int cLine = currentLine; 500 final Rule r = new Rule(pat, lCon, rCon, ph) { 501 502 private final int myLine = cLine; 503 private final String loc = location; 504 505 @Override 506 public String toString() { 507 final StringBuilder sb = new StringBuilder(); 508 sb.append("Rule"); 509 sb.append("{line=").append(myLine); 510 sb.append(", loc='").append(loc).append('\''); 511 sb.append(", pat='").append(pat).append('\''); 512 sb.append(", lcon='").append(lCon).append('\''); 513 sb.append(", rcon='").append(rCon).append('\''); 514 sb.append('}'); 515 return sb.toString(); 516 } 517 }; 518 final String patternKey = r.pattern.substring(0, 1); 519 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>()); 520 rules.add(r); 521 } catch (final IllegalArgumentException e) { 522 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e); 523 } 524 } 525 } 526 } 527 return lines; 528 } 529 530 /** 531 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 532 * 533 * @param regex the regular expression to compile 534 * @return an RPattern that will match this regex 535 */ 536 private static RPattern pattern(final String regex) { 537 final boolean startsWith = regex.startsWith("^"); 538 final boolean endsWith = regex.endsWith("$"); 539 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 540 final boolean boxes = content.contains("["); 541 if (!boxes) { 542 if (startsWith && endsWith) { 543 // exact match 544 if (content.isEmpty()) { 545 // empty 546 return input -> input.length() == 0; 547 } 548 return input -> input.equals(content); 549 } 550 if ((startsWith || endsWith) && content.isEmpty()) { 551 // matches every string 552 return ALL_STRINGS_RMATCHER; 553 } 554 if (startsWith) { 555 // matches from start 556 return input -> startsWith(input, content); 557 } 558 if (endsWith) { 559 // matches from start 560 return input -> endsWith(input, content); 561 } 562 } else { 563 final boolean startsWithBox = content.startsWith("["); 564 final boolean endsWithBox = content.endsWith("]"); 565 if (startsWithBox && endsWithBox) { 566 String boxContent = content.substring(1, content.length() - 1); 567 if (!boxContent.contains("[")) { 568 // box containing alternatives 569 final boolean negate = boxContent.startsWith("^"); 570 if (negate) { 571 boxContent = boxContent.substring(1); 572 } 573 final String bContent = boxContent; 574 final boolean shouldMatch = !negate; 575 if (startsWith && endsWith) { 576 // exact match 577 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 578 } 579 if (startsWith) { 580 // first char 581 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 582 } 583 if (endsWith) { 584 // last char 585 return input -> input.length() > 0 && contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 586 } 587 } 588 } 589 } 590 return new RPattern() { 591 592 final Pattern pattern = Pattern.compile(regex); 593 594 @Override 595 public boolean isMatch(final CharSequence input) { 596 final Matcher matcher = pattern.matcher(input); 597 return matcher.find(); 598 } 599 }; 600 } 601 602 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 603 if (prefix.length() > input.length()) { 604 return false; 605 } 606 for (int i = 0; i < prefix.length(); i++) { 607 if (input.charAt(i) != prefix.charAt(i)) { 608 return false; 609 } 610 } 611 return true; 612 } 613 614 private static String stripQuotes(String str) { 615 if (str.startsWith(DOUBLE_QUOTE)) { 616 str = str.substring(1); 617 } 618 if (str.endsWith(DOUBLE_QUOTE)) { 619 str = str.substring(0, str.length() - 1); 620 } 621 return str; 622 } 623 624 private final RPattern lContext; 625 private final String pattern; 626 private final PhonemeExpr phoneme; 627 private final RPattern rContext; 628 629 /** 630 * Creates a new rule. 631 * 632 * @param pattern the pattern 633 * @param lContext the left context 634 * @param rContext the right context 635 * @param phoneme the resulting phoneme 636 */ 637 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 638 this.pattern = pattern; 639 this.lContext = pattern(lContext + "$"); 640 this.rContext = pattern("^" + rContext); 641 this.phoneme = phoneme; 642 } 643 644 /** 645 * Gets the left context. This is a regular expression that must match to the left of the pattern. 646 * 647 * @return the left context Pattern 648 */ 649 public RPattern getLContext() { 650 return lContext; 651 } 652 653 /** 654 * Gets the pattern. This is a string-literal that must exactly match. 655 * 656 * @return the pattern 657 */ 658 public String getPattern() { 659 return pattern; 660 } 661 662 /** 663 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 664 * 665 * @return the phoneme 666 */ 667 public PhonemeExpr getPhoneme() { 668 return phoneme; 669 } 670 671 /** 672 * Gets the right context. This is a regular expression that must match to the right of the pattern. 673 * 674 * @return the right context Pattern 675 */ 676 public RPattern getRContext() { 677 return rContext; 678 } 679 680 /** 681 * Decides if the pattern and context match the input starting at a position. It is a match if the {@code lContext} matches {@code input} up to {@code i}, 682 * {@code pattern} matches at i and {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}. 683 * 684 * @param input the input String 685 * @param i the int position within the input 686 * @return true if the pattern and left/right context match, false otherwise 687 */ 688 public boolean patternAndContextMatches(final CharSequence input, final int i) { 689 if (i < 0) { 690 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 691 } 692 final int patternLength = pattern.length(); 693 final int ipl = i + patternLength; 694 if (ipl > input.length()) { 695 // not enough room for the pattern to match 696 return false; 697 } 698 // evaluate the pattern, left context and right context 699 // fail early if any of the evaluations is not successful 700 if (!input.subSequence(i, ipl).equals(pattern)) { 701 return false; 702 } 703 if (!rContext.isMatch(input.subSequence(ipl, input.length()))) { 704 return false; 705 } 706 return lContext.isMatch(input.subSequence(0, i)); 707 } 708}