001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.EnumMap; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Scanner; 030import java.util.Set; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import org.apache.commons.codec.Resources; 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * </p> 043 * <ul> 044 * <li>the pattern matches at the current position</li> 045 * <li>the string up until the beginning of the pattern matches the left context</li> 046 * <li>the string from the end of the pattern matches the right context</li> 047 * <li>logical is ALL and all languages are in scope; or</li> 048 * <li>logical is any other value and at least one language is in scope</li> 049 * </ul> 050 * <p> 051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 052 * to explicitly construct their own. 053 * </p> 054 * <p> 055 * Rules are immutable and thread-safe. 056 * </p> 057 * <h2>Rules resources</h2> 058 * <p> 059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 060 * named following the pattern: 061 * </p> 062 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 063 * <p> 064 * The format of these resources is the following: 065 * </p> 066 * <ul> 067 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 068 * will be interpreted as: 069 * <ol> 070 * <li>pattern</li> 071 * <li>left context</li> 072 * <li>right context</li> 073 * <li>phoneme</li> 074 * </ol> 075 * </li> 076 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 077 * as a comment.</li> 078 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 079 * all content until a line ending in '*' and '/' is found.</li> 080 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 081 * </ul> 082 * 083 * @since 1.6 084 */ 085public class Rule { 086 087 public static final class Phoneme implements PhonemeExpr { 088 089 public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> { 090 final int o1Length = o1.phonemeText.length(); 091 final int o2Length = o2.phonemeText.length(); 092 for (int i = 0; i < o1Length; i++) { 093 if (i >= o2Length) { 094 return +1; 095 } 096 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 097 if (c != 0) { 098 return c; 099 } 100 } 101 102 if (o1Length < o2Length) { 103 return -1; 104 } 105 106 return 0; 107 }; 108 private final StringBuilder phonemeText; 109 110 private final Languages.LanguageSet languages; 111 112 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 113 this.phonemeText = new StringBuilder(phonemeText); 114 this.languages = languages; 115 } 116 117 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 118 this(phonemeLeft.phonemeText, phonemeLeft.languages); 119 this.phonemeText.append(phonemeRight.phonemeText); 120 } 121 122 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 123 this(phonemeLeft.phonemeText, languages); 124 this.phonemeText.append(phonemeRight.phonemeText); 125 } 126 127 public Phoneme append(final CharSequence str) { 128 this.phonemeText.append(str); 129 return this; 130 } 131 132 public Languages.LanguageSet getLanguages() { 133 return this.languages; 134 } 135 136 @Override 137 public Iterable<Phoneme> getPhonemes() { 138 return Collections.singleton(this); 139 } 140 141 public CharSequence getPhonemeText() { 142 return this.phonemeText; 143 } 144 145 /** 146 * Deprecated since 1.9. 147 * 148 * @param right the Phoneme to join 149 * @return a new Phoneme 150 * @deprecated since 1.9 151 */ 152 @Deprecated 153 public Phoneme join(final Phoneme right) { 154 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 155 this.languages.restrictTo(right.languages)); 156 } 157 158 /** 159 * Returns a new Phoneme with the same text but a union of its 160 * current language set and the given one. 161 * 162 * @param lang the language set to merge 163 * @return a new Phoneme 164 */ 165 public Phoneme mergeWithLanguage(final LanguageSet lang) { 166 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 167 } 168 169 @Override 170 public String toString() { 171 return phonemeText.toString() + "[" + languages + "]"; 172 } 173 } 174 175 public interface PhonemeExpr { 176 Iterable<Phoneme> getPhonemes(); 177 } 178 179 public static final class PhonemeList implements PhonemeExpr { 180 181 private final List<Phoneme> phonemes; 182 183 public PhonemeList(final List<Phoneme> phonemes) { 184 this.phonemes = phonemes; 185 } 186 187 @Override 188 public List<Phoneme> getPhonemes() { 189 return this.phonemes; 190 } 191 } 192 193 /** 194 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 195 */ 196 public interface RPattern { 197 boolean isMatch(CharSequence input); 198 } 199 200 public static final RPattern ALL_STRINGS_RMATCHER = input -> true; 201 202 public static final String ALL = "ALL"; 203 204 private static final String DOUBLE_QUOTE = "\""; 205 206 private static final String HASH_INCLUDE = "#include"; 207 208 private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length(); 209 210 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 211 new EnumMap<>(NameType.class); 212 213 static { 214 for (final NameType s : NameType.values()) { 215 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 216 new EnumMap<>(RuleType.class); 217 218 for (final RuleType rt : RuleType.values()) { 219 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>(); 220 221 final Languages ls = Languages.getInstance(s); 222 ls.getLanguages().forEach(l -> { 223 try (final Scanner scanner = createScanner(s, rt, l)) { 224 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 225 } catch (final IllegalStateException e) { 226 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 227 } 228 }); 229 if (!rt.equals(RuleType.RULES)) { 230 try (final Scanner scanner = createScanner(s, rt, "common")) { 231 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 232 } 233 } 234 235 rts.put(rt, Collections.unmodifiableMap(rs)); 236 } 237 238 RULES.put(s, Collections.unmodifiableMap(rts)); 239 } 240 } 241 242 private static boolean contains(final CharSequence chars, final char input) { 243 return chars.chars().anyMatch(c -> c == input); 244 } 245 246 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 247 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 248 nameType.getName(), rt.getName(), lang); 249 } 250 251 @SuppressWarnings("resource") // Closing the Scanner closes the resource 252 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 253 final String resName = createResourceName(nameType, rt, lang); 254 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 255 } 256 257 @SuppressWarnings("resource") // Closing the Scanner closes the resource 258 private static Scanner createScanner(final String lang) { 259 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 260 return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING); 261 } 262 263 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 264 final int suffixLength = suffix.length(); 265 final int inputLength = input.length(); 266 267 if (suffixLength > inputLength) { 268 return false; 269 } 270 for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) { 271 if (input.charAt(i) != suffix.charAt(j)) { 272 return false; 273 } 274 } 275 return true; 276 } 277 278 /** 279 * Gets rules for a combination of name type, rule type and languages. 280 * 281 * @param nameType 282 * the NameType to consider 283 * @param rt 284 * the RuleType to consider 285 * @param langs 286 * the set of languages to consider 287 * @return a list of Rules that apply 288 */ 289 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 290 final Languages.LanguageSet langs) { 291 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 292 final List<Rule> allRules = new ArrayList<>(); 293 ruleMap.values().forEach(rules -> allRules.addAll(rules)); 294 return allRules; 295 } 296 297 /** 298 * Gets rules for a combination of name type, rule type and a single language. 299 * 300 * @param nameType 301 * the NameType to consider 302 * @param rt 303 * the RuleType to consider 304 * @param lang 305 * the language to consider 306 * @return a list of Rules that apply 307 */ 308 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 309 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 310 } 311 312 /** 313 * Gets rules for a combination of name type, rule type and languages. 314 * 315 * @param nameType 316 * the NameType to consider 317 * @param rt 318 * the RuleType to consider 319 * @param langs 320 * the set of languages to consider 321 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 322 * @since 1.9 323 */ 324 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 325 final Languages.LanguageSet langs) { 326 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 327 getInstanceMap(nameType, rt, Languages.ANY); 328 } 329 330 /** 331 * Gets rules for a combination of name type, rule type and a single language. 332 * 333 * @param nameType 334 * the NameType to consider 335 * @param rt 336 * the RuleType to consider 337 * @param lang 338 * the language to consider 339 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 340 * @since 1.9 341 */ 342 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 343 final String lang) { 344 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 345 346 if (rules == null) { 347 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 348 nameType.getName(), rt.getName(), lang)); 349 } 350 351 return rules; 352 } 353 354 private static Phoneme parsePhoneme(final String ph) { 355 final int open = ph.indexOf("["); 356 if (open >= 0) { 357 if (!ph.endsWith("]")) { 358 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 359 } 360 final String before = ph.substring(0, open); 361 final String in = ph.substring(open + 1, ph.length() - 1); 362 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); 363 364 return new Phoneme(before, Languages.LanguageSet.from(langs)); 365 } 366 return new Phoneme(ph, Languages.ANY_LANGUAGE); 367 } 368 369 private static PhonemeExpr parsePhonemeExpr(final String ph) { 370 if (ph.startsWith("(")) { // we have a bracketed list of options 371 if (!ph.endsWith(")")) { 372 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 373 } 374 375 final List<Phoneme> phs = new ArrayList<>(); 376 final String body = ph.substring(1, ph.length() - 1); 377 for (final String part : body.split("[|]")) { 378 phs.add(parsePhoneme(part)); 379 } 380 if (body.startsWith("|") || body.endsWith("|")) { 381 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 382 } 383 384 return new PhonemeList(phs); 385 } 386 return parsePhoneme(ph); 387 } 388 389 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 390 final Map<String, List<Rule>> lines = new HashMap<>(); 391 int currentLine = 0; 392 393 boolean inMultilineComment = false; 394 while (scanner.hasNextLine()) { 395 currentLine++; 396 final String rawLine = scanner.nextLine(); 397 String line = rawLine; 398 399 if (inMultilineComment) { 400 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 401 inMultilineComment = false; 402 } 403 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 404 inMultilineComment = true; 405 } else { 406 // discard comments 407 final int cmtI = line.indexOf(ResourceConstants.CMT); 408 if (cmtI >= 0) { 409 line = line.substring(0, cmtI); 410 } 411 412 // trim leading-trailing whitespace 413 line = line.trim(); 414 415 if (line.isEmpty()) { 416 continue; // empty lines can be safely skipped 417 } 418 419 if (line.startsWith(HASH_INCLUDE)) { 420 // include statement 421 final String incl = line.substring(HASH_INCLUDE_LENGTH).trim(); 422 if (incl.contains(" ")) { 423 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 424 location); 425 } 426 try (final Scanner hashIncludeScanner = createScanner(incl)) { 427 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 428 } 429 } else { 430 // rule 431 final String[] parts = line.split("\\s+"); 432 if (parts.length != 4) { 433 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 434 " parts: " + rawLine + " in " + location); 435 } 436 try { 437 final String pat = stripQuotes(parts[0]); 438 final String lCon = stripQuotes(parts[1]); 439 final String rCon = stripQuotes(parts[2]); 440 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 441 final int cLine = currentLine; 442 final Rule r = new Rule(pat, lCon, rCon, ph) { 443 private final int myLine = cLine; 444 private final String loc = location; 445 446 @Override 447 public String toString() { 448 final StringBuilder sb = new StringBuilder(); 449 sb.append("Rule"); 450 sb.append("{line=").append(myLine); 451 sb.append(", loc='").append(loc).append('\''); 452 sb.append(", pat='").append(pat).append('\''); 453 sb.append(", lcon='").append(lCon).append('\''); 454 sb.append(", rcon='").append(rCon).append('\''); 455 sb.append('}'); 456 return sb.toString(); 457 } 458 }; 459 final String patternKey = r.pattern.substring(0, 1); 460 final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>()); 461 rules.add(r); 462 } catch (final IllegalArgumentException e) { 463 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 464 location, e); 465 } 466 } 467 } 468 } 469 470 return lines; 471 } 472 473 /** 474 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 475 * 476 * @param regex 477 * the regular expression to compile 478 * @return an RPattern that will match this regex 479 */ 480 private static RPattern pattern(final String regex) { 481 final boolean startsWith = regex.startsWith("^"); 482 final boolean endsWith = regex.endsWith("$"); 483 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 484 final boolean boxes = content.contains("["); 485 486 if (!boxes) { 487 if (startsWith && endsWith) { 488 // exact match 489 if (content.isEmpty()) { 490 // empty 491 return input -> input.length() == 0; 492 } 493 return input -> input.equals(content); 494 } 495 if ((startsWith || endsWith) && content.isEmpty()) { 496 // matches every string 497 return ALL_STRINGS_RMATCHER; 498 } 499 if (startsWith) { 500 // matches from start 501 return input -> startsWith(input, content); 502 } 503 if (endsWith) { 504 // matches from start 505 return input -> endsWith(input, content); 506 } 507 } else { 508 final boolean startsWithBox = content.startsWith("["); 509 final boolean endsWithBox = content.endsWith("]"); 510 511 if (startsWithBox && endsWithBox) { 512 String boxContent = content.substring(1, content.length() - 1); 513 if (!boxContent.contains("[")) { 514 // box containing alternatives 515 final boolean negate = boxContent.startsWith("^"); 516 if (negate) { 517 boxContent = boxContent.substring(1); 518 } 519 final String bContent = boxContent; 520 final boolean shouldMatch = !negate; 521 522 if (startsWith && endsWith) { 523 // exact match 524 return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 525 } 526 if (startsWith) { 527 // first char 528 return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 529 } 530 if (endsWith) { 531 // last char 532 return input -> input.length() > 0 && 533 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 534 } 535 } 536 } 537 } 538 539 return new RPattern() { 540 final Pattern pattern = Pattern.compile(regex); 541 542 @Override 543 public boolean isMatch(final CharSequence input) { 544 final Matcher matcher = pattern.matcher(input); 545 return matcher.find(); 546 } 547 }; 548 } 549 550 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 551 if (prefix.length() > input.length()) { 552 return false; 553 } 554 for (int i = 0; i < prefix.length(); i++) { 555 if (input.charAt(i) != prefix.charAt(i)) { 556 return false; 557 } 558 } 559 return true; 560 } 561 562 private static String stripQuotes(String str) { 563 if (str.startsWith(DOUBLE_QUOTE)) { 564 str = str.substring(1); 565 } 566 567 if (str.endsWith(DOUBLE_QUOTE)) { 568 str = str.substring(0, str.length() - 1); 569 } 570 571 return str; 572 } 573 574 private final RPattern lContext; 575 576 private final String pattern; 577 578 private final PhonemeExpr phoneme; 579 580 private final RPattern rContext; 581 582 /** 583 * Creates a new rule. 584 * 585 * @param pattern 586 * the pattern 587 * @param lContext 588 * the left context 589 * @param rContext 590 * the right context 591 * @param phoneme 592 * the resulting phoneme 593 */ 594 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 595 this.pattern = pattern; 596 this.lContext = pattern(lContext + "$"); 597 this.rContext = pattern("^" + rContext); 598 this.phoneme = phoneme; 599 } 600 601 /** 602 * Gets the left context. This is a regular expression that must match to the left of the pattern. 603 * 604 * @return the left context Pattern 605 */ 606 public RPattern getLContext() { 607 return this.lContext; 608 } 609 610 /** 611 * Gets the pattern. This is a string-literal that must exactly match. 612 * 613 * @return the pattern 614 */ 615 public String getPattern() { 616 return this.pattern; 617 } 618 619 /** 620 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 621 * 622 * @return the phoneme 623 */ 624 public PhonemeExpr getPhoneme() { 625 return this.phoneme; 626 } 627 628 /** 629 * Gets the right context. This is a regular expression that must match to the right of the pattern. 630 * 631 * @return the right context Pattern 632 */ 633 public RPattern getRContext() { 634 return this.rContext; 635 } 636 637 /** 638 * Decides if the pattern and context match the input starting at a position. It is a match if the 639 * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and 640 * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}. 641 * 642 * @param input 643 * the input String 644 * @param i 645 * the int position within the input 646 * @return true if the pattern and left/right context match, false otherwise 647 */ 648 public boolean patternAndContextMatches(final CharSequence input, final int i) { 649 if (i < 0) { 650 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 651 } 652 653 final int patternLength = this.pattern.length(); 654 final int ipl = i + patternLength; 655 656 if (ipl > input.length()) { 657 // not enough room for the pattern to match 658 return false; 659 } 660 661 // evaluate the pattern, left context and right context 662 // fail early if any of the evaluations is not successful 663 if (!input.subSequence(i, ipl).equals(this.pattern)) { 664 return false; 665 } 666 if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 667 return false; 668 } 669 return this.lContext.isMatch(input.subSequence(0, i)); 670 } 671}