001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.io.InputStream; 021import java.util.ArrayList; 022import java.util.Arrays; 023import java.util.Collections; 024import java.util.Comparator; 025import java.util.EnumMap; 026import java.util.HashMap; 027import java.util.HashSet; 028import java.util.List; 029import java.util.Map; 030import java.util.Scanner; 031import java.util.Set; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * <ul> 043 * <li>the pattern matches at the current position</li> 044 * <li>the string up until the beginning of the pattern matches the left context</li> 045 * <li>the string from the end of the pattern matches the right context</li> 046 * <li>logical is ALL and all languages are in scope; or</li> 047 * <li>logical is any other value and at least one language is in scope</li> 048 * </ul> 049 * <p> 050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 051 * to explicitly construct their own. 052 * <p> 053 * Rules are immutable and thread-safe. 054 * <p> 055 * <b>Rules resources</b> 056 * <p> 057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 058 * named following the pattern: 059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 060 * <p> 061 * The format of these resources is the following: 062 * <ul> 063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 064 * will be interpreted as: 065 * <ol> 066 * <li>pattern</li> 067 * <li>left context</li> 068 * <li>right context</li> 069 * <li>phoneme</li> 070 * </ol> 071 * </li> 072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 073 * as a comment.</li> 074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 075 * all content until a line ending in '*' and '/' is found.</li> 076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 077 * </ul> 078 * 079 * @since 1.6 080 * @version $Id: Rule.java 1760691 2016-09-14 12:14:26Z jochen $ 081 */ 082public class Rule { 083 084 public static final class Phoneme implements PhonemeExpr { 085 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 086 @Override 087 public int compare(final Phoneme o1, final Phoneme o2) { 088 for (int i = 0; i < o1.phonemeText.length(); i++) { 089 if (i >= o2.phonemeText.length()) { 090 return +1; 091 } 092 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 093 if (c != 0) { 094 return c; 095 } 096 } 097 098 if (o1.phonemeText.length() < o2.phonemeText.length()) { 099 return -1; 100 } 101 102 return 0; 103 } 104 }; 105 106 private final StringBuilder phonemeText; 107 private final Languages.LanguageSet languages; 108 109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 110 this.phonemeText = new StringBuilder(phonemeText); 111 this.languages = languages; 112 } 113 114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 115 this(phonemeLeft.phonemeText, phonemeLeft.languages); 116 this.phonemeText.append(phonemeRight.phonemeText); 117 } 118 119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 120 this(phonemeLeft.phonemeText, languages); 121 this.phonemeText.append(phonemeRight.phonemeText); 122 } 123 124 public Phoneme append(final CharSequence str) { 125 this.phonemeText.append(str); 126 return this; 127 } 128 129 public Languages.LanguageSet getLanguages() { 130 return this.languages; 131 } 132 133 @Override 134 public Iterable<Phoneme> getPhonemes() { 135 return Collections.singleton(this); 136 } 137 138 public CharSequence getPhonemeText() { 139 return this.phonemeText; 140 } 141 142 /** 143 * Deprecated since 1.9. 144 * 145 * @param right the Phoneme to join 146 * @return a new Phoneme 147 * @deprecated since 1.9 148 */ 149 @Deprecated 150 public Phoneme join(final Phoneme right) { 151 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 152 this.languages.restrictTo(right.languages)); 153 } 154 155 /** 156 * Returns a new Phoneme with the same text but a union of its 157 * current language set and the given one. 158 * 159 * @param lang the language set to merge 160 * @return a new Phoneme 161 */ 162 public Phoneme mergeWithLanguage(final LanguageSet lang) { 163 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 164 } 165 166 @Override 167 public String toString() { 168 return phonemeText.toString() + "[" + languages + "]"; 169 } 170 } 171 172 public interface PhonemeExpr { 173 Iterable<Phoneme> getPhonemes(); 174 } 175 176 public static final class PhonemeList implements PhonemeExpr { 177 private final List<Phoneme> phonemes; 178 179 public PhonemeList(final List<Phoneme> phonemes) { 180 this.phonemes = phonemes; 181 } 182 183 @Override 184 public List<Phoneme> getPhonemes() { 185 return this.phonemes; 186 } 187 } 188 189 /** 190 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 191 */ 192 public interface RPattern { 193 boolean isMatch(CharSequence input); 194 } 195 196 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 197 @Override 198 public boolean isMatch(final CharSequence input) { 199 return true; 200 } 201 }; 202 203 public static final String ALL = "ALL"; 204 205 private static final String DOUBLE_QUOTE = "\""; 206 207 private static final String HASH_INCLUDE = "#include"; 208 209 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 210 new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class); 211 212 static { 213 for (final NameType s : NameType.values()) { 214 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 215 new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class); 216 217 for (final RuleType rt : RuleType.values()) { 218 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>(); 219 220 final Languages ls = Languages.getInstance(s); 221 for (final String l : ls.getLanguages()) { 222 final Scanner scanner = createScanner(s, rt, l); 223 try { 224 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 225 } catch (final IllegalStateException e) { 226 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 227 } finally { 228 scanner.close(); 229 } 230 } 231 if (!rt.equals(RuleType.RULES)) { 232 final Scanner scanner = createScanner(s, rt, "common"); 233 try { 234 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 235 } finally { 236 scanner.close(); 237 } 238 } 239 240 rts.put(rt, Collections.unmodifiableMap(rs)); 241 } 242 243 RULES.put(s, Collections.unmodifiableMap(rts)); 244 } 245 } 246 247 private static boolean contains(final CharSequence chars, final char input) { 248 for (int i = 0; i < chars.length(); i++) { 249 if (chars.charAt(i) == input) { 250 return true; 251 } 252 } 253 return false; 254 } 255 256 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 257 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 258 nameType.getName(), rt.getName(), lang); 259 } 260 261 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 262 final String resName = createResourceName(nameType, rt, lang); 263 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 264 265 if (rulesIS == null) { 266 throw new IllegalArgumentException("Unable to load resource: " + resName); 267 } 268 269 return new Scanner(rulesIS, ResourceConstants.ENCODING); 270 } 271 272 private static Scanner createScanner(final String lang) { 273 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 274 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 275 276 if (rulesIS == null) { 277 throw new IllegalArgumentException("Unable to load resource: " + resName); 278 } 279 280 return new Scanner(rulesIS, ResourceConstants.ENCODING); 281 } 282 283 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 284 if (suffix.length() > input.length()) { 285 return false; 286 } 287 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 288 if (input.charAt(i) != suffix.charAt(j)) { 289 return false; 290 } 291 } 292 return true; 293 } 294 295 /** 296 * Gets rules for a combination of name type, rule type and languages. 297 * 298 * @param nameType 299 * the NameType to consider 300 * @param rt 301 * the RuleType to consider 302 * @param langs 303 * the set of languages to consider 304 * @return a list of Rules that apply 305 */ 306 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 307 final Languages.LanguageSet langs) { 308 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 309 final List<Rule> allRules = new ArrayList<Rule>(); 310 for (final List<Rule> rules : ruleMap.values()) { 311 allRules.addAll(rules); 312 } 313 return allRules; 314 } 315 316 /** 317 * Gets rules for a combination of name type, rule type and a single language. 318 * 319 * @param nameType 320 * the NameType to consider 321 * @param rt 322 * the RuleType to consider 323 * @param lang 324 * the language to consider 325 * @return a list of Rules that apply 326 */ 327 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 328 return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang)))); 329 } 330 331 /** 332 * Gets rules for a combination of name type, rule type and languages. 333 * 334 * @param nameType 335 * the NameType to consider 336 * @param rt 337 * the RuleType to consider 338 * @param langs 339 * the set of languages to consider 340 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 341 * @since 1.9 342 */ 343 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 344 final Languages.LanguageSet langs) { 345 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 346 getInstanceMap(nameType, rt, Languages.ANY); 347 } 348 349 /** 350 * Gets rules for a combination of name type, rule type and a single language. 351 * 352 * @param nameType 353 * the NameType to consider 354 * @param rt 355 * the RuleType to consider 356 * @param lang 357 * the language to consider 358 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 359 * @since 1.9 360 */ 361 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 362 final String lang) { 363 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 364 365 if (rules == null) { 366 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 367 nameType.getName(), rt.getName(), lang)); 368 } 369 370 return rules; 371 } 372 373 private static Phoneme parsePhoneme(final String ph) { 374 final int open = ph.indexOf("["); 375 if (open >= 0) { 376 if (!ph.endsWith("]")) { 377 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 378 } 379 final String before = ph.substring(0, open); 380 final String in = ph.substring(open + 1, ph.length() - 1); 381 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); 382 383 return new Phoneme(before, Languages.LanguageSet.from(langs)); 384 } 385 return new Phoneme(ph, Languages.ANY_LANGUAGE); 386 } 387 388 private static PhonemeExpr parsePhonemeExpr(final String ph) { 389 if (ph.startsWith("(")) { // we have a bracketed list of options 390 if (!ph.endsWith(")")) { 391 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 392 } 393 394 final List<Phoneme> phs = new ArrayList<Phoneme>(); 395 final String body = ph.substring(1, ph.length() - 1); 396 for (final String part : body.split("[|]")) { 397 phs.add(parsePhoneme(part)); 398 } 399 if (body.startsWith("|") || body.endsWith("|")) { 400 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 401 } 402 403 return new PhonemeList(phs); 404 } 405 return parsePhoneme(ph); 406 } 407 408 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 409 final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>(); 410 int currentLine = 0; 411 412 boolean inMultilineComment = false; 413 while (scanner.hasNextLine()) { 414 currentLine++; 415 final String rawLine = scanner.nextLine(); 416 String line = rawLine; 417 418 if (inMultilineComment) { 419 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 420 inMultilineComment = false; 421 } 422 } else { 423 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 424 inMultilineComment = true; 425 } else { 426 // discard comments 427 final int cmtI = line.indexOf(ResourceConstants.CMT); 428 if (cmtI >= 0) { 429 line = line.substring(0, cmtI); 430 } 431 432 // trim leading-trailing whitespace 433 line = line.trim(); 434 435 if (line.length() == 0) { 436 continue; // empty lines can be safely skipped 437 } 438 439 if (line.startsWith(HASH_INCLUDE)) { 440 // include statement 441 final String incl = line.substring(HASH_INCLUDE.length()).trim(); 442 if (incl.contains(" ")) { 443 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 444 location); 445 } 446 final Scanner hashIncludeScanner = createScanner(incl); 447 try { 448 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 449 } finally { 450 hashIncludeScanner.close(); 451 } 452 } else { 453 // rule 454 final String[] parts = line.split("\\s+"); 455 if (parts.length != 4) { 456 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 457 " parts: " + rawLine + " in " + location); 458 } 459 try { 460 final String pat = stripQuotes(parts[0]); 461 final String lCon = stripQuotes(parts[1]); 462 final String rCon = stripQuotes(parts[2]); 463 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 464 final int cLine = currentLine; 465 final Rule r = new Rule(pat, lCon, rCon, ph) { 466 private final int myLine = cLine; 467 private final String loc = location; 468 469 @Override 470 public String toString() { 471 final StringBuilder sb = new StringBuilder(); 472 sb.append("Rule"); 473 sb.append("{line=").append(myLine); 474 sb.append(", loc='").append(loc).append('\''); 475 sb.append(", pat='").append(pat).append('\''); 476 sb.append(", lcon='").append(lCon).append('\''); 477 sb.append(", rcon='").append(rCon).append('\''); 478 sb.append('}'); 479 return sb.toString(); 480 } 481 }; 482 final String patternKey = r.pattern.substring(0,1); 483 List<Rule> rules = lines.get(patternKey); 484 if (rules == null) { 485 rules = new ArrayList<Rule>(); 486 lines.put(patternKey, rules); 487 } 488 rules.add(r); 489 } catch (final IllegalArgumentException e) { 490 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 491 location, e); 492 } 493 } 494 } 495 } 496 } 497 498 return lines; 499 } 500 501 /** 502 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 503 * 504 * @param regex 505 * the regular expression to compile 506 * @return an RPattern that will match this regex 507 */ 508 private static RPattern pattern(final String regex) { 509 final boolean startsWith = regex.startsWith("^"); 510 final boolean endsWith = regex.endsWith("$"); 511 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 512 final boolean boxes = content.contains("["); 513 514 if (!boxes) { 515 if (startsWith && endsWith) { 516 // exact match 517 if (content.length() == 0) { 518 // empty 519 return new RPattern() { 520 @Override 521 public boolean isMatch(final CharSequence input) { 522 return input.length() == 0; 523 } 524 }; 525 } 526 return new RPattern() { 527 @Override 528 public boolean isMatch(final CharSequence input) { 529 return input.equals(content); 530 } 531 }; 532 } else if ((startsWith || endsWith) && content.length() == 0) { 533 // matches every string 534 return ALL_STRINGS_RMATCHER; 535 } else if (startsWith) { 536 // matches from start 537 return new RPattern() { 538 @Override 539 public boolean isMatch(final CharSequence input) { 540 return startsWith(input, content); 541 } 542 }; 543 } else if (endsWith) { 544 // matches from start 545 return new RPattern() { 546 @Override 547 public boolean isMatch(final CharSequence input) { 548 return endsWith(input, content); 549 } 550 }; 551 } 552 } else { 553 final boolean startsWithBox = content.startsWith("["); 554 final boolean endsWithBox = content.endsWith("]"); 555 556 if (startsWithBox && endsWithBox) { 557 String boxContent = content.substring(1, content.length() - 1); 558 if (!boxContent.contains("[")) { 559 // box containing alternatives 560 final boolean negate = boxContent.startsWith("^"); 561 if (negate) { 562 boxContent = boxContent.substring(1); 563 } 564 final String bContent = boxContent; 565 final boolean shouldMatch = !negate; 566 567 if (startsWith && endsWith) { 568 // exact match 569 return new RPattern() { 570 @Override 571 public boolean isMatch(final CharSequence input) { 572 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 573 } 574 }; 575 } else if (startsWith) { 576 // first char 577 return new RPattern() { 578 @Override 579 public boolean isMatch(final CharSequence input) { 580 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 581 } 582 }; 583 } else if (endsWith) { 584 // last char 585 return new RPattern() { 586 @Override 587 public boolean isMatch(final CharSequence input) { 588 return input.length() > 0 && 589 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 590 } 591 }; 592 } 593 } 594 } 595 } 596 597 return new RPattern() { 598 Pattern pattern = Pattern.compile(regex); 599 600 @Override 601 public boolean isMatch(final CharSequence input) { 602 final Matcher matcher = pattern.matcher(input); 603 return matcher.find(); 604 } 605 }; 606 } 607 608 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 609 if (prefix.length() > input.length()) { 610 return false; 611 } 612 for (int i = 0; i < prefix.length(); i++) { 613 if (input.charAt(i) != prefix.charAt(i)) { 614 return false; 615 } 616 } 617 return true; 618 } 619 620 private static String stripQuotes(String str) { 621 if (str.startsWith(DOUBLE_QUOTE)) { 622 str = str.substring(1); 623 } 624 625 if (str.endsWith(DOUBLE_QUOTE)) { 626 str = str.substring(0, str.length() - 1); 627 } 628 629 return str; 630 } 631 632 private final RPattern lContext; 633 634 private final String pattern; 635 636 private final PhonemeExpr phoneme; 637 638 private final RPattern rContext; 639 640 /** 641 * Creates a new rule. 642 * 643 * @param pattern 644 * the pattern 645 * @param lContext 646 * the left context 647 * @param rContext 648 * the right context 649 * @param phoneme 650 * the resulting phoneme 651 */ 652 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 653 this.pattern = pattern; 654 this.lContext = pattern(lContext + "$"); 655 this.rContext = pattern("^" + rContext); 656 this.phoneme = phoneme; 657 } 658 659 /** 660 * Gets the left context. This is a regular expression that must match to the left of the pattern. 661 * 662 * @return the left context Pattern 663 */ 664 public RPattern getLContext() { 665 return this.lContext; 666 } 667 668 /** 669 * Gets the pattern. This is a string-literal that must exactly match. 670 * 671 * @return the pattern 672 */ 673 public String getPattern() { 674 return this.pattern; 675 } 676 677 /** 678 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 679 * 680 * @return the phoneme 681 */ 682 public PhonemeExpr getPhoneme() { 683 return this.phoneme; 684 } 685 686 /** 687 * Gets the right context. This is a regular expression that must match to the right of the pattern. 688 * 689 * @return the right context Pattern 690 */ 691 public RPattern getRContext() { 692 return this.rContext; 693 } 694 695 /** 696 * Decides if the pattern and context match the input starting at a position. It is a match if the 697 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 698 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 699 * 700 * @param input 701 * the input String 702 * @param i 703 * the int position within the input 704 * @return true if the pattern and left/right context match, false otherwise 705 */ 706 public boolean patternAndContextMatches(final CharSequence input, final int i) { 707 if (i < 0) { 708 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 709 } 710 711 final int patternLength = this.pattern.length(); 712 final int ipl = i + patternLength; 713 714 if (ipl > input.length()) { 715 // not enough room for the pattern to match 716 return false; 717 } 718 719 // evaluate the pattern, left context and right context 720 // fail early if any of the evaluations is not successful 721 if (!input.subSequence(i, ipl).equals(this.pattern)) { 722 return false; 723 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 724 return false; 725 } 726 return this.lContext.isMatch(input.subSequence(0, i)); 727 } 728}