001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.io.InputStream; 021import java.util.ArrayList; 022import java.util.Arrays; 023import java.util.Collections; 024import java.util.Comparator; 025import java.util.EnumMap; 026import java.util.HashMap; 027import java.util.HashSet; 028import java.util.List; 029import java.util.Map; 030import java.util.Scanner; 031import java.util.Set; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * <ul> 043 * <li>the pattern matches at the current position</li> 044 * <li>the string up until the beginning of the pattern matches the left context</li> 045 * <li>the string from the end of the pattern matches the right context</li> 046 * <li>logical is ALL and all languages are in scope; or</li> 047 * <li>logical is any other value and at least one language is in scope</li> 048 * </ul> 049 * <p> 050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 051 * to explicitly construct their own. 052 * <p> 053 * Rules are immutable and thread-safe. 054 * <p> 055 * <b>Rules resources</b> 056 * <p> 057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 058 * named following the pattern: 059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 060 * <p> 061 * The format of these resources is the following: 062 * <ul> 063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 064 * will be interpreted as: 065 * <ol> 066 * <li>pattern</li> 067 * <li>left context</li> 068 * <li>right context</li> 069 * <li>phoneme</li> 070 * </ol> 071 * </li> 072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 073 * as a comment.</li> 074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 075 * all content until a line ending in '*' and '/' is found.</li> 076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 077 * </ul> 078 * 079 * @since 1.6 080 * @version $Id$ 081 */ 082public class Rule { 083 084 public static final class Phoneme implements PhonemeExpr { 085 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 086 @Override 087 public int compare(final Phoneme o1, final Phoneme o2) { 088 for (int i = 0; i < o1.phonemeText.length(); i++) { 089 if (i >= o2.phonemeText.length()) { 090 return +1; 091 } 092 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 093 if (c != 0) { 094 return c; 095 } 096 } 097 098 if (o1.phonemeText.length() < o2.phonemeText.length()) { 099 return -1; 100 } 101 102 return 0; 103 } 104 }; 105 106 private final StringBuilder phonemeText; 107 private final Languages.LanguageSet languages; 108 109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 110 this.phonemeText = new StringBuilder(phonemeText); 111 this.languages = languages; 112 } 113 114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 115 this(phonemeLeft.phonemeText, phonemeLeft.languages); 116 this.phonemeText.append(phonemeRight.phonemeText); 117 } 118 119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 120 this(phonemeLeft.phonemeText, languages); 121 this.phonemeText.append(phonemeRight.phonemeText); 122 } 123 124 public Phoneme append(final CharSequence str) { 125 this.phonemeText.append(str); 126 return this; 127 } 128 129 public Languages.LanguageSet getLanguages() { 130 return this.languages; 131 } 132 133 @Override 134 public Iterable<Phoneme> getPhonemes() { 135 return Collections.singleton(this); 136 } 137 138 public CharSequence getPhonemeText() { 139 return this.phonemeText; 140 } 141 142 /** 143 * Deprecated since 1.9. 144 * 145 * @param right the Phoneme to join 146 * @return a new Phoneme 147 * @deprecated since 1.9 148 */ 149 @Deprecated 150 public Phoneme join(final Phoneme right) { 151 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 152 this.languages.restrictTo(right.languages)); 153 } 154 155 /** 156 * Returns a new Phoneme with the same text but a union of its 157 * current language set and the given one. 158 * 159 * @param lang the language set to merge 160 * @return a new Phoneme 161 */ 162 public Phoneme mergeWithLanguage(final LanguageSet lang) { 163 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 164 } 165 166 @Override 167 public String toString() { 168 return phonemeText.toString() + "[" + languages + "]"; 169 } 170 } 171 172 public interface PhonemeExpr { 173 Iterable<Phoneme> getPhonemes(); 174 } 175 176 public static final class PhonemeList implements PhonemeExpr { 177 private final List<Phoneme> phonemes; 178 179 public PhonemeList(final List<Phoneme> phonemes) { 180 this.phonemes = phonemes; 181 } 182 183 @Override 184 public List<Phoneme> getPhonemes() { 185 return this.phonemes; 186 } 187 } 188 189 /** 190 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 191 */ 192 public interface RPattern { 193 boolean isMatch(CharSequence input); 194 } 195 196 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 197 @Override 198 public boolean isMatch(final CharSequence input) { 199 return true; 200 } 201 }; 202 203 public static final String ALL = "ALL"; 204 205 private static final String DOUBLE_QUOTE = "\""; 206 207 private static final String HASH_INCLUDE = "#include"; 208 209 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 210 new EnumMap<>(NameType.class); 211 212 static { 213 for (final NameType s : NameType.values()) { 214 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 215 new EnumMap<>(RuleType.class); 216 217 for (final RuleType rt : RuleType.values()) { 218 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>(); 219 220 final Languages ls = Languages.getInstance(s); 221 for (final String l : ls.getLanguages()) { 222 try (final Scanner scanner = createScanner(s, rt, l)) { 223 rs.put(l, parseRules(scanner, createResourceName(s, rt, l))); 224 } catch (final IllegalStateException e) { 225 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 226 } 227 } 228 if (!rt.equals(RuleType.RULES)) { 229 try (final Scanner scanner = createScanner(s, rt, "common")) { 230 rs.put("common", parseRules(scanner, createResourceName(s, rt, "common"))); 231 } 232 } 233 234 rts.put(rt, Collections.unmodifiableMap(rs)); 235 } 236 237 RULES.put(s, Collections.unmodifiableMap(rts)); 238 } 239 } 240 241 private static boolean contains(final CharSequence chars, final char input) { 242 for (int i = 0; i < chars.length(); i++) { 243 if (chars.charAt(i) == input) { 244 return true; 245 } 246 } 247 return false; 248 } 249 250 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 251 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 252 nameType.getName(), rt.getName(), lang); 253 } 254 255 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 256 final String resName = createResourceName(nameType, rt, lang); 257 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 258 259 if (rulesIS == null) { 260 throw new IllegalArgumentException("Unable to load resource: " + resName); 261 } 262 263 return new Scanner(rulesIS, ResourceConstants.ENCODING); 264 } 265 266 private static Scanner createScanner(final String lang) { 267 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 268 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 269 270 if (rulesIS == null) { 271 throw new IllegalArgumentException("Unable to load resource: " + resName); 272 } 273 274 return new Scanner(rulesIS, ResourceConstants.ENCODING); 275 } 276 277 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 278 if (suffix.length() > input.length()) { 279 return false; 280 } 281 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 282 if (input.charAt(i) != suffix.charAt(j)) { 283 return false; 284 } 285 } 286 return true; 287 } 288 289 /** 290 * Gets rules for a combination of name type, rule type and languages. 291 * 292 * @param nameType 293 * the NameType to consider 294 * @param rt 295 * the RuleType to consider 296 * @param langs 297 * the set of languages to consider 298 * @return a list of Rules that apply 299 */ 300 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 301 final Languages.LanguageSet langs) { 302 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 303 final List<Rule> allRules = new ArrayList<>(); 304 for (final List<Rule> rules : ruleMap.values()) { 305 allRules.addAll(rules); 306 } 307 return allRules; 308 } 309 310 /** 311 * Gets rules for a combination of name type, rule type and a single language. 312 * 313 * @param nameType 314 * the NameType to consider 315 * @param rt 316 * the RuleType to consider 317 * @param lang 318 * the language to consider 319 * @return a list of Rules that apply 320 */ 321 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 322 return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang)))); 323 } 324 325 /** 326 * Gets rules for a combination of name type, rule type and languages. 327 * 328 * @param nameType 329 * the NameType to consider 330 * @param rt 331 * the RuleType to consider 332 * @param langs 333 * the set of languages to consider 334 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 335 * @since 1.9 336 */ 337 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 338 final Languages.LanguageSet langs) { 339 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 340 getInstanceMap(nameType, rt, Languages.ANY); 341 } 342 343 /** 344 * Gets rules for a combination of name type, rule type and a single language. 345 * 346 * @param nameType 347 * the NameType to consider 348 * @param rt 349 * the RuleType to consider 350 * @param lang 351 * the language to consider 352 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 353 * @since 1.9 354 */ 355 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 356 final String lang) { 357 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 358 359 if (rules == null) { 360 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 361 nameType.getName(), rt.getName(), lang)); 362 } 363 364 return rules; 365 } 366 367 private static Phoneme parsePhoneme(final String ph) { 368 final int open = ph.indexOf("["); 369 if (open >= 0) { 370 if (!ph.endsWith("]")) { 371 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 372 } 373 final String before = ph.substring(0, open); 374 final String in = ph.substring(open + 1, ph.length() - 1); 375 final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]"))); 376 377 return new Phoneme(before, Languages.LanguageSet.from(langs)); 378 } 379 return new Phoneme(ph, Languages.ANY_LANGUAGE); 380 } 381 382 private static PhonemeExpr parsePhonemeExpr(final String ph) { 383 if (ph.startsWith("(")) { // we have a bracketed list of options 384 if (!ph.endsWith(")")) { 385 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 386 } 387 388 final List<Phoneme> phs = new ArrayList<>(); 389 final String body = ph.substring(1, ph.length() - 1); 390 for (final String part : body.split("[|]")) { 391 phs.add(parsePhoneme(part)); 392 } 393 if (body.startsWith("|") || body.endsWith("|")) { 394 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 395 } 396 397 return new PhonemeList(phs); 398 } 399 return parsePhoneme(ph); 400 } 401 402 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 403 final Map<String, List<Rule>> lines = new HashMap<>(); 404 int currentLine = 0; 405 406 boolean inMultilineComment = false; 407 while (scanner.hasNextLine()) { 408 currentLine++; 409 final String rawLine = scanner.nextLine(); 410 String line = rawLine; 411 412 if (inMultilineComment) { 413 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 414 inMultilineComment = false; 415 } 416 } else { 417 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 418 inMultilineComment = true; 419 } else { 420 // discard comments 421 final int cmtI = line.indexOf(ResourceConstants.CMT); 422 if (cmtI >= 0) { 423 line = line.substring(0, cmtI); 424 } 425 426 // trim leading-trailing whitespace 427 line = line.trim(); 428 429 if (line.length() == 0) { 430 continue; // empty lines can be safely skipped 431 } 432 433 if (line.startsWith(HASH_INCLUDE)) { 434 // include statement 435 final String incl = line.substring(HASH_INCLUDE.length()).trim(); 436 if (incl.contains(" ")) { 437 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 438 location); 439 } 440 try (final Scanner hashIncludeScanner = createScanner(incl)) { 441 lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl)); 442 } 443 } else { 444 // rule 445 final String[] parts = line.split("\\s+"); 446 if (parts.length != 4) { 447 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 448 " parts: " + rawLine + " in " + location); 449 } 450 try { 451 final String pat = stripQuotes(parts[0]); 452 final String lCon = stripQuotes(parts[1]); 453 final String rCon = stripQuotes(parts[2]); 454 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 455 final int cLine = currentLine; 456 final Rule r = new Rule(pat, lCon, rCon, ph) { 457 private final int myLine = cLine; 458 private final String loc = location; 459 460 @Override 461 public String toString() { 462 final StringBuilder sb = new StringBuilder(); 463 sb.append("Rule"); 464 sb.append("{line=").append(myLine); 465 sb.append(", loc='").append(loc).append('\''); 466 sb.append(", pat='").append(pat).append('\''); 467 sb.append(", lcon='").append(lCon).append('\''); 468 sb.append(", rcon='").append(rCon).append('\''); 469 sb.append('}'); 470 return sb.toString(); 471 } 472 }; 473 final String patternKey = r.pattern.substring(0,1); 474 List<Rule> rules = lines.get(patternKey); 475 if (rules == null) { 476 rules = new ArrayList<>(); 477 lines.put(patternKey, rules); 478 } 479 rules.add(r); 480 } catch (final IllegalArgumentException e) { 481 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 482 location, e); 483 } 484 } 485 } 486 } 487 } 488 489 return lines; 490 } 491 492 /** 493 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 494 * 495 * @param regex 496 * the regular expression to compile 497 * @return an RPattern that will match this regex 498 */ 499 private static RPattern pattern(final String regex) { 500 final boolean startsWith = regex.startsWith("^"); 501 final boolean endsWith = regex.endsWith("$"); 502 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 503 final boolean boxes = content.contains("["); 504 505 if (!boxes) { 506 if (startsWith && endsWith) { 507 // exact match 508 if (content.length() == 0) { 509 // empty 510 return new RPattern() { 511 @Override 512 public boolean isMatch(final CharSequence input) { 513 return input.length() == 0; 514 } 515 }; 516 } 517 return new RPattern() { 518 @Override 519 public boolean isMatch(final CharSequence input) { 520 return input.equals(content); 521 } 522 }; 523 } else if ((startsWith || endsWith) && content.length() == 0) { 524 // matches every string 525 return ALL_STRINGS_RMATCHER; 526 } else if (startsWith) { 527 // matches from start 528 return new RPattern() { 529 @Override 530 public boolean isMatch(final CharSequence input) { 531 return startsWith(input, content); 532 } 533 }; 534 } else if (endsWith) { 535 // matches from start 536 return new RPattern() { 537 @Override 538 public boolean isMatch(final CharSequence input) { 539 return endsWith(input, content); 540 } 541 }; 542 } 543 } else { 544 final boolean startsWithBox = content.startsWith("["); 545 final boolean endsWithBox = content.endsWith("]"); 546 547 if (startsWithBox && endsWithBox) { 548 String boxContent = content.substring(1, content.length() - 1); 549 if (!boxContent.contains("[")) { 550 // box containing alternatives 551 final boolean negate = boxContent.startsWith("^"); 552 if (negate) { 553 boxContent = boxContent.substring(1); 554 } 555 final String bContent = boxContent; 556 final boolean shouldMatch = !negate; 557 558 if (startsWith && endsWith) { 559 // exact match 560 return new RPattern() { 561 @Override 562 public boolean isMatch(final CharSequence input) { 563 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 564 } 565 }; 566 } else if (startsWith) { 567 // first char 568 return new RPattern() { 569 @Override 570 public boolean isMatch(final CharSequence input) { 571 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 572 } 573 }; 574 } else if (endsWith) { 575 // last char 576 return new RPattern() { 577 @Override 578 public boolean isMatch(final CharSequence input) { 579 return input.length() > 0 && 580 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 581 } 582 }; 583 } 584 } 585 } 586 } 587 588 return new RPattern() { 589 Pattern pattern = Pattern.compile(regex); 590 591 @Override 592 public boolean isMatch(final CharSequence input) { 593 final Matcher matcher = pattern.matcher(input); 594 return matcher.find(); 595 } 596 }; 597 } 598 599 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 600 if (prefix.length() > input.length()) { 601 return false; 602 } 603 for (int i = 0; i < prefix.length(); i++) { 604 if (input.charAt(i) != prefix.charAt(i)) { 605 return false; 606 } 607 } 608 return true; 609 } 610 611 private static String stripQuotes(String str) { 612 if (str.startsWith(DOUBLE_QUOTE)) { 613 str = str.substring(1); 614 } 615 616 if (str.endsWith(DOUBLE_QUOTE)) { 617 str = str.substring(0, str.length() - 1); 618 } 619 620 return str; 621 } 622 623 private final RPattern lContext; 624 625 private final String pattern; 626 627 private final PhonemeExpr phoneme; 628 629 private final RPattern rContext; 630 631 /** 632 * Creates a new rule. 633 * 634 * @param pattern 635 * the pattern 636 * @param lContext 637 * the left context 638 * @param rContext 639 * the right context 640 * @param phoneme 641 * the resulting phoneme 642 */ 643 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 644 this.pattern = pattern; 645 this.lContext = pattern(lContext + "$"); 646 this.rContext = pattern("^" + rContext); 647 this.phoneme = phoneme; 648 } 649 650 /** 651 * Gets the left context. This is a regular expression that must match to the left of the pattern. 652 * 653 * @return the left context Pattern 654 */ 655 public RPattern getLContext() { 656 return this.lContext; 657 } 658 659 /** 660 * Gets the pattern. This is a string-literal that must exactly match. 661 * 662 * @return the pattern 663 */ 664 public String getPattern() { 665 return this.pattern; 666 } 667 668 /** 669 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 670 * 671 * @return the phoneme 672 */ 673 public PhonemeExpr getPhoneme() { 674 return this.phoneme; 675 } 676 677 /** 678 * Gets the right context. This is a regular expression that must match to the right of the pattern. 679 * 680 * @return the right context Pattern 681 */ 682 public RPattern getRContext() { 683 return this.rContext; 684 } 685 686 /** 687 * Decides if the pattern and context match the input starting at a position. It is a match if the 688 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 689 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 690 * 691 * @param input 692 * the input String 693 * @param i 694 * the int position within the input 695 * @return true if the pattern and left/right context match, false otherwise 696 */ 697 public boolean patternAndContextMatches(final CharSequence input, final int i) { 698 if (i < 0) { 699 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 700 } 701 702 final int patternLength = this.pattern.length(); 703 final int ipl = i + patternLength; 704 705 if (ipl > input.length()) { 706 // not enough room for the pattern to match 707 return false; 708 } 709 710 // evaluate the pattern, left context and right context 711 // fail early if any of the evaluations is not successful 712 if (!input.subSequence(i, ipl).equals(this.pattern)) { 713 return false; 714 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 715 return false; 716 } 717 return this.lContext.isMatch(input.subSequence(0, i)); 718 } 719}