001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.io.InputStream; 021import java.util.ArrayList; 022import java.util.Arrays; 023import java.util.Collections; 024import java.util.Comparator; 025import java.util.EnumMap; 026import java.util.HashMap; 027import java.util.HashSet; 028import java.util.List; 029import java.util.Map; 030import java.util.Scanner; 031import java.util.Set; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * <ul> 043 * <li>the pattern matches at the current position</li> 044 * <li>the string up until the beginning of the pattern matches the left context</li> 045 * <li>the string from the end of the pattern matches the right context</li> 046 * <li>logical is ALL and all languages are in scope; or</li> 047 * <li>logical is any other value and at least one language is in scope</li> 048 * </ul> 049 * <p> 050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 051 * to explicitly construct their own. 052 * <p> 053 * Rules are immutable and thread-safe. 054 * <p> 055 * <b>Rules resources</b> 056 * <p> 057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 058 * named following the pattern: 059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 060 * <p> 061 * The format of these resources is the following: 062 * <ul> 063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 064 * will be interpreted as: 065 * <ol> 066 * <li>pattern</li> 067 * <li>left context</li> 068 * <li>right context</li> 069 * <li>phoneme</li> 070 * </ol> 071 * </li> 072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 073 * as a comment.</li> 074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 075 * all content until a line ending in '*' and '/' is found.</li> 076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 077 * </ul> 078 * 079 * @since 1.6 080 * @version $Id: Rule.Phoneme.html 928559 2014-11-10 02:53:54Z ggregory $ 081 */ 082public class Rule { 083 084 public static final class Phoneme implements PhonemeExpr { 085 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 086 @Override 087 public int compare(final Phoneme o1, final Phoneme o2) { 088 for (int i = 0; i < o1.phonemeText.length(); i++) { 089 if (i >= o2.phonemeText.length()) { 090 return +1; 091 } 092 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 093 if (c != 0) { 094 return c; 095 } 096 } 097 098 if (o1.phonemeText.length() < o2.phonemeText.length()) { 099 return -1; 100 } 101 102 return 0; 103 } 104 }; 105 106 private final StringBuilder phonemeText; 107 private final Languages.LanguageSet languages; 108 109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 110 this.phonemeText = new StringBuilder(phonemeText); 111 this.languages = languages; 112 } 113 114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 115 this(phonemeLeft.phonemeText, phonemeLeft.languages); 116 this.phonemeText.append(phonemeRight.phonemeText); 117 } 118 119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 120 this(phonemeLeft.phonemeText, languages); 121 this.phonemeText.append(phonemeRight.phonemeText); 122 } 123 124 public Phoneme append(final CharSequence str) { 125 this.phonemeText.append(str); 126 return this; 127 } 128 129 public Languages.LanguageSet getLanguages() { 130 return this.languages; 131 } 132 133 @Override 134 public Iterable<Phoneme> getPhonemes() { 135 return Collections.singleton(this); 136 } 137 138 public CharSequence getPhonemeText() { 139 return this.phonemeText; 140 } 141 142 /** 143 * Deprecated since 1.9. 144 * 145 * @param right the Phoneme to join 146 * @return a new Phoneme 147 * @deprecated since 1.9 148 */ 149 @Deprecated 150 public Phoneme join(final Phoneme right) { 151 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 152 this.languages.restrictTo(right.languages)); 153 } 154 155 /** 156 * Returns a new Phoneme with the same text but a union of its 157 * current language set and the given one. 158 * 159 * @param lang the language set to merge 160 * @return a new Phoneme 161 */ 162 public Phoneme mergeWithLanguage(final LanguageSet lang) { 163 return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang)); 164 } 165 166 @Override 167 public String toString() { 168 return phonemeText.toString() + "[" + languages + "]"; 169 } 170 } 171 172 public interface PhonemeExpr { 173 Iterable<Phoneme> getPhonemes(); 174 } 175 176 public static final class PhonemeList implements PhonemeExpr { 177 private final List<Phoneme> phonemes; 178 179 public PhonemeList(final List<Phoneme> phonemes) { 180 this.phonemes = phonemes; 181 } 182 183 @Override 184 public List<Phoneme> getPhonemes() { 185 return this.phonemes; 186 } 187 } 188 189 /** 190 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 191 */ 192 public interface RPattern { 193 boolean isMatch(CharSequence input); 194 } 195 196 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 197 @Override 198 public boolean isMatch(final CharSequence input) { 199 return true; 200 } 201 }; 202 203 public static final String ALL = "ALL"; 204 205 private static final String DOUBLE_QUOTE = "\""; 206 207 private static final String HASH_INCLUDE = "#include"; 208 209 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 210 new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class); 211 212 static { 213 for (final NameType s : NameType.values()) { 214 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 215 new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class); 216 217 for (final RuleType rt : RuleType.values()) { 218 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>(); 219 220 final Languages ls = Languages.getInstance(s); 221 for (final String l : ls.getLanguages()) { 222 try { 223 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l))); 224 } catch (final IllegalStateException e) { 225 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 226 } 227 } 228 if (!rt.equals(RuleType.RULES)) { 229 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common"))); 230 } 231 232 rts.put(rt, Collections.unmodifiableMap(rs)); 233 } 234 235 RULES.put(s, Collections.unmodifiableMap(rts)); 236 } 237 } 238 239 private static boolean contains(final CharSequence chars, final char input) { 240 for (int i = 0; i < chars.length(); i++) { 241 if (chars.charAt(i) == input) { 242 return true; 243 } 244 } 245 return false; 246 } 247 248 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 249 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 250 nameType.getName(), rt.getName(), lang); 251 } 252 253 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 254 final String resName = createResourceName(nameType, rt, lang); 255 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 256 257 if (rulesIS == null) { 258 throw new IllegalArgumentException("Unable to load resource: " + resName); 259 } 260 261 return new Scanner(rulesIS, ResourceConstants.ENCODING); 262 } 263 264 private static Scanner createScanner(final String lang) { 265 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 266 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 267 268 if (rulesIS == null) { 269 throw new IllegalArgumentException("Unable to load resource: " + resName); 270 } 271 272 return new Scanner(rulesIS, ResourceConstants.ENCODING); 273 } 274 275 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 276 if (suffix.length() > input.length()) { 277 return false; 278 } 279 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 280 if (input.charAt(i) != suffix.charAt(j)) { 281 return false; 282 } 283 } 284 return true; 285 } 286 287 /** 288 * Gets rules for a combination of name type, rule type and languages. 289 * 290 * @param nameType 291 * the NameType to consider 292 * @param rt 293 * the RuleType to consider 294 * @param langs 295 * the set of languages to consider 296 * @return a list of Rules that apply 297 */ 298 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 299 final Languages.LanguageSet langs) { 300 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 301 final List<Rule> allRules = new ArrayList<Rule>(); 302 for (final List<Rule> rules : ruleMap.values()) { 303 allRules.addAll(rules); 304 } 305 return allRules; 306 } 307 308 /** 309 * Gets rules for a combination of name type, rule type and a single language. 310 * 311 * @param nameType 312 * the NameType to consider 313 * @param rt 314 * the RuleType to consider 315 * @param lang 316 * the language to consider 317 * @return a list of Rules that apply 318 */ 319 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 320 return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang)))); 321 } 322 323 /** 324 * Gets rules for a combination of name type, rule type and languages. 325 * 326 * @param nameType 327 * the NameType to consider 328 * @param rt 329 * the RuleType to consider 330 * @param langs 331 * the set of languages to consider 332 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 333 * @since 1.9 334 */ 335 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 336 final Languages.LanguageSet langs) { 337 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 338 getInstanceMap(nameType, rt, Languages.ANY); 339 } 340 341 /** 342 * Gets rules for a combination of name type, rule type and a single language. 343 * 344 * @param nameType 345 * the NameType to consider 346 * @param rt 347 * the RuleType to consider 348 * @param lang 349 * the language to consider 350 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 351 * @since 1.9 352 */ 353 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 354 final String lang) { 355 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 356 357 if (rules == null) { 358 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 359 nameType.getName(), rt.getName(), lang)); 360 } 361 362 return rules; 363 } 364 365 private static Phoneme parsePhoneme(final String ph) { 366 final int open = ph.indexOf("["); 367 if (open >= 0) { 368 if (!ph.endsWith("]")) { 369 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 370 } 371 final String before = ph.substring(0, open); 372 final String in = ph.substring(open + 1, ph.length() - 1); 373 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); 374 375 return new Phoneme(before, Languages.LanguageSet.from(langs)); 376 } else { 377 return new Phoneme(ph, Languages.ANY_LANGUAGE); 378 } 379 } 380 381 private static PhonemeExpr parsePhonemeExpr(final String ph) { 382 if (ph.startsWith("(")) { // we have a bracketed list of options 383 if (!ph.endsWith(")")) { 384 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 385 } 386 387 final List<Phoneme> phs = new ArrayList<Phoneme>(); 388 final String body = ph.substring(1, ph.length() - 1); 389 for (final String part : body.split("[|]")) { 390 phs.add(parsePhoneme(part)); 391 } 392 if (body.startsWith("|") || body.endsWith("|")) { 393 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 394 } 395 396 return new PhonemeList(phs); 397 } else { 398 return parsePhoneme(ph); 399 } 400 } 401 402 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 403 final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>(); 404 int currentLine = 0; 405 406 boolean inMultilineComment = false; 407 while (scanner.hasNextLine()) { 408 currentLine++; 409 final String rawLine = scanner.nextLine(); 410 String line = rawLine; 411 412 if (inMultilineComment) { 413 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 414 inMultilineComment = false; 415 } 416 } else { 417 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 418 inMultilineComment = true; 419 } else { 420 // discard comments 421 final int cmtI = line.indexOf(ResourceConstants.CMT); 422 if (cmtI >= 0) { 423 line = line.substring(0, cmtI); 424 } 425 426 // trim leading-trailing whitespace 427 line = line.trim(); 428 429 if (line.length() == 0) { 430 continue; // empty lines can be safely skipped 431 } 432 433 if (line.startsWith(HASH_INCLUDE)) { 434 // include statement 435 final String incl = line.substring(HASH_INCLUDE.length()).trim(); 436 if (incl.contains(" ")) { 437 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 438 location); 439 } else { 440 lines.putAll(parseRules(createScanner(incl), location + "->" + incl)); 441 } 442 } else { 443 // rule 444 final String[] parts = line.split("\\s+"); 445 if (parts.length != 4) { 446 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 447 " parts: " + rawLine + " in " + location); 448 } else { 449 try { 450 final String pat = stripQuotes(parts[0]); 451 final String lCon = stripQuotes(parts[1]); 452 final String rCon = stripQuotes(parts[2]); 453 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 454 final int cLine = currentLine; 455 final Rule r = new Rule(pat, lCon, rCon, ph) { 456 private final int myLine = cLine; 457 private final String loc = location; 458 459 @Override 460 public String toString() { 461 final StringBuilder sb = new StringBuilder(); 462 sb.append("Rule"); 463 sb.append("{line=").append(myLine); 464 sb.append(", loc='").append(loc).append('\''); 465 sb.append(", pat='").append(pat).append('\''); 466 sb.append(", lcon='").append(lCon).append('\''); 467 sb.append(", rcon='").append(rCon).append('\''); 468 sb.append('}'); 469 return sb.toString(); 470 } 471 }; 472 final String patternKey = r.pattern.substring(0,1); 473 List<Rule> rules = lines.get(patternKey); 474 if (rules == null) { 475 rules = new ArrayList<Rule>(); 476 lines.put(patternKey, rules); 477 } 478 rules.add(r); 479 } catch (final IllegalArgumentException e) { 480 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 481 location, e); 482 } 483 } 484 } 485 } 486 } 487 } 488 489 return lines; 490 } 491 492 /** 493 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 494 * 495 * @param regex 496 * the regular expression to compile 497 * @return an RPattern that will match this regex 498 */ 499 private static RPattern pattern(final String regex) { 500 final boolean startsWith = regex.startsWith("^"); 501 final boolean endsWith = regex.endsWith("$"); 502 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 503 final boolean boxes = content.contains("["); 504 505 if (!boxes) { 506 if (startsWith && endsWith) { 507 // exact match 508 if (content.length() == 0) { 509 // empty 510 return new RPattern() { 511 @Override 512 public boolean isMatch(final CharSequence input) { 513 return input.length() == 0; 514 } 515 }; 516 } else { 517 return new RPattern() { 518 @Override 519 public boolean isMatch(final CharSequence input) { 520 return input.equals(content); 521 } 522 }; 523 } 524 } else if ((startsWith || endsWith) && content.length() == 0) { 525 // matches every string 526 return ALL_STRINGS_RMATCHER; 527 } else if (startsWith) { 528 // matches from start 529 return new RPattern() { 530 @Override 531 public boolean isMatch(final CharSequence input) { 532 return startsWith(input, content); 533 } 534 }; 535 } else if (endsWith) { 536 // matches from start 537 return new RPattern() { 538 @Override 539 public boolean isMatch(final CharSequence input) { 540 return endsWith(input, content); 541 } 542 }; 543 } 544 } else { 545 final boolean startsWithBox = content.startsWith("["); 546 final boolean endsWithBox = content.endsWith("]"); 547 548 if (startsWithBox && endsWithBox) { 549 String boxContent = content.substring(1, content.length() - 1); 550 if (!boxContent.contains("[")) { 551 // box containing alternatives 552 final boolean negate = boxContent.startsWith("^"); 553 if (negate) { 554 boxContent = boxContent.substring(1); 555 } 556 final String bContent = boxContent; 557 final boolean shouldMatch = !negate; 558 559 if (startsWith && endsWith) { 560 // exact match 561 return new RPattern() { 562 @Override 563 public boolean isMatch(final CharSequence input) { 564 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 565 } 566 }; 567 } else if (startsWith) { 568 // first char 569 return new RPattern() { 570 @Override 571 public boolean isMatch(final CharSequence input) { 572 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 573 } 574 }; 575 } else if (endsWith) { 576 // last char 577 return new RPattern() { 578 @Override 579 public boolean isMatch(final CharSequence input) { 580 return input.length() > 0 && 581 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 582 } 583 }; 584 } 585 } 586 } 587 } 588 589 return new RPattern() { 590 Pattern pattern = Pattern.compile(regex); 591 592 @Override 593 public boolean isMatch(final CharSequence input) { 594 final Matcher matcher = pattern.matcher(input); 595 return matcher.find(); 596 } 597 }; 598 } 599 600 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 601 if (prefix.length() > input.length()) { 602 return false; 603 } 604 for (int i = 0; i < prefix.length(); i++) { 605 if (input.charAt(i) != prefix.charAt(i)) { 606 return false; 607 } 608 } 609 return true; 610 } 611 612 private static String stripQuotes(String str) { 613 if (str.startsWith(DOUBLE_QUOTE)) { 614 str = str.substring(1); 615 } 616 617 if (str.endsWith(DOUBLE_QUOTE)) { 618 str = str.substring(0, str.length() - 1); 619 } 620 621 return str; 622 } 623 624 private final RPattern lContext; 625 626 private final String pattern; 627 628 private final PhonemeExpr phoneme; 629 630 private final RPattern rContext; 631 632 /** 633 * Creates a new rule. 634 * 635 * @param pattern 636 * the pattern 637 * @param lContext 638 * the left context 639 * @param rContext 640 * the right context 641 * @param phoneme 642 * the resulting phoneme 643 */ 644 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 645 this.pattern = pattern; 646 this.lContext = pattern(lContext + "$"); 647 this.rContext = pattern("^" + rContext); 648 this.phoneme = phoneme; 649 } 650 651 /** 652 * Gets the left context. This is a regular expression that must match to the left of the pattern. 653 * 654 * @return the left context Pattern 655 */ 656 public RPattern getLContext() { 657 return this.lContext; 658 } 659 660 /** 661 * Gets the pattern. This is a string-literal that must exactly match. 662 * 663 * @return the pattern 664 */ 665 public String getPattern() { 666 return this.pattern; 667 } 668 669 /** 670 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 671 * 672 * @return the phoneme 673 */ 674 public PhonemeExpr getPhoneme() { 675 return this.phoneme; 676 } 677 678 /** 679 * Gets the right context. This is a regular expression that must match to the right of the pattern. 680 * 681 * @return the right context Pattern 682 */ 683 public RPattern getRContext() { 684 return this.rContext; 685 } 686 687 /** 688 * Decides if the pattern and context match the input starting at a position. It is a match if the 689 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 690 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 691 * 692 * @param input 693 * the input String 694 * @param i 695 * the int position within the input 696 * @return true if the pattern and left/right context match, false otherwise 697 */ 698 public boolean patternAndContextMatches(final CharSequence input, final int i) { 699 if (i < 0) { 700 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 701 } 702 703 final int patternLength = this.pattern.length(); 704 final int ipl = i + patternLength; 705 706 if (ipl > input.length()) { 707 // not enough room for the pattern to match 708 return false; 709 } 710 711 // evaluate the pattern, left context and right context 712 // fail early if any of the evaluations is not successful 713 if (!input.subSequence(i, ipl).equals(this.pattern)) { 714 return false; 715 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 716 return false; 717 } 718 return this.lContext.isMatch(input.subSequence(0, i)); 719 } 720}