001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language.bm; 019 020import java.io.InputStream; 021import java.util.ArrayList; 022import java.util.Arrays; 023import java.util.Collections; 024import java.util.Comparator; 025import java.util.EnumMap; 026import java.util.HashMap; 027import java.util.HashSet; 028import java.util.List; 029import java.util.Map; 030import java.util.Scanner; 031import java.util.Set; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034 035import org.apache.commons.codec.language.bm.Languages.LanguageSet; 036 037/** 038 * A phoneme rule. 039 * <p> 040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 041 * and a logical flag indicating if all languages must be in play. A rule matches if: 042 * <ul> 043 * <li>the pattern matches at the current position</li> 044 * <li>the string up until the beginning of the pattern matches the left context</li> 045 * <li>the string from the end of the pattern matches the right context</li> 046 * <li>logical is ALL and all languages are in scope; or</li> 047 * <li>logical is any other value and at least one language is in scope</li> 048 * </ul> 049 * <p> 050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 051 * to explicitly construct their own. 052 * <p> 053 * Rules are immutable and thread-safe. 054 * <p> 055 * <b>Rules resources</b> 056 * <p> 057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 058 * named following the pattern: 059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 060 * <p> 061 * The format of these resources is the following: 062 * <ul> 063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 064 * will be interpreted as: 065 * <ol> 066 * <li>pattern</li> 067 * <li>left context</li> 068 * <li>right context</li> 069 * <li>phoneme</li> 070 * </ol> 071 * </li> 072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 073 * as a comment.</li> 074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 075 * all content until a line ending in '*' and '/' is found.</li> 076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 077 * </ul> 078 * 079 * @since 1.6 080 * @version $Id: Rule.PhonemeExpr.html 891688 2013-12-24 20:49:46Z ggregory $ 081 */ 082public class Rule { 083 084 public static final class Phoneme implements PhonemeExpr { 085 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 086 @Override 087 public int compare(final Phoneme o1, final Phoneme o2) { 088 for (int i = 0; i < o1.phonemeText.length(); i++) { 089 if (i >= o2.phonemeText.length()) { 090 return +1; 091 } 092 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 093 if (c != 0) { 094 return c; 095 } 096 } 097 098 if (o1.phonemeText.length() < o2.phonemeText.length()) { 099 return -1; 100 } 101 102 return 0; 103 } 104 }; 105 106 private final StringBuilder phonemeText; 107 private final Languages.LanguageSet languages; 108 109 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 110 this.phonemeText = new StringBuilder(phonemeText); 111 this.languages = languages; 112 } 113 114 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) { 115 this(phonemeLeft.phonemeText, phonemeLeft.languages); 116 this.phonemeText.append(phonemeRight.phonemeText); 117 } 118 119 public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) { 120 this(phonemeLeft.phonemeText, languages); 121 this.phonemeText.append(phonemeRight.phonemeText); 122 } 123 124 public Phoneme append(final CharSequence str) { 125 this.phonemeText.append(str); 126 return this; 127 } 128 129 public Languages.LanguageSet getLanguages() { 130 return this.languages; 131 } 132 133 @Override 134 public Iterable<Phoneme> getPhonemes() { 135 return Collections.singleton(this); 136 } 137 138 public CharSequence getPhonemeText() { 139 return this.phonemeText; 140 } 141 142 /** 143 * @deprecated since 1.9 144 */ 145 @Deprecated 146 public Phoneme join(final Phoneme right) { 147 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 148 this.languages.restrictTo(right.languages)); 149 } 150 } 151 152 public interface PhonemeExpr { 153 Iterable<Phoneme> getPhonemes(); 154 } 155 156 public static final class PhonemeList implements PhonemeExpr { 157 private final List<Phoneme> phonemes; 158 159 public PhonemeList(final List<Phoneme> phonemes) { 160 this.phonemes = phonemes; 161 } 162 163 @Override 164 public List<Phoneme> getPhonemes() { 165 return this.phonemes; 166 } 167 } 168 169 /** 170 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 171 */ 172 public interface RPattern { 173 boolean isMatch(CharSequence input); 174 } 175 176 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 177 @Override 178 public boolean isMatch(final CharSequence input) { 179 return true; 180 } 181 }; 182 183 public static final String ALL = "ALL"; 184 185 private static final String DOUBLE_QUOTE = "\""; 186 187 private static final String HASH_INCLUDE = "#include"; 188 189 private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = 190 new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class); 191 192 static { 193 for (final NameType s : NameType.values()) { 194 final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts = 195 new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class); 196 197 for (final RuleType rt : RuleType.values()) { 198 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>(); 199 200 final Languages ls = Languages.getInstance(s); 201 for (final String l : ls.getLanguages()) { 202 try { 203 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l))); 204 } catch (final IllegalStateException e) { 205 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 206 } 207 } 208 if (!rt.equals(RuleType.RULES)) { 209 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common"))); 210 } 211 212 rts.put(rt, Collections.unmodifiableMap(rs)); 213 } 214 215 RULES.put(s, Collections.unmodifiableMap(rts)); 216 } 217 } 218 219 private static boolean contains(final CharSequence chars, final char input) { 220 for (int i = 0; i < chars.length(); i++) { 221 if (chars.charAt(i) == input) { 222 return true; 223 } 224 } 225 return false; 226 } 227 228 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 229 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 230 nameType.getName(), rt.getName(), lang); 231 } 232 233 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 234 final String resName = createResourceName(nameType, rt, lang); 235 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 236 237 if (rulesIS == null) { 238 throw new IllegalArgumentException("Unable to load resource: " + resName); 239 } 240 241 return new Scanner(rulesIS, ResourceConstants.ENCODING); 242 } 243 244 private static Scanner createScanner(final String lang) { 245 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 246 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 247 248 if (rulesIS == null) { 249 throw new IllegalArgumentException("Unable to load resource: " + resName); 250 } 251 252 return new Scanner(rulesIS, ResourceConstants.ENCODING); 253 } 254 255 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 256 if (suffix.length() > input.length()) { 257 return false; 258 } 259 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 260 if (input.charAt(i) != suffix.charAt(j)) { 261 return false; 262 } 263 } 264 return true; 265 } 266 267 /** 268 * Gets rules for a combination of name type, rule type and languages. 269 * 270 * @param nameType 271 * the NameType to consider 272 * @param rt 273 * the RuleType to consider 274 * @param langs 275 * the set of languages to consider 276 * @return a list of Rules that apply 277 */ 278 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 279 final Languages.LanguageSet langs) { 280 final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs); 281 final List<Rule> allRules = new ArrayList<Rule>(); 282 for (final List<Rule> rules : ruleMap.values()) { 283 allRules.addAll(rules); 284 } 285 return allRules; 286 } 287 288 /** 289 * Gets rules for a combination of name type, rule type and a single language. 290 * 291 * @param nameType 292 * the NameType to consider 293 * @param rt 294 * the RuleType to consider 295 * @param lang 296 * the language to consider 297 * @return a list of Rules that apply 298 */ 299 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 300 return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang)))); 301 } 302 303 /** 304 * Gets rules for a combination of name type, rule type and languages. 305 * 306 * @param nameType 307 * the NameType to consider 308 * @param rt 309 * the RuleType to consider 310 * @param langs 311 * the set of languages to consider 312 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 313 * @since 1.9 314 */ 315 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 316 final Languages.LanguageSet langs) { 317 return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : 318 getInstanceMap(nameType, rt, Languages.ANY); 319 } 320 321 /** 322 * Gets rules for a combination of name type, rule type and a single language. 323 * 324 * @param nameType 325 * the NameType to consider 326 * @param rt 327 * the RuleType to consider 328 * @param lang 329 * the language to consider 330 * @return a map containing all Rules that apply, grouped by the first character of the rule pattern 331 * @since 1.9 332 */ 333 public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, 334 final String lang) { 335 final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang); 336 337 if (rules == null) { 338 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 339 nameType.getName(), rt.getName(), lang)); 340 } 341 342 return rules; 343 } 344 345 private static Phoneme parsePhoneme(final String ph) { 346 final int open = ph.indexOf("["); 347 if (open >= 0) { 348 if (!ph.endsWith("]")) { 349 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 350 } 351 final String before = ph.substring(0, open); 352 final String in = ph.substring(open + 1, ph.length() - 1); 353 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); 354 355 return new Phoneme(before, Languages.LanguageSet.from(langs)); 356 } else { 357 return new Phoneme(ph, Languages.ANY_LANGUAGE); 358 } 359 } 360 361 private static PhonemeExpr parsePhonemeExpr(final String ph) { 362 if (ph.startsWith("(")) { // we have a bracketed list of options 363 if (!ph.endsWith(")")) { 364 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 365 } 366 367 final List<Phoneme> phs = new ArrayList<Phoneme>(); 368 final String body = ph.substring(1, ph.length() - 1); 369 for (final String part : body.split("[|]")) { 370 phs.add(parsePhoneme(part)); 371 } 372 if (body.startsWith("|") || body.endsWith("|")) { 373 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 374 } 375 376 return new PhonemeList(phs); 377 } else { 378 return parsePhoneme(ph); 379 } 380 } 381 382 private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) { 383 final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>(); 384 int currentLine = 0; 385 386 boolean inMultilineComment = false; 387 while (scanner.hasNextLine()) { 388 currentLine++; 389 final String rawLine = scanner.nextLine(); 390 String line = rawLine; 391 392 if (inMultilineComment) { 393 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 394 inMultilineComment = false; 395 } 396 } else { 397 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 398 inMultilineComment = true; 399 } else { 400 // discard comments 401 final int cmtI = line.indexOf(ResourceConstants.CMT); 402 if (cmtI >= 0) { 403 line = line.substring(0, cmtI); 404 } 405 406 // trim leading-trailing whitespace 407 line = line.trim(); 408 409 if (line.length() == 0) { 410 continue; // empty lines can be safely skipped 411 } 412 413 if (line.startsWith(HASH_INCLUDE)) { 414 // include statement 415 final String incl = line.substring(HASH_INCLUDE.length()).trim(); 416 if (incl.contains(" ")) { 417 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 418 location); 419 } else { 420 lines.putAll(parseRules(createScanner(incl), location + "->" + incl)); 421 } 422 } else { 423 // rule 424 final String[] parts = line.split("\\s+"); 425 if (parts.length != 4) { 426 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 427 " parts: " + rawLine + " in " + location); 428 } else { 429 try { 430 final String pat = stripQuotes(parts[0]); 431 final String lCon = stripQuotes(parts[1]); 432 final String rCon = stripQuotes(parts[2]); 433 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 434 final int cLine = currentLine; 435 final Rule r = new Rule(pat, lCon, rCon, ph) { 436 private final int myLine = cLine; 437 private final String loc = location; 438 439 @Override 440 public String toString() { 441 final StringBuilder sb = new StringBuilder(); 442 sb.append("Rule"); 443 sb.append("{line=").append(myLine); 444 sb.append(", loc='").append(loc).append('\''); 445 sb.append('}'); 446 return sb.toString(); 447 } 448 }; 449 final String patternKey = r.pattern.substring(0,1); 450 List<Rule> rules = lines.get(patternKey); 451 if (rules == null) { 452 rules = new ArrayList<Rule>(); 453 lines.put(patternKey, rules); 454 } 455 rules.add(r); 456 } catch (final IllegalArgumentException e) { 457 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 458 location, e); 459 } 460 } 461 } 462 } 463 } 464 } 465 466 return lines; 467 } 468 469 /** 470 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 471 * 472 * @param regex 473 * the regular expression to compile 474 * @return an RPattern that will match this regex 475 */ 476 private static RPattern pattern(final String regex) { 477 final boolean startsWith = regex.startsWith("^"); 478 final boolean endsWith = regex.endsWith("$"); 479 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 480 final boolean boxes = content.contains("["); 481 482 if (!boxes) { 483 if (startsWith && endsWith) { 484 // exact match 485 if (content.length() == 0) { 486 // empty 487 return new RPattern() { 488 @Override 489 public boolean isMatch(final CharSequence input) { 490 return input.length() == 0; 491 } 492 }; 493 } else { 494 return new RPattern() { 495 @Override 496 public boolean isMatch(final CharSequence input) { 497 return input.equals(content); 498 } 499 }; 500 } 501 } else if ((startsWith || endsWith) && content.length() == 0) { 502 // matches every string 503 return ALL_STRINGS_RMATCHER; 504 } else if (startsWith) { 505 // matches from start 506 return new RPattern() { 507 @Override 508 public boolean isMatch(final CharSequence input) { 509 return startsWith(input, content); 510 } 511 }; 512 } else if (endsWith) { 513 // matches from start 514 return new RPattern() { 515 @Override 516 public boolean isMatch(final CharSequence input) { 517 return endsWith(input, content); 518 } 519 }; 520 } 521 } else { 522 final boolean startsWithBox = content.startsWith("["); 523 final boolean endsWithBox = content.endsWith("]"); 524 525 if (startsWithBox && endsWithBox) { 526 String boxContent = content.substring(1, content.length() - 1); 527 if (!boxContent.contains("[")) { 528 // box containing alternatives 529 final boolean negate = boxContent.startsWith("^"); 530 if (negate) { 531 boxContent = boxContent.substring(1); 532 } 533 final String bContent = boxContent; 534 final boolean shouldMatch = !negate; 535 536 if (startsWith && endsWith) { 537 // exact match 538 return new RPattern() { 539 @Override 540 public boolean isMatch(final CharSequence input) { 541 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 542 } 543 }; 544 } else if (startsWith) { 545 // first char 546 return new RPattern() { 547 @Override 548 public boolean isMatch(final CharSequence input) { 549 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 550 } 551 }; 552 } else if (endsWith) { 553 // last char 554 return new RPattern() { 555 @Override 556 public boolean isMatch(final CharSequence input) { 557 return input.length() > 0 && 558 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 559 } 560 }; 561 } 562 } 563 } 564 } 565 566 return new RPattern() { 567 Pattern pattern = Pattern.compile(regex); 568 569 @Override 570 public boolean isMatch(final CharSequence input) { 571 final Matcher matcher = pattern.matcher(input); 572 return matcher.find(); 573 } 574 }; 575 } 576 577 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 578 if (prefix.length() > input.length()) { 579 return false; 580 } 581 for (int i = 0; i < prefix.length(); i++) { 582 if (input.charAt(i) != prefix.charAt(i)) { 583 return false; 584 } 585 } 586 return true; 587 } 588 589 private static String stripQuotes(String str) { 590 if (str.startsWith(DOUBLE_QUOTE)) { 591 str = str.substring(1); 592 } 593 594 if (str.endsWith(DOUBLE_QUOTE)) { 595 str = str.substring(0, str.length() - 1); 596 } 597 598 return str; 599 } 600 601 private final RPattern lContext; 602 603 private final String pattern; 604 605 private final PhonemeExpr phoneme; 606 607 private final RPattern rContext; 608 609 /** 610 * Creates a new rule. 611 * 612 * @param pattern 613 * the pattern 614 * @param lContext 615 * the left context 616 * @param rContext 617 * the right context 618 * @param phoneme 619 * the resulting phoneme 620 */ 621 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 622 this.pattern = pattern; 623 this.lContext = pattern(lContext + "$"); 624 this.rContext = pattern("^" + rContext); 625 this.phoneme = phoneme; 626 } 627 628 /** 629 * Gets the left context. This is a regular expression that must match to the left of the pattern. 630 * 631 * @return the left context Pattern 632 */ 633 public RPattern getLContext() { 634 return this.lContext; 635 } 636 637 /** 638 * Gets the pattern. This is a string-literal that must exactly match. 639 * 640 * @return the pattern 641 */ 642 public String getPattern() { 643 return this.pattern; 644 } 645 646 /** 647 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 648 * 649 * @return the phoneme 650 */ 651 public PhonemeExpr getPhoneme() { 652 return this.phoneme; 653 } 654 655 /** 656 * Gets the right context. This is a regular expression that must match to the right of the pattern. 657 * 658 * @return the right context Pattern 659 */ 660 public RPattern getRContext() { 661 return this.rContext; 662 } 663 664 /** 665 * Decides if the pattern and context match the input starting at a position. It is a match if the 666 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 667 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 668 * 669 * @param input 670 * the input String 671 * @param i 672 * the int position within the input 673 * @return true if the pattern and left/right context match, false otherwise 674 */ 675 public boolean patternAndContextMatches(final CharSequence input, final int i) { 676 if (i < 0) { 677 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 678 } 679 680 final int patternLength = this.pattern.length(); 681 final int ipl = i + patternLength; 682 683 if (ipl > input.length()) { 684 // not enough room for the pattern to match 685 return false; 686 } 687 688 // evaluate the pattern, left context and right context 689 // fail early if any of the evaluations is not successful 690 if (!input.subSequence(i, ipl).equals(this.pattern)) { 691 return false; 692 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 693 return false; 694 } 695 return this.lContext.isMatch(input.subSequence(0, i)); 696 } 697}