001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language.bm; 019 020 import java.io.InputStream; 021 import java.util.ArrayList; 022 import java.util.Arrays; 023 import java.util.Collections; 024 import java.util.Comparator; 025 import java.util.EnumMap; 026 import java.util.HashMap; 027 import java.util.HashSet; 028 import java.util.List; 029 import java.util.Map; 030 import java.util.Scanner; 031 import java.util.Set; 032 import java.util.regex.Matcher; 033 import java.util.regex.Pattern; 034 035 /** 036 * A phoneme rule. 037 * <p> 038 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 039 * and a logical flag indicating if all languages must be in play. A rule matches if: 040 * <ul> 041 * <li>the pattern matches at the current position</li> 042 * <li>the string up until the beginning of the pattern matches the left context</li> 043 * <li>the string from the end of the pattern matches the right context</li> 044 * <li>logical is ALL and all languages are in scope; or</li> 045 * <li>logical is any other value and at least one language is in scope</li> 046 * </ul> 047 * <p> 048 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 049 * to explicitly construct their own. 050 * <p> 051 * Rules are immutable and thread-safe. 052 * <p> 053 * <b>Rules resources</b> 054 * <p> 055 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 056 * named following the pattern: 057 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 058 * <p> 059 * The format of these resources is the following: 060 * <ul> 061 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 062 * will be interpreted as: 063 * <ol> 064 * <li>pattern</li> 065 * <li>left context</li> 066 * <li>right context</li> 067 * <li>phoneme</li> 068 * </ol> 069 * </li> 070 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 071 * as a comment.</li> 072 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 073 * all content until a line ending in '*' and '/' is found.</li> 074 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 075 * </ul> 076 * 077 * @since 1.6 078 * @version $Id: Rule.html 889935 2013-12-11 05:05:13Z ggregory $ 079 */ 080 public class Rule { 081 082 public static final class Phoneme implements PhonemeExpr { 083 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 084 @Override 085 public int compare(final Phoneme o1, final Phoneme o2) { 086 for (int i = 0; i < o1.phonemeText.length(); i++) { 087 if (i >= o2.phonemeText.length()) { 088 return +1; 089 } 090 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 091 if (c != 0) { 092 return c; 093 } 094 } 095 096 if (o1.phonemeText.length() < o2.phonemeText.length()) { 097 return -1; 098 } 099 100 return 0; 101 } 102 }; 103 104 private final CharSequence phonemeText; 105 private final Languages.LanguageSet languages; 106 107 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { 108 this.phonemeText = phonemeText; 109 this.languages = languages; 110 } 111 112 public Phoneme append(final CharSequence str) { 113 return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages); 114 } 115 116 public Languages.LanguageSet getLanguages() { 117 return this.languages; 118 } 119 120 @Override 121 public Iterable<Phoneme> getPhonemes() { 122 return Collections.singleton(this); 123 } 124 125 public CharSequence getPhonemeText() { 126 return this.phonemeText; 127 } 128 129 public Phoneme join(final Phoneme right) { 130 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 131 this.languages.restrictTo(right.languages)); 132 } 133 } 134 135 public interface PhonemeExpr { 136 Iterable<Phoneme> getPhonemes(); 137 } 138 139 public static final class PhonemeList implements PhonemeExpr { 140 private final List<Phoneme> phonemes; 141 142 public PhonemeList(final List<Phoneme> phonemes) { 143 this.phonemes = phonemes; 144 } 145 146 @Override 147 public List<Phoneme> getPhonemes() { 148 return this.phonemes; 149 } 150 } 151 152 /** 153 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 154 */ 155 public static interface RPattern { 156 boolean isMatch(CharSequence input); 157 } 158 159 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 160 @Override 161 public boolean isMatch(final CharSequence input) { 162 return true; 163 } 164 }; 165 166 public static final String ALL = "ALL"; 167 168 private static final String DOUBLE_QUOTE = "\""; 169 170 private static final String HASH_INCLUDE = "#include"; 171 172 private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES = 173 new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class); 174 175 static { 176 for (final NameType s : NameType.values()) { 177 final Map<RuleType, Map<String, List<Rule>>> rts = 178 new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class); 179 180 for (final RuleType rt : RuleType.values()) { 181 final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>(); 182 183 final Languages ls = Languages.getInstance(s); 184 for (final String l : ls.getLanguages()) { 185 try { 186 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l))); 187 } catch (final IllegalStateException e) { 188 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 189 } 190 } 191 if (!rt.equals(RuleType.RULES)) { 192 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common"))); 193 } 194 195 rts.put(rt, Collections.unmodifiableMap(rs)); 196 } 197 198 RULES.put(s, Collections.unmodifiableMap(rts)); 199 } 200 } 201 202 private static boolean contains(final CharSequence chars, final char input) { 203 for (int i = 0; i < chars.length(); i++) { 204 if (chars.charAt(i) == input) { 205 return true; 206 } 207 } 208 return false; 209 } 210 211 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { 212 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 213 nameType.getName(), rt.getName(), lang); 214 } 215 216 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { 217 final String resName = createResourceName(nameType, rt, lang); 218 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 219 220 if (rulesIS == null) { 221 throw new IllegalArgumentException("Unable to load resource: " + resName); 222 } 223 224 return new Scanner(rulesIS, ResourceConstants.ENCODING); 225 } 226 227 private static Scanner createScanner(final String lang) { 228 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 229 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 230 231 if (rulesIS == null) { 232 throw new IllegalArgumentException("Unable to load resource: " + resName); 233 } 234 235 return new Scanner(rulesIS, ResourceConstants.ENCODING); 236 } 237 238 private static boolean endsWith(final CharSequence input, final CharSequence suffix) { 239 if (suffix.length() > input.length()) { 240 return false; 241 } 242 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 243 if (input.charAt(i) != suffix.charAt(j)) { 244 return false; 245 } 246 } 247 return true; 248 } 249 250 /** 251 * Gets rules for a combination of name type, rule type and languages. 252 * 253 * @param nameType 254 * the NameType to consider 255 * @param rt 256 * the RuleType to consider 257 * @param langs 258 * the set of languages to consider 259 * @return a list of Rules that apply 260 */ 261 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, 262 final Languages.LanguageSet langs) { 263 return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) : 264 getInstance(nameType, rt, Languages.ANY); 265 } 266 267 /** 268 * Gets rules for a combination of name type, rule type and a single language. 269 * 270 * @param nameType 271 * the NameType to consider 272 * @param rt 273 * the RuleType to consider 274 * @param lang 275 * the language to consider 276 * @return a list rules for a combination of name type, rule type and a single language. 277 */ 278 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { 279 final List<Rule> rules = RULES.get(nameType).get(rt).get(lang); 280 281 if (rules == null) { 282 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 283 nameType.getName(), rt.getName(), lang)); 284 } 285 286 return rules; 287 } 288 289 private static Phoneme parsePhoneme(final String ph) { 290 final int open = ph.indexOf("["); 291 if (open >= 0) { 292 if (!ph.endsWith("]")) { 293 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 294 } 295 final String before = ph.substring(0, open); 296 final String in = ph.substring(open + 1, ph.length() - 1); 297 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); 298 299 return new Phoneme(before, Languages.LanguageSet.from(langs)); 300 } else { 301 return new Phoneme(ph, Languages.ANY_LANGUAGE); 302 } 303 } 304 305 private static PhonemeExpr parsePhonemeExpr(final String ph) { 306 if (ph.startsWith("(")) { // we have a bracketed list of options 307 if (!ph.endsWith(")")) { 308 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 309 } 310 311 final List<Phoneme> phs = new ArrayList<Phoneme>(); 312 final String body = ph.substring(1, ph.length() - 1); 313 for (final String part : body.split("[|]")) { 314 phs.add(parsePhoneme(part)); 315 } 316 if (body.startsWith("|") || body.endsWith("|")) { 317 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 318 } 319 320 return new PhonemeList(phs); 321 } else { 322 return parsePhoneme(ph); 323 } 324 } 325 326 private static List<Rule> parseRules(final Scanner scanner, final String location) { 327 final List<Rule> lines = new ArrayList<Rule>(); 328 int currentLine = 0; 329 330 boolean inMultilineComment = false; 331 while (scanner.hasNextLine()) { 332 currentLine++; 333 final String rawLine = scanner.nextLine(); 334 String line = rawLine; 335 336 if (inMultilineComment) { 337 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 338 inMultilineComment = false; 339 } 340 } else { 341 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 342 inMultilineComment = true; 343 } else { 344 // discard comments 345 final int cmtI = line.indexOf(ResourceConstants.CMT); 346 if (cmtI >= 0) { 347 line = line.substring(0, cmtI); 348 } 349 350 // trim leading-trailing whitespace 351 line = line.trim(); 352 353 if (line.length() == 0) { 354 continue; // empty lines can be safely skipped 355 } 356 357 if (line.startsWith(HASH_INCLUDE)) { 358 // include statement 359 final String incl = line.substring(HASH_INCLUDE.length()).trim(); 360 if (incl.contains(" ")) { 361 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 362 location); 363 } else { 364 lines.addAll(parseRules(createScanner(incl), location + "->" + incl)); 365 } 366 } else { 367 // rule 368 final String[] parts = line.split("\\s+"); 369 if (parts.length != 4) { 370 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 371 " parts: " + rawLine + " in " + location); 372 } else { 373 try { 374 final String pat = stripQuotes(parts[0]); 375 final String lCon = stripQuotes(parts[1]); 376 final String rCon = stripQuotes(parts[2]); 377 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 378 final int cLine = currentLine; 379 final Rule r = new Rule(pat, lCon, rCon, ph) { 380 private final int myLine = cLine; 381 private final String loc = location; 382 383 @Override 384 public String toString() { 385 final StringBuilder sb = new StringBuilder(); 386 sb.append("Rule"); 387 sb.append("{line=").append(myLine); 388 sb.append(", loc='").append(loc).append('\''); 389 sb.append('}'); 390 return sb.toString(); 391 } 392 }; 393 lines.add(r); 394 } catch (final IllegalArgumentException e) { 395 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 396 location, e); 397 } 398 } 399 } 400 } 401 } 402 } 403 404 return lines; 405 } 406 407 /** 408 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 409 * 410 * @param regex 411 * the regular expression to compile 412 * @return an RPattern that will match this regex 413 */ 414 private static RPattern pattern(final String regex) { 415 final boolean startsWith = regex.startsWith("^"); 416 final boolean endsWith = regex.endsWith("$"); 417 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 418 final boolean boxes = content.contains("["); 419 420 if (!boxes) { 421 if (startsWith && endsWith) { 422 // exact match 423 if (content.length() == 0) { 424 // empty 425 return new RPattern() { 426 @Override 427 public boolean isMatch(final CharSequence input) { 428 return input.length() == 0; 429 } 430 }; 431 } else { 432 return new RPattern() { 433 @Override 434 public boolean isMatch(final CharSequence input) { 435 return input.equals(content); 436 } 437 }; 438 } 439 } else if ((startsWith || endsWith) && content.length() == 0) { 440 // matches every string 441 return ALL_STRINGS_RMATCHER; 442 } else if (startsWith) { 443 // matches from start 444 return new RPattern() { 445 @Override 446 public boolean isMatch(final CharSequence input) { 447 return startsWith(input, content); 448 } 449 }; 450 } else if (endsWith) { 451 // matches from start 452 return new RPattern() { 453 @Override 454 public boolean isMatch(final CharSequence input) { 455 return endsWith(input, content); 456 } 457 }; 458 } 459 } else { 460 final boolean startsWithBox = content.startsWith("["); 461 final boolean endsWithBox = content.endsWith("]"); 462 463 if (startsWithBox && endsWithBox) { 464 String boxContent = content.substring(1, content.length() - 1); 465 if (!boxContent.contains("[")) { 466 // box containing alternatives 467 final boolean negate = boxContent.startsWith("^"); 468 if (negate) { 469 boxContent = boxContent.substring(1); 470 } 471 final String bContent = boxContent; 472 final boolean shouldMatch = !negate; 473 474 if (startsWith && endsWith) { 475 // exact match 476 return new RPattern() { 477 @Override 478 public boolean isMatch(final CharSequence input) { 479 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 480 } 481 }; 482 } else if (startsWith) { 483 // first char 484 return new RPattern() { 485 @Override 486 public boolean isMatch(final CharSequence input) { 487 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 488 } 489 }; 490 } else if (endsWith) { 491 // last char 492 return new RPattern() { 493 @Override 494 public boolean isMatch(final CharSequence input) { 495 return input.length() > 0 && 496 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 497 } 498 }; 499 } 500 } 501 } 502 } 503 504 return new RPattern() { 505 Pattern pattern = Pattern.compile(regex); 506 507 @Override 508 public boolean isMatch(final CharSequence input) { 509 final Matcher matcher = pattern.matcher(input); 510 return matcher.find(); 511 } 512 }; 513 } 514 515 private static boolean startsWith(final CharSequence input, final CharSequence prefix) { 516 if (prefix.length() > input.length()) { 517 return false; 518 } 519 for (int i = 0; i < prefix.length(); i++) { 520 if (input.charAt(i) != prefix.charAt(i)) { 521 return false; 522 } 523 } 524 return true; 525 } 526 527 private static String stripQuotes(String str) { 528 if (str.startsWith(DOUBLE_QUOTE)) { 529 str = str.substring(1); 530 } 531 532 if (str.endsWith(DOUBLE_QUOTE)) { 533 str = str.substring(0, str.length() - 1); 534 } 535 536 return str; 537 } 538 539 private final RPattern lContext; 540 541 private final String pattern; 542 543 private final PhonemeExpr phoneme; 544 545 private final RPattern rContext; 546 547 /** 548 * Creates a new rule. 549 * 550 * @param pattern 551 * the pattern 552 * @param lContext 553 * the left context 554 * @param rContext 555 * the right context 556 * @param phoneme 557 * the resulting phoneme 558 */ 559 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) { 560 this.pattern = pattern; 561 this.lContext = pattern(lContext + "$"); 562 this.rContext = pattern("^" + rContext); 563 this.phoneme = phoneme; 564 } 565 566 /** 567 * Gets the left context. This is a regular expression that must match to the left of the pattern. 568 * 569 * @return the left context Pattern 570 */ 571 public RPattern getLContext() { 572 return this.lContext; 573 } 574 575 /** 576 * Gets the pattern. This is a string-literal that must exactly match. 577 * 578 * @return the pattern 579 */ 580 public String getPattern() { 581 return this.pattern; 582 } 583 584 /** 585 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 586 * 587 * @return the phoneme 588 */ 589 public PhonemeExpr getPhoneme() { 590 return this.phoneme; 591 } 592 593 /** 594 * Gets the right context. This is a regular expression that must match to the right of the pattern. 595 * 596 * @return the right context Pattern 597 */ 598 public RPattern getRContext() { 599 return this.rContext; 600 } 601 602 /** 603 * Decides if the pattern and context match the input starting at a position. It is a match if the 604 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 605 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 606 * 607 * @param input 608 * the input String 609 * @param i 610 * the int position within the input 611 * @return true if the pattern and left/right context match, false otherwise 612 */ 613 public boolean patternAndContextMatches(final CharSequence input, final int i) { 614 if (i < 0) { 615 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 616 } 617 618 final int patternLength = this.pattern.length(); 619 final int ipl = i + patternLength; 620 621 if (ipl > input.length()) { 622 // not enough room for the pattern to match 623 return false; 624 } 625 626 // evaluate the pattern, left context and right context 627 // fail early if any of the evaluations is not successful 628 if (!input.subSequence(i, ipl).equals(this.pattern)) { 629 return false; 630 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 631 return false; 632 } 633 return this.lContext.isMatch(input.subSequence(0, i)); 634 } 635 }