001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.codec.language.bm; 019 020 import java.io.InputStream; 021 import java.util.ArrayList; 022 import java.util.Arrays; 023 import java.util.Collections; 024 import java.util.Comparator; 025 import java.util.EnumMap; 026 import java.util.HashMap; 027 import java.util.HashSet; 028 import java.util.List; 029 import java.util.Map; 030 import java.util.Scanner; 031 import java.util.Set; 032 import java.util.regex.Matcher; 033 import java.util.regex.Pattern; 034 035 /** 036 * A phoneme rule. 037 * <p> 038 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply 039 * and a logical flag indicating if all languages must be in play. A rule matches if: 040 * <ul> 041 * <li>the pattern matches at the current position</li> 042 * <li>the string up until the beginning of the pattern matches the left context</li> 043 * <li>the string from the end of the pattern matches the right context</li> 044 * <li>logical is ALL and all languages are in scope; or</li> 045 * <li>logical is any other value and at least one language is in scope</li> 046 * </ul> 047 * <p> 048 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user 049 * to explicitly construct their own. 050 * <p> 051 * Rules are immutable and thread-safe. 052 * <p> 053 * <b>Rules resources</b> 054 * <p> 055 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically 056 * named following the pattern: 057 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote> 058 * <p> 059 * The format of these resources is the following: 060 * <ul> 061 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these 062 * will be interpreted as: 063 * <ol> 064 * <li>pattern</li> 065 * <li>left context</li> 066 * <li>right context</li> 067 * <li>phoneme</li> 068 * </ol> 069 * </li> 070 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded 071 * as a comment.</li> 072 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip 073 * all content until a line ending in '*' and '/' is found.</li> 074 * <li><b>Blank lines:</b> All blank lines will be skipped.</li> 075 * </ul> 076 * 077 * @since 1.6 078 * @version $Id: Rule.PhonemeExpr.html 889935 2013-12-11 05:05:13Z ggregory $ 079 */ 080 public class Rule { 081 082 public static final class Phoneme implements PhonemeExpr { 083 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { 084 @Override 085 public int compare(Phoneme o1, Phoneme o2) { 086 for (int i = 0; i < o1.phonemeText.length(); i++) { 087 if (i >= o2.phonemeText.length()) { 088 return +1; 089 } 090 int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); 091 if (c != 0) { 092 return c; 093 } 094 } 095 096 if (o1.phonemeText.length() < o2.phonemeText.length()) { 097 return -1; 098 } 099 100 return 0; 101 } 102 }; 103 104 private final CharSequence phonemeText; 105 private final Languages.LanguageSet languages; 106 107 public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) { 108 this.phonemeText = phonemeText; 109 this.languages = languages; 110 } 111 112 public Phoneme append(CharSequence str) { 113 return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages); 114 } 115 116 public Languages.LanguageSet getLanguages() { 117 return this.languages; 118 } 119 120 @Override 121 public Iterable<Phoneme> getPhonemes() { 122 return Collections.singleton(this); 123 } 124 125 public CharSequence getPhonemeText() { 126 return this.phonemeText; 127 } 128 129 public Phoneme join(Phoneme right) { 130 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), 131 this.languages.restrictTo(right.languages)); 132 } 133 } 134 135 public interface PhonemeExpr { 136 Iterable<Phoneme> getPhonemes(); 137 } 138 139 public static final class PhonemeList implements PhonemeExpr { 140 private final List<Phoneme> phonemes; 141 142 public PhonemeList(List<Phoneme> phonemes) { 143 this.phonemes = phonemes; 144 } 145 146 @Override 147 public List<Phoneme> getPhonemes() { 148 return this.phonemes; 149 } 150 } 151 152 /** 153 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. 154 */ 155 public static interface RPattern { 156 boolean isMatch(CharSequence input); 157 } 158 159 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { 160 @Override 161 public boolean isMatch(CharSequence input) { 162 return true; 163 } 164 }; 165 166 public static final String ALL = "ALL"; 167 168 private static final String DOUBLE_QUOTE = "\""; 169 170 private static final String HASH_INCLUDE = "#include"; 171 172 private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES = 173 new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class); 174 175 static { 176 for (NameType s : NameType.values()) { 177 Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class); 178 179 for (RuleType rt : RuleType.values()) { 180 Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>(); 181 182 Languages ls = Languages.getInstance(s); 183 for (String l : ls.getLanguages()) { 184 try { 185 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l))); 186 } catch (IllegalStateException e) { 187 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); 188 } 189 } 190 if (!rt.equals(RuleType.RULES)) { 191 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common"))); 192 } 193 194 rts.put(rt, Collections.unmodifiableMap(rs)); 195 } 196 197 RULES.put(s, Collections.unmodifiableMap(rts)); 198 } 199 } 200 201 private static boolean contains(CharSequence chars, char input) { 202 for (int i = 0; i < chars.length(); i++) { 203 if (chars.charAt(i) == input) { 204 return true; 205 } 206 } 207 return false; 208 } 209 210 private static String createResourceName(NameType nameType, RuleType rt, String lang) { 211 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", 212 nameType.getName(), rt.getName(), lang); 213 } 214 215 private static Scanner createScanner(NameType nameType, RuleType rt, String lang) { 216 String resName = createResourceName(nameType, rt, lang); 217 InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 218 219 if (rulesIS == null) { 220 throw new IllegalArgumentException("Unable to load resource: " + resName); 221 } 222 223 return new Scanner(rulesIS, ResourceConstants.ENCODING); 224 } 225 226 private static Scanner createScanner(String lang) { 227 String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); 228 InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); 229 230 if (rulesIS == null) { 231 throw new IllegalArgumentException("Unable to load resource: " + resName); 232 } 233 234 return new Scanner(rulesIS, ResourceConstants.ENCODING); 235 } 236 237 private static boolean endsWith(CharSequence input, CharSequence suffix) { 238 if (suffix.length() > input.length()) { 239 return false; 240 } 241 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { 242 if (input.charAt(i) != suffix.charAt(j)) { 243 return false; 244 } 245 } 246 return true; 247 } 248 249 /** 250 * Gets rules for a combination of name type, rule type and languages. 251 * 252 * @param nameType 253 * the NameType to consider 254 * @param rt 255 * the RuleType to consider 256 * @param langs 257 * the set of languages to consider 258 * @return a list of Rules that apply 259 */ 260 public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) { 261 return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) : 262 getInstance(nameType, rt, Languages.ANY); 263 } 264 265 /** 266 * Gets rules for a combination of name type, rule type and a single language. 267 * 268 * @param nameType 269 * the NameType to consider 270 * @param rt 271 * the RuleType to consider 272 * @param lang 273 * the language to consider 274 * @return a list rules for a combination of name type, rule type and a single language. 275 */ 276 public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) { 277 List<Rule> rules = RULES.get(nameType).get(rt).get(lang); 278 279 if (rules == null) { 280 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", 281 nameType.getName(), rt.getName(), lang)); 282 } 283 284 return rules; 285 } 286 287 private static Phoneme parsePhoneme(String ph) { 288 int open = ph.indexOf("["); 289 if (open >= 0) { 290 if (!ph.endsWith("]")) { 291 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); 292 } 293 String before = ph.substring(0, open); 294 String in = ph.substring(open + 1, ph.length() - 1); 295 Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); 296 297 return new Phoneme(before, Languages.LanguageSet.from(langs)); 298 } else { 299 return new Phoneme(ph, Languages.ANY_LANGUAGE); 300 } 301 } 302 303 private static PhonemeExpr parsePhonemeExpr(String ph) { 304 if (ph.startsWith("(")) { // we have a bracketed list of options 305 if (!ph.endsWith(")")) { 306 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); 307 } 308 309 List<Phoneme> phs = new ArrayList<Phoneme>(); 310 String body = ph.substring(1, ph.length() - 1); 311 for (String part : body.split("[|]")) { 312 phs.add(parsePhoneme(part)); 313 } 314 if (body.startsWith("|") || body.endsWith("|")) { 315 phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); 316 } 317 318 return new PhonemeList(phs); 319 } else { 320 return parsePhoneme(ph); 321 } 322 } 323 324 private static List<Rule> parseRules(final Scanner scanner, final String location) { 325 List<Rule> lines = new ArrayList<Rule>(); 326 int currentLine = 0; 327 328 boolean inMultilineComment = false; 329 while (scanner.hasNextLine()) { 330 currentLine++; 331 String rawLine = scanner.nextLine(); 332 String line = rawLine; 333 334 if (inMultilineComment) { 335 if (line.endsWith(ResourceConstants.EXT_CMT_END)) { 336 inMultilineComment = false; 337 } 338 } else { 339 if (line.startsWith(ResourceConstants.EXT_CMT_START)) { 340 inMultilineComment = true; 341 } else { 342 // discard comments 343 int cmtI = line.indexOf(ResourceConstants.CMT); 344 if (cmtI >= 0) { 345 line = line.substring(0, cmtI); 346 } 347 348 // trim leading-trailing whitespace 349 line = line.trim(); 350 351 if (line.length() == 0) { 352 continue; // empty lines can be safely skipped 353 } 354 355 if (line.startsWith(HASH_INCLUDE)) { 356 // include statement 357 String incl = line.substring(HASH_INCLUDE.length()).trim(); 358 if (incl.contains(" ")) { 359 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + 360 location); 361 } else { 362 lines.addAll(parseRules(createScanner(incl), location + "->" + incl)); 363 } 364 } else { 365 // rule 366 String[] parts = line.split("\\s+"); 367 if (parts.length != 4) { 368 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 369 " parts: " + rawLine + " in " + location); 370 } else { 371 try { 372 String pat = stripQuotes(parts[0]); 373 String lCon = stripQuotes(parts[1]); 374 String rCon = stripQuotes(parts[2]); 375 PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); 376 final int cLine = currentLine; 377 Rule r = new Rule(pat, lCon, rCon, ph) { 378 private final int myLine = cLine; 379 private final String loc = location; 380 381 @Override 382 public String toString() { 383 final StringBuilder sb = new StringBuilder(); 384 sb.append("Rule"); 385 sb.append("{line=").append(myLine); 386 sb.append(", loc='").append(loc).append('\''); 387 sb.append('}'); 388 return sb.toString(); 389 } 390 }; 391 lines.add(r); 392 } catch (IllegalArgumentException e) { 393 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + 394 location, e); 395 } 396 } 397 } 398 } 399 } 400 } 401 402 return lines; 403 } 404 405 /** 406 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 407 * 408 * @param regex 409 * the regular expression to compile 410 * @return an RPattern that will match this regex 411 */ 412 private static RPattern pattern(final String regex) { 413 boolean startsWith = regex.startsWith("^"); 414 boolean endsWith = regex.endsWith("$"); 415 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); 416 boolean boxes = content.contains("["); 417 418 if (!boxes) { 419 if (startsWith && endsWith) { 420 // exact match 421 if (content.length() == 0) { 422 // empty 423 return new RPattern() { 424 @Override 425 public boolean isMatch(CharSequence input) { 426 return input.length() == 0; 427 } 428 }; 429 } else { 430 return new RPattern() { 431 @Override 432 public boolean isMatch(CharSequence input) { 433 return input.equals(content); 434 } 435 }; 436 } 437 } else if ((startsWith || endsWith) && content.length() == 0) { 438 // matches every string 439 return ALL_STRINGS_RMATCHER; 440 } else if (startsWith) { 441 // matches from start 442 return new RPattern() { 443 @Override 444 public boolean isMatch(CharSequence input) { 445 return startsWith(input, content); 446 } 447 }; 448 } else if (endsWith) { 449 // matches from start 450 return new RPattern() { 451 @Override 452 public boolean isMatch(CharSequence input) { 453 return endsWith(input, content); 454 } 455 }; 456 } 457 } else { 458 boolean startsWithBox = content.startsWith("["); 459 boolean endsWithBox = content.endsWith("]"); 460 461 if (startsWithBox && endsWithBox) { 462 String boxContent = content.substring(1, content.length() - 1); 463 if (!boxContent.contains("[")) { 464 // box containing alternatives 465 boolean negate = boxContent.startsWith("^"); 466 if (negate) { 467 boxContent = boxContent.substring(1); 468 } 469 final String bContent = boxContent; 470 final boolean shouldMatch = !negate; 471 472 if (startsWith && endsWith) { 473 // exact match 474 return new RPattern() { 475 @Override 476 public boolean isMatch(CharSequence input) { 477 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; 478 } 479 }; 480 } else if (startsWith) { 481 // first char 482 return new RPattern() { 483 @Override 484 public boolean isMatch(CharSequence input) { 485 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; 486 } 487 }; 488 } else if (endsWith) { 489 // last char 490 return new RPattern() { 491 @Override 492 public boolean isMatch(CharSequence input) { 493 return input.length() > 0 && 494 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; 495 } 496 }; 497 } 498 } 499 } 500 } 501 502 return new RPattern() { 503 Pattern pattern = Pattern.compile(regex); 504 505 @Override 506 public boolean isMatch(CharSequence input) { 507 Matcher matcher = pattern.matcher(input); 508 return matcher.find(); 509 } 510 }; 511 } 512 513 private static boolean startsWith(CharSequence input, CharSequence prefix) { 514 if (prefix.length() > input.length()) { 515 return false; 516 } 517 for (int i = 0; i < prefix.length(); i++) { 518 if (input.charAt(i) != prefix.charAt(i)) { 519 return false; 520 } 521 } 522 return true; 523 } 524 525 private static String stripQuotes(String str) { 526 if (str.startsWith(DOUBLE_QUOTE)) { 527 str = str.substring(1); 528 } 529 530 if (str.endsWith(DOUBLE_QUOTE)) { 531 str = str.substring(0, str.length() - 1); 532 } 533 534 return str; 535 } 536 537 private final RPattern lContext; 538 539 private final String pattern; 540 541 private final PhonemeExpr phoneme; 542 543 private final RPattern rContext; 544 545 /** 546 * Creates a new rule. 547 * 548 * @param pattern 549 * the pattern 550 * @param lContext 551 * the left context 552 * @param rContext 553 * the right context 554 * @param phoneme 555 * the resulting phoneme 556 */ 557 public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) { 558 this.pattern = pattern; 559 this.lContext = pattern(lContext + "$"); 560 this.rContext = pattern("^" + rContext); 561 this.phoneme = phoneme; 562 } 563 564 /** 565 * Gets the left context. This is a regular expression that must match to the left of the pattern. 566 * 567 * @return the left context Pattern 568 */ 569 public RPattern getLContext() { 570 return this.lContext; 571 } 572 573 /** 574 * Gets the pattern. This is a string-literal that must exactly match. 575 * 576 * @return the pattern 577 */ 578 public String getPattern() { 579 return this.pattern; 580 } 581 582 /** 583 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match. 584 * 585 * @return the phoneme 586 */ 587 public PhonemeExpr getPhoneme() { 588 return this.phoneme; 589 } 590 591 /** 592 * Gets the right context. This is a regular expression that must match to the right of the pattern. 593 * 594 * @return the right context Pattern 595 */ 596 public RPattern getRContext() { 597 return this.rContext; 598 } 599 600 /** 601 * Decides if the pattern and context match the input starting at a position. It is a match if the 602 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and 603 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>. 604 * 605 * @param input 606 * the input String 607 * @param i 608 * the int position within the input 609 * @return true if the pattern and left/right context match, false otherwise 610 */ 611 public boolean patternAndContextMatches(CharSequence input, int i) { 612 if (i < 0) { 613 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); 614 } 615 616 int patternLength = this.pattern.length(); 617 int ipl = i + patternLength; 618 619 if (ipl > input.length()) { 620 // not enough room for the pattern to match 621 return false; 622 } 623 624 // evaluate the pattern, left context and right context 625 // fail early if any of the evaluations is not successful 626 if (!input.subSequence(i, ipl).equals(this.pattern)) { 627 return false; 628 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { 629 return false; 630 } 631 return this.lContext.isMatch(input.subSequence(0, i)); 632 } 633 }