Rule.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.commons.codec.language.bm;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collections;
- import java.util.Comparator;
- import java.util.EnumMap;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Map;
- import java.util.Scanner;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.commons.codec.Resources;
- import org.apache.commons.codec.language.bm.Languages.LanguageSet;
- /**
- * A phoneme rule.
- * <p>
- * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
- * and a logical flag indicating if all languages must be in play. A rule matches if:
- * </p>
- * <ul>
- * <li>the pattern matches at the current position</li>
- * <li>the string up until the beginning of the pattern matches the left context</li>
- * <li>the string from the end of the pattern matches the right context</li>
- * <li>logical is ALL and all languages are in scope; or</li>
- * <li>logical is any other value and at least one language is in scope</li>
- * </ul>
- * <p>
- * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
- * to explicitly construct their own.
- * </p>
- * <p>
- * Rules are immutable and thread-safe.
- * </p>
- * <h2>Rules resources</h2>
- * <p>
- * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
- * named following the pattern:
- * </p>
- * <blockquote>/org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
- * <p>
- * The format of these resources is the following:
- * </p>
- * <ul>
- * <li><strong>Rules:</strong> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
- * will be interpreted as:
- * <ol>
- * <li>pattern</li>
- * <li>left context</li>
- * <li>right context</li>
- * <li>phoneme</li>
- * </ol>
- * </li>
- * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be discarded
- * as a comment.</li>
- * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode. This will skip
- * all content until a line ending in '*' and '/' is found.</li>
- * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
- * </ul>
- *
- * @since 1.6
- */
- public class Rule {
- /**
- * A phoneme.
- */
- public static final class Phoneme implements PhonemeExpr {
- /**
- * The Phoneme Comparator.
- */
- public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
- final int o1Length = o1.phonemeText.length();
- final int o2Length = o2.phonemeText.length();
- for (int i = 0; i < o1Length; i++) {
- if (i >= o2Length) {
- return +1;
- }
- final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
- if (c != 0) {
- return c;
- }
- }
- if (o1Length < o2Length) {
- return -1;
- }
- return 0;
- };
- private final StringBuilder phonemeText;
- private final Languages.LanguageSet languages;
- /**
- * Constructs a new instance.
- *
- * @param phonemeText The phoneme text.
- * @param languages A language set.
- */
- public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
- this.phonemeText = new StringBuilder(phonemeText);
- this.languages = languages;
- }
- /**
- * Constructs a new instance.
- *
- * @param phonemeLeft The left phoneme text.
- * @param phonemeRight The right phoneme text.
- */
- public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
- this(phonemeLeft.phonemeText, phonemeLeft.languages);
- this.phonemeText.append(phonemeRight.phonemeText);
- }
- /**
- * Constructs a new instance.
- *
- * @param phonemeLeft The left phoneme text.
- * @param phonemeRight The right phoneme text.
- * @param languages A language set.
- */
- public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
- this(phonemeLeft.phonemeText, languages);
- this.phonemeText.append(phonemeRight.phonemeText);
- }
- /**
- * Appends the sequence to the phone text.
- *
- * @param sequence The sequence to append.
- * @return this instance.
- */
- public Phoneme append(final CharSequence sequence) {
- this.phonemeText.append(sequence);
- return this;
- }
- /**
- * Gets the language set.
- *
- * @return the language set.
- */
- public Languages.LanguageSet getLanguages() {
- return this.languages;
- }
- @Override
- public Iterable<Phoneme> getPhonemes() {
- return Collections.singleton(this);
- }
- /**
- * Gets the phoneme text sequence.
- *
- * @return the phoneme text sequence.
- */
- public CharSequence getPhonemeText() {
- return this.phonemeText;
- }
- /**
- * Deprecated since 1.9.
- *
- * @param right the Phoneme to join
- * @return a new Phoneme
- * @deprecated since 1.9
- */
- @Deprecated
- public Phoneme join(final Phoneme right) {
- return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
- this.languages.restrictTo(right.languages));
- }
- /**
- * Returns a new Phoneme with the same text but a union of its
- * current language set and the given one.
- *
- * @param lang the language set to merge
- * @return a new Phoneme
- */
- public Phoneme mergeWithLanguage(final LanguageSet lang) {
- return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
- }
- @Override
- public int size() {
- return 1;
- }
- @Override
- public String toString() {
- return phonemeText.toString() + "[" + languages + "]";
- }
- }
- /**
- * A phoneme expression.
- */
- public interface PhonemeExpr {
- /**
- * Gets an iteration of phonemes.
- *
- * @return an iteration of phonemes.
- */
- Iterable<Phoneme> getPhonemes();
- /**
- * Gets the expression size in phonemes.
- *
- * @return the expression size in phonemes.
- * @since 1.17.0
- */
- default int size() {
- // All implementations are int-bound.
- return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
- }
- }
- /**
- * A list of phonemes.
- */
- public static final class PhonemeList implements PhonemeExpr {
- private final List<Phoneme> phonemeList;
- /**
- * Constructs a new instance.
- *
- * @param phonemes the phoneme list.
- */
- public PhonemeList(final List<Phoneme> phonemes) {
- this.phonemeList = phonemes;
- }
- @Override
- public List<Phoneme> getPhonemes() {
- return phonemeList;
- }
- @Override
- public int size() {
- return phonemeList.size();
- }
- }
- /**
- * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
- */
- public interface RPattern {
- /**
- * Tests whether the given input matches this instance.
- *
- * @param input the input to test.
- * @return whether the given input matches this instance.
- */
- boolean isMatch(CharSequence input);
- }
- /**
- * Always matches.
- */
- public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
- /**
- * Unused.
- */
- public static final String ALL = "ALL";
- private static final String DOUBLE_QUOTE = "\"";
- private static final String HASH_INCLUDE = "#include";
- private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
- private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
- new EnumMap<>(NameType.class);
- static {
- for (final NameType s : NameType.values()) {
- final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
- new EnumMap<>(RuleType.class);
- for (final RuleType rt : RuleType.values()) {
- final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
- final Languages ls = Languages.getInstance(s);
- ls.getLanguages().forEach(l -> {
- try (Scanner scanner = createScanner(s, rt, l)) {
- rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
- } catch (final IllegalStateException e) {
- throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
- }
- });
- if (!rt.equals(RuleType.RULES)) {
- try (Scanner scanner = createScanner(s, rt, "common")) {
- rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
- }
- }
- rts.put(rt, Collections.unmodifiableMap(rs));
- }
- RULES.put(s, Collections.unmodifiableMap(rts));
- }
- }
- private static boolean contains(final CharSequence chars, final char input) {
- return chars.chars().anyMatch(c -> c == input);
- }
- private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
- return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt",
- nameType.getName(), rt.getName(), lang);
- }
- @SuppressWarnings("resource") // Closing the Scanner closes the resource
- private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
- final String resName = createResourceName(nameType, rt, lang);
- return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
- }
- @SuppressWarnings("resource") // Closing the Scanner closes the resource
- private static Scanner createScanner(final String lang) {
- final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang);
- return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
- }
- private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
- final int suffixLength = suffix.length();
- final int inputLength = input.length();
- if (suffixLength > inputLength) {
- return false;
- }
- for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
- if (input.charAt(i) != suffix.charAt(j)) {
- return false;
- }
- }
- return true;
- }
- /**
- * Gets rules for a combination of name type, rule type and languages.
- *
- * @param nameType
- * the NameType to consider
- * @param rt
- * the RuleType to consider
- * @param langs
- * the set of languages to consider
- * @return a list of Rules that apply
- */
- public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
- final Languages.LanguageSet langs) {
- final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
- final List<Rule> allRules = new ArrayList<>();
- ruleMap.values().forEach(rules -> allRules.addAll(rules));
- return allRules;
- }
- /**
- * Gets rules for a combination of name type, rule type and a single language.
- *
- * @param nameType
- * the NameType to consider
- * @param rt
- * the RuleType to consider
- * @param lang
- * the language to consider
- * @return a list of Rules that apply
- */
- public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
- return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
- }
- /**
- * Gets rules for a combination of name type, rule type and languages.
- *
- * @param nameType
- * the NameType to consider
- * @param rt
- * the RuleType to consider
- * @param langs
- * the set of languages to consider
- * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
- * @since 1.9
- */
- public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
- final Languages.LanguageSet langs) {
- return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
- getInstanceMap(nameType, rt, Languages.ANY);
- }
- /**
- * Gets rules for a combination of name type, rule type and a single language.
- *
- * @param nameType
- * the NameType to consider
- * @param rt
- * the RuleType to consider
- * @param lang
- * the language to consider
- * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
- * @since 1.9
- */
- public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
- final String lang) {
- final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
- if (rules == null) {
- throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
- nameType.getName(), rt.getName(), lang));
- }
- return rules;
- }
- private static Phoneme parsePhoneme(final String ph) {
- final int open = ph.indexOf("[");
- if (open >= 0) {
- if (!ph.endsWith("]")) {
- throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
- }
- final String before = ph.substring(0, open);
- final String in = ph.substring(open + 1, ph.length() - 1);
- final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
- return new Phoneme(before, Languages.LanguageSet.from(langs));
- }
- return new Phoneme(ph, Languages.ANY_LANGUAGE);
- }
- private static PhonemeExpr parsePhonemeExpr(final String ph) {
- if (ph.startsWith("(")) { // we have a bracketed list of options
- if (!ph.endsWith(")")) {
- throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
- }
- final List<Phoneme> phs = new ArrayList<>();
- final String body = ph.substring(1, ph.length() - 1);
- for (final String part : body.split("[|]")) {
- phs.add(parsePhoneme(part));
- }
- if (body.startsWith("|") || body.endsWith("|")) {
- phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
- }
- return new PhonemeList(phs);
- }
- return parsePhoneme(ph);
- }
- private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
- final Map<String, List<Rule>> lines = new HashMap<>();
- int currentLine = 0;
- boolean inMultilineComment = false;
- while (scanner.hasNextLine()) {
- currentLine++;
- final String rawLine = scanner.nextLine();
- String line = rawLine;
- if (inMultilineComment) {
- if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
- inMultilineComment = false;
- }
- } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
- inMultilineComment = true;
- } else {
- // discard comments
- final int cmtI = line.indexOf(ResourceConstants.CMT);
- if (cmtI >= 0) {
- line = line.substring(0, cmtI);
- }
- // trim leading-trailing whitespace
- line = line.trim();
- if (line.isEmpty()) {
- continue; // empty lines can be safely skipped
- }
- if (line.startsWith(HASH_INCLUDE)) {
- // include statement
- final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
- if (incl.contains(" ")) {
- throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
- location);
- }
- try (Scanner hashIncludeScanner = createScanner(incl)) {
- lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
- }
- } else {
- // rule
- final String[] parts = line.split("\\s+");
- if (parts.length != 4) {
- throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
- " parts: " + rawLine + " in " + location);
- }
- try {
- final String pat = stripQuotes(parts[0]);
- final String lCon = stripQuotes(parts[1]);
- final String rCon = stripQuotes(parts[2]);
- final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
- final int cLine = currentLine;
- final Rule r = new Rule(pat, lCon, rCon, ph) {
- private final int myLine = cLine;
- private final String loc = location;
- @Override
- public String toString() {
- final StringBuilder sb = new StringBuilder();
- sb.append("Rule");
- sb.append("{line=").append(myLine);
- sb.append(", loc='").append(loc).append('\'');
- sb.append(", pat='").append(pat).append('\'');
- sb.append(", lcon='").append(lCon).append('\'');
- sb.append(", rcon='").append(rCon).append('\'');
- sb.append('}');
- return sb.toString();
- }
- };
- final String patternKey = r.pattern.substring(0, 1);
- final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
- rules.add(r);
- } catch (final IllegalArgumentException e) {
- throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
- location, e);
- }
- }
- }
- }
- return lines;
- }
- /**
- * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
- *
- * @param regex
- * the regular expression to compile
- * @return an RPattern that will match this regex
- */
- private static RPattern pattern(final String regex) {
- final boolean startsWith = regex.startsWith("^");
- final boolean endsWith = regex.endsWith("$");
- final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
- final boolean boxes = content.contains("[");
- if (!boxes) {
- if (startsWith && endsWith) {
- // exact match
- if (content.isEmpty()) {
- // empty
- return input -> input.length() == 0;
- }
- return input -> input.equals(content);
- }
- if ((startsWith || endsWith) && content.isEmpty()) {
- // matches every string
- return ALL_STRINGS_RMATCHER;
- }
- if (startsWith) {
- // matches from start
- return input -> startsWith(input, content);
- }
- if (endsWith) {
- // matches from start
- return input -> endsWith(input, content);
- }
- } else {
- final boolean startsWithBox = content.startsWith("[");
- final boolean endsWithBox = content.endsWith("]");
- if (startsWithBox && endsWithBox) {
- String boxContent = content.substring(1, content.length() - 1);
- if (!boxContent.contains("[")) {
- // box containing alternatives
- final boolean negate = boxContent.startsWith("^");
- if (negate) {
- boxContent = boxContent.substring(1);
- }
- final String bContent = boxContent;
- final boolean shouldMatch = !negate;
- if (startsWith && endsWith) {
- // exact match
- return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
- }
- if (startsWith) {
- // first char
- return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
- }
- if (endsWith) {
- // last char
- return input -> input.length() > 0 &&
- contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
- }
- }
- }
- }
- return new RPattern() {
- final Pattern pattern = Pattern.compile(regex);
- @Override
- public boolean isMatch(final CharSequence input) {
- final Matcher matcher = pattern.matcher(input);
- return matcher.find();
- }
- };
- }
- private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
- if (prefix.length() > input.length()) {
- return false;
- }
- for (int i = 0; i < prefix.length(); i++) {
- if (input.charAt(i) != prefix.charAt(i)) {
- return false;
- }
- }
- return true;
- }
- private static String stripQuotes(String str) {
- if (str.startsWith(DOUBLE_QUOTE)) {
- str = str.substring(1);
- }
- if (str.endsWith(DOUBLE_QUOTE)) {
- str = str.substring(0, str.length() - 1);
- }
- return str;
- }
- private final RPattern lContext;
- private final String pattern;
- private final PhonemeExpr phoneme;
- private final RPattern rContext;
- /**
- * Creates a new rule.
- *
- * @param pattern
- * the pattern
- * @param lContext
- * the left context
- * @param rContext
- * the right context
- * @param phoneme
- * the resulting phoneme
- */
- public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
- this.pattern = pattern;
- this.lContext = pattern(lContext + "$");
- this.rContext = pattern("^" + rContext);
- this.phoneme = phoneme;
- }
- /**
- * Gets the left context. This is a regular expression that must match to the left of the pattern.
- *
- * @return the left context Pattern
- */
- public RPattern getLContext() {
- return this.lContext;
- }
- /**
- * Gets the pattern. This is a string-literal that must exactly match.
- *
- * @return the pattern
- */
- public String getPattern() {
- return this.pattern;
- }
- /**
- * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
- *
- * @return the phoneme
- */
- public PhonemeExpr getPhoneme() {
- return this.phoneme;
- }
- /**
- * Gets the right context. This is a regular expression that must match to the right of the pattern.
- *
- * @return the right context Pattern
- */
- public RPattern getRContext() {
- return this.rContext;
- }
- /**
- * Decides if the pattern and context match the input starting at a position. It is a match if the
- * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
- * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
- *
- * @param input
- * the input String
- * @param i
- * the int position within the input
- * @return true if the pattern and left/right context match, false otherwise
- */
- public boolean patternAndContextMatches(final CharSequence input, final int i) {
- if (i < 0) {
- throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
- }
- final int patternLength = this.pattern.length();
- final int ipl = i + patternLength;
- if (ipl > input.length()) {
- // not enough room for the pattern to match
- return false;
- }
- // evaluate the pattern, left context and right context
- // fail early if any of the evaluations is not successful
- if (!input.subSequence(i, ipl).equals(this.pattern)) {
- return false;
- }
- if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
- return false;
- }
- return this.lContext.isMatch(input.subSequence(0, i));
- }
- }