Rule

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.Resources;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;

/**
 * A phoneme rule.
 * <p>
 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply and a logical flag indicating if all languages must
 * be in play. A rule matches if:
 * </p>
 * <ul>
 * <li>the pattern matches at the current position</li>
 * <li>the string up until the beginning of the pattern matches the left context</li>
 * <li>the string from the end of the pattern matches the right context</li>
 * <li>logical is ALL and all languages are in scope; or</li>
 * <li>logical is any other value and at least one language is in scope</li>
 * </ul>
 * <p>
 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user to explicitly construct their own.
 * </p>
 * <p>
 * Rules are immutable and thread-safe.
 * </p>
 * <h2>Rules resources</h2>
 * <p>
 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named following the pattern:
 * </p>
 * <blockquote>/org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
 * <p>
 * The format of these resources is the following:
 * </p>
 * <ul>
 * <li><strong>Rules:</strong> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these will be interpreted as:
 * <ol>
 * <li>pattern</li>
 * <li>left context</li>
 * <li>right context</li>
 * <li>phoneme</li>
 * </ol>
 * </li>
 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</li>
 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a line ending
 * in '*' and '/' is found.</li>
 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
 * </ul>
 *
 * @since 1.6
 */
public class Rule {

    /**
     * A phoneme.
     */
    public static final class Phoneme implements PhonemeExpr {

        /**
         * The Phoneme Comparator.
         */
        public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
            final int o1Length = o1.phonemeText.length();
            final int o2Length = o2.phonemeText.length();
            for (int i = 0; i < o1Length; i++) {
                if (i >= o2Length) {
                    return +1;
                }
                final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
                if (c != 0) {
                    return c;
                }
            }
            if (o1Length < o2Length) {
                return -1;
            }
            return 0;
        };
        private final StringBuilder phonemeText;
        private final Languages.LanguageSet languages;

        /**
         * Constructs a new instance.
         *
         * @param phonemeText The phoneme text.
         * @param languages   A language set.
         */
        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
            this.phonemeText = new StringBuilder(phonemeText);
            this.languages = languages;
        }

        /**
         * Constructs a new instance.
         *
         * @param phonemeLeft  The left phoneme text.
         * @param phonemeRight The right phoneme text.
         */
        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
            this(phonemeLeft.phonemeText, phonemeLeft.languages);
            this.phonemeText.append(phonemeRight.phonemeText);
        }

        /**
         * Constructs a new instance.
         *
         * @param phonemeLeft  The left phoneme text.
         * @param phonemeRight The right phoneme text.
         * @param languages    A language set.
         */
        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
            this(phonemeLeft.phonemeText, languages);
            this.phonemeText.append(phonemeRight.phonemeText);
        }

        /**
         * Appends the sequence to the phone text.
         *
         * @param sequence The sequence to append.
         * @return {@code this} instance.
         */
        public Phoneme append(final CharSequence sequence) {
            this.phonemeText.append(sequence);
            return this;
        }

        /**
         * Gets the language set.
         *
         * @return the language set.
         */
        public Languages.LanguageSet getLanguages() {
            return this.languages;
        }

        @Override
        public Iterable<Phoneme> getPhonemes() {
            return Collections.singleton(this);
        }

        /**
         * Gets the phoneme text sequence.
         *
         * @return the phoneme text sequence.
         */
        public CharSequence getPhonemeText() {
            return this.phonemeText;
        }

        /**
         * Deprecated since 1.9.
         *
         * @param right the Phoneme to join
         * @return a new Phoneme
         * @deprecated since 1.9
         */
        @Deprecated
        public Phoneme join(final Phoneme right) {
            return new Phoneme(phonemeText.toString() + right.phonemeText.toString(), languages.restrictTo(right.languages));
        }

        /**
         * Returns a new Phoneme with the same text but a union of its current language set and the given one.
         *
         * @param lang the language set to merge
         * @return a new Phoneme
         */
        public Phoneme mergeWithLanguage(final LanguageSet lang) {
            return new Phoneme(phonemeText.toString(), languages.merge(lang));
        }

        @Override
        public int size() {
            return 1;
        }

        @Override
        public String toString() {
            return phonemeText.toString() + "[" + languages + "]";
        }
    }

    /**
     * A phoneme expression.
     */
    public interface PhonemeExpr {

        /**
         * Gets an iteration of phonemes.
         *
         * @return an iteration of phonemes.
         */
        Iterable<Phoneme> getPhonemes();

        /**
         * Gets the expression size in phonemes.
         *
         * @return the expression size in phonemes.
         * @since 1.17.0
         */
        default int size() {
            // All implementations are int-bound.
            return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
        }
    }

    /**
     * A list of phonemes.
     */
    public static final class PhonemeList implements PhonemeExpr {

        private final List<Phoneme> phonemeList;

        /**
         * Constructs a new instance.
         *
         * @param phonemes the phoneme list.
         */
        public PhonemeList(final List<Phoneme> phonemes) {
            this.phonemeList = phonemes;
        }

        @Override
        public List<Phoneme> getPhonemes() {
            return phonemeList;
        }

        @Override
        public int size() {
            return phonemeList.size();
        }
    }

    /**
     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
     */
    public interface RPattern {

        /**
         * Tests whether the given input matches this instance.
         *
         * @param input the input to test.
         * @return whether the given input matches this instance.
         */
        boolean isMatch(CharSequence input);
    }

    private static final String PIPE = "|";

    /**
     * Always matches.
     */
    public static final RPattern ALL_STRINGS_RMATCHER = input -> true;

    /**
     * Unused.
     *
     * @deprecated This is unused.
     */
    @Deprecated
    public static final String ALL = "ALL";

    private static final String DOUBLE_QUOTE = "\"";
    private static final String HASH_INCLUDE = "#include";
    private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
    private static final Pattern AROUND_PLUS = Pattern.compile("[+]");
    private static final Pattern AROUND_PIPE = Pattern.compile("[|]");
    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = new EnumMap<>(NameType.class);

    /**
     * Initializes {@code RULES}.
     */
    static {
        for (final NameType nameType : NameType.values()) {
            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rtsMap = new EnumMap<>(RuleType.class);
            for (final RuleType ruleType : RuleType.values()) {
                final Map<String, Map<String, List<Rule>>> rsMap = new HashMap<>();
                final Languages languages = Languages.getInstance(nameType);
                languages.getLanguages().forEach(l -> {
                    try (Scanner scanner = createScanner(nameType, ruleType, l)) {
                        rsMap.put(l, parseRules(scanner, createResourceName(nameType, ruleType, l)));
                    } catch (final IllegalStateException e) {
                        throw new IllegalStateException("Problem processing " + createResourceName(nameType, ruleType, l), e);
                    }
                });
                if (!ruleType.equals(RuleType.RULES)) {
                    try (Scanner scanner = createScanner(nameType, ruleType, "common")) {
                        rsMap.put("common", parseRules(scanner, createResourceName(nameType, ruleType, "common")));
                    }
                }
                rtsMap.put(ruleType, Collections.unmodifiableMap(rsMap));
            }
            RULES.put(nameType, Collections.unmodifiableMap(rtsMap));
        }
    }

    private static boolean contains(final CharSequence chars, final char input) {
        return chars.chars().anyMatch(c -> c == input);
    }

    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
        return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt", nameType.getName(), rt.getName(), lang);
    }

    @SuppressWarnings("resource") // Closing the Scanner closes the resource
    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
        final String resName = createResourceName(nameType, rt, lang);
        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
    }

    @SuppressWarnings("resource") // Closing the Scanner closes the resource
    private static Scanner createScanner(final String lang) {
        final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang);
        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
    }

    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
        final int suffixLength = suffix.length();
        final int inputLength = input.length();
        if (suffixLength > inputLength) {
            return false;
        }
        for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
            if (input.charAt(i) != suffix.charAt(j)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets rules for a combination of name type, rule type and languages.
     *
     * @param nameType the NameType to consider
     * @param rt       the RuleType to consider
     * @param langs    the set of languages to consider
     * @return a list of Rules that apply
     */
    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) {
        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
        final List<Rule> allRules = new ArrayList<>();
        ruleMap.values().forEach(rules -> allRules.addAll(rules));
        return allRules;
    }

    /**
     * Gets rules for a combination of name type, rule type and a single language.
     *
     * @param nameType the NameType to consider
     * @param rt       the RuleType to consider
     * @param lang     the language to consider
     * @return a list of Rules that apply
     */
    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
        return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
    }

    /**
     * Gets rules for a combination of name type, rule type and languages.
     *
     * @param nameType the NameType to consider
     * @param rt       the RuleType to consider
     * @param langs    the set of languages to consider
     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
     * @since 1.9
     */
    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) {
        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : getInstanceMap(nameType, rt, Languages.ANY);
    }

    /**
     * Gets rules for a combination of name type, rule type and a single language.
     *
     * @param nameType the NameType to consider
     * @param rt       the RuleType to consider
     * @param lang     the language to consider
     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
     * @since 1.9
     */
    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final String lang) {
        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
        if (rules == null) {
            throw new IllegalArgumentException(String.format("No rules found for %s, %s, '%s'.", nameType.getName(), rt.getName(), lang));
        }
        return rules;
    }

    private static Phoneme parsePhoneme(final String ph) {
        final int open = ph.indexOf("[");
        if (open >= 0) {
            if (!ph.endsWith("]")) {
                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
            }
            final String before = ph.substring(0, open);
            final String in = ph.substring(open + 1, ph.length() - 1);
            final Set<String> langs = new HashSet<>(Arrays.asList(AROUND_PLUS.split(in)));
            return new Phoneme(before, Languages.LanguageSet.from(langs));
        }
        return new Phoneme(ph, Languages.ANY_LANGUAGE);
    }

    /*
     * Package-private for testing only.
     */
    static PhonemeExpr parsePhonemeExpr(final String ph) {
        if (ph.startsWith("(")) {
            // we have a bracketed list of options
            if (!ph.endsWith(")")) {
                throw new IllegalArgumentException("Phoneme starting with '(' must end with ')'");
            }
            final List<Phoneme> phs = new ArrayList<>();
            final String body = ph.substring(1, ph.length() - 1);
            final String[] split = AROUND_PIPE.split(body);
            for (final String part : split) {
                phs.add(parsePhoneme(part));
            }
            if (split.length > 1 && split[0].length() != 0 && body.startsWith(PIPE) || split[split.length - 1].length() != 0 && body.endsWith(PIPE)) {
                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
            }
            return new PhonemeList(phs);
        }
        return parsePhoneme(ph);
    }

    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
        final Map<String, List<Rule>> lines = new HashMap<>();
        int currentLine = 0;
        boolean inMultilineComment = false;
        while (scanner.hasNextLine()) {
            currentLine++;
            final String rawLine = scanner.nextLine();
            String line = rawLine;
            if (inMultilineComment) {
                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                    inMultilineComment = false;
                }
            } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                inMultilineComment = true;
            } else {
                // discard comments
                final int cmtI = line.indexOf(ResourceConstants.CMT);
                if (cmtI >= 0) {
                    line = line.substring(0, cmtI);
                }
                // trim leading-trailing whitespace
                line = line.trim();
                if (line.isEmpty()) {
                    continue; // empty lines can be safely skipped
                }
                if (line.startsWith(HASH_INCLUDE)) {
                    // include statement
                    final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
                    if (incl.contains(" ")) {
                        throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + location);
                    }
                    try (Scanner hashIncludeScanner = createScanner(incl)) {
                        lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
                    }
                } else {
                    // rule
                    final String[] parts = ResourceConstants.SPACES.split(line);
                    if (parts.length != 4) {
                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
                    }
                    try {
                        final String pat = stripQuotes(parts[0]);
                        final String lCon = stripQuotes(parts[1]);
                        final String rCon = stripQuotes(parts[2]);
                        final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
                        final int cLine = currentLine;
                        final Rule r = new Rule(pat, lCon, rCon, ph) {

                            private final int myLine = cLine;
                            private final String loc = location;

                            @Override
                            public String toString() {
                                final StringBuilder sb = new StringBuilder();
                                sb.append("Rule");
                                sb.append("{line=").append(myLine);
                                sb.append(", loc='").append(loc).append('\'');
                                sb.append(", pat='").append(pat).append('\'');
                                sb.append(", lcon='").append(lCon).append('\'');
                                sb.append(", rcon='").append(rCon).append('\'');
                                sb.append('}');
                                return sb.toString();
                            }
                        };
                        final String patternKey = r.pattern.substring(0, 1);
                        final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
                        rules.add(r);
                    } catch (final IllegalArgumentException e) {
                        throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
                    }
                }
            }
        }
        return lines;
    }

    /**
     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
     *
     * @param regex the regular expression to compile
     * @return an RPattern that will match this regex
     */
    private static RPattern pattern(final String regex) {
        final boolean startsWith = regex.startsWith("^");
        final boolean endsWith = regex.endsWith("$");
        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
        final boolean boxes = content.contains("[");
        if (!boxes) {
            if (startsWith && endsWith) {
                // exact match
                if (content.isEmpty()) {
                    // empty
                    return input -> input.length() == 0;
                }
                return input -> input.equals(content);
            }
            if ((startsWith || endsWith) && content.isEmpty()) {
                // matches every string
                return ALL_STRINGS_RMATCHER;
            }
            if (startsWith) {
                // matches from start
                return input -> startsWith(input, content);
            }
            if (endsWith) {
                // matches from start
                return input -> endsWith(input, content);
            }
        } else {
            final boolean startsWithBox = content.startsWith("[");
            final boolean endsWithBox = content.endsWith("]");
            if (startsWithBox && endsWithBox) {
                String boxContent = content.substring(1, content.length() - 1);
                if (!boxContent.contains("[")) {
                    // box containing alternatives
                    final boolean negate = boxContent.startsWith("^");
                    if (negate) {
                        boxContent = boxContent.substring(1);
                    }
                    final String bContent = boxContent;
                    final boolean shouldMatch = !negate;
                    if (startsWith && endsWith) {
                        // exact match
                        return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
                    }
                    if (startsWith) {
                        // first char
                        return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
                    }
                    if (endsWith) {
                        // last char
                        return input -> input.length() > 0 && contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
                    }
                }
            }
        }
        return new RPattern() {

            final Pattern pattern = Pattern.compile(regex);

            @Override
            public boolean isMatch(final CharSequence input) {
                final Matcher matcher = pattern.matcher(input);
                return matcher.find();
            }
        };
    }

    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
        if (prefix.length() > input.length()) {
            return false;
        }
        for (int i = 0; i < prefix.length(); i++) {
            if (input.charAt(i) != prefix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    private static String stripQuotes(String str) {
        if (str.startsWith(DOUBLE_QUOTE)) {
            str = str.substring(1);
        }
        if (str.endsWith(DOUBLE_QUOTE)) {
            str = str.substring(0, str.length() - 1);
        }
        return str;
    }

    private final RPattern lContext;
    private final String pattern;
    private final PhonemeExpr phoneme;
    private final RPattern rContext;

    /**
     * Creates a new rule.
     *
     * @param pattern  the pattern
     * @param lContext the left context
     * @param rContext the right context
     * @param phoneme  the resulting phoneme
     */
    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
        this.pattern = pattern;
        this.lContext = pattern(lContext + "$");
        this.rContext = pattern("^" + rContext);
        this.phoneme = phoneme;
    }

    /**
     * Gets the left context. This is a regular expression that must match to the left of the pattern.
     *
     * @return the left context Pattern
     */
    public RPattern getLContext() {
        return lContext;
    }

    /**
     * Gets the pattern. This is a string-literal that must exactly match.
     *
     * @return the pattern
     */
    public String getPattern() {
        return pattern;
    }

    /**
     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
     *
     * @return the phoneme
     */
    public PhonemeExpr getPhoneme() {
        return phoneme;
    }

    /**
     * Gets the right context. This is a regular expression that must match to the right of the pattern.
     *
     * @return the right context Pattern
     */
    public RPattern getRContext() {
        return rContext;
    }

    /**
     * Decides if the pattern and context match the input starting at a position. It is a match if the {@code lContext} matches {@code input} up to {@code i},
     * {@code pattern} matches at i and {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
     *
     * @param input the input String
     * @param i     the int position within the input
     * @return true if the pattern and left/right context match, false otherwise
     */
    public boolean patternAndContextMatches(final CharSequence input, final int i) {
        if (i < 0) {
            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
        }
        final int patternLength = pattern.length();
        final int ipl = i + patternLength;
        if (ipl > input.length()) {
            // not enough room for the pattern to match
            return false;
        }
        // evaluate the pattern, left context and right context
        // fail early if any of the evaluations is not successful
        if (!input.subSequence(i, ipl).equals(pattern)) {
            return false;
        }
        if (!rContext.isMatch(input.subSequence(ipl, input.length()))) {
            return false;
        }
        return lContext.isMatch(input.subSequence(0, i));
    }
}