001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Comparator;
024import java.util.EnumMap;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import org.apache.commons.codec.Resources;
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
041 * and a logical flag indicating if all languages must be in play. A rule matches if:
042 * </p>
043 * <ul>
044 * <li>the pattern matches at the current position</li>
045 * <li>the string up until the beginning of the pattern matches the left context</li>
046 * <li>the string from the end of the pattern matches the right context</li>
047 * <li>logical is ALL and all languages are in scope; or</li>
048 * <li>logical is any other value and at least one language is in scope</li>
049 * </ul>
050 * <p>
051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
052 * to explicitly construct their own.
053 * </p>
054 * <p>
055 * Rules are immutable and thread-safe.
056 * </p>
057 * <h2>Rules resources</h2>
058 * <p>
059 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
060 * named following the pattern:
061 * </p>
062 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
063 * <p>
064 * The format of these resources is the following:
065 * </p>
066 * <ul>
067 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
068 * will be interpreted as:
069 * <ol>
070 * <li>pattern</li>
071 * <li>left context</li>
072 * <li>right context</li>
073 * <li>phoneme</li>
074 * </ol>
075 * </li>
076 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
077 * as a comment.</li>
078 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
079 * all content until a line ending in '*' and '/' is found.</li>
080 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
081 * </ul>
082 *
083 * @since 1.6
084 */
085public class Rule {
086
087    public static final class Phoneme implements PhonemeExpr {
088
089        public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
090            final int o1Length = o1.phonemeText.length();
091            final int o2Length = o2.phonemeText.length();
092            for (int i = 0; i < o1Length; i++) {
093                if (i >= o2Length) {
094                    return +1;
095                }
096                final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
097                if (c != 0) {
098                    return c;
099                }
100            }
101
102            if (o1Length < o2Length) {
103                return -1;
104            }
105
106            return 0;
107        };
108        private final StringBuilder phonemeText;
109
110        private final Languages.LanguageSet languages;
111
112        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
113            this.phonemeText = new StringBuilder(phonemeText);
114            this.languages = languages;
115        }
116
117        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
118            this(phonemeLeft.phonemeText, phonemeLeft.languages);
119            this.phonemeText.append(phonemeRight.phonemeText);
120        }
121
122        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
123            this(phonemeLeft.phonemeText, languages);
124            this.phonemeText.append(phonemeRight.phonemeText);
125        }
126
127        public Phoneme append(final CharSequence str) {
128            this.phonemeText.append(str);
129            return this;
130        }
131
132        public Languages.LanguageSet getLanguages() {
133            return this.languages;
134        }
135
136        @Override
137        public Iterable<Phoneme> getPhonemes() {
138            return Collections.singleton(this);
139        }
140
141        public CharSequence getPhonemeText() {
142            return this.phonemeText;
143        }
144
145        /**
146         * Deprecated since 1.9.
147         *
148         * @param right the Phoneme to join
149         * @return a new Phoneme
150         * @deprecated since 1.9
151         */
152        @Deprecated
153        public Phoneme join(final Phoneme right) {
154            return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
155                               this.languages.restrictTo(right.languages));
156        }
157
158        /**
159         * Returns a new Phoneme with the same text but a union of its
160         * current language set and the given one.
161         *
162         * @param lang the language set to merge
163         * @return a new Phoneme
164         */
165        public Phoneme mergeWithLanguage(final LanguageSet lang) {
166          return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
167        }
168
169        @Override
170        public String toString() {
171          return phonemeText.toString() + "[" + languages + "]";
172        }
173    }
174
175    public interface PhonemeExpr {
176        Iterable<Phoneme> getPhonemes();
177    }
178
179    public static final class PhonemeList implements PhonemeExpr {
180
181        private final List<Phoneme> phonemes;
182
183        public PhonemeList(final List<Phoneme> phonemes) {
184            this.phonemes = phonemes;
185        }
186
187        @Override
188        public List<Phoneme> getPhonemes() {
189            return this.phonemes;
190        }
191    }
192
193    /**
194     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
195     */
196    public interface RPattern {
197        boolean isMatch(CharSequence input);
198    }
199
200    public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
201
202    public static final String ALL = "ALL";
203
204    private static final String DOUBLE_QUOTE = "\"";
205
206    private static final String HASH_INCLUDE = "#include";
207
208    private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
209
210    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
211            new EnumMap<>(NameType.class);
212
213    static {
214        for (final NameType s : NameType.values()) {
215            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
216                    new EnumMap<>(RuleType.class);
217
218            for (final RuleType rt : RuleType.values()) {
219                final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
220
221                final Languages ls = Languages.getInstance(s);
222                ls.getLanguages().forEach(l -> {
223                    try (final Scanner scanner = createScanner(s, rt, l)) {
224                        rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
225                    } catch (final IllegalStateException e) {
226                        throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
227                    }
228                });
229                if (!rt.equals(RuleType.RULES)) {
230                    try (final Scanner scanner = createScanner(s, rt, "common")) {
231                        rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
232                    }
233                }
234
235                rts.put(rt, Collections.unmodifiableMap(rs));
236            }
237
238            RULES.put(s, Collections.unmodifiableMap(rts));
239        }
240    }
241
242    private static boolean contains(final CharSequence chars, final char input) {
243        return chars.chars().anyMatch(c -> c == input);
244    }
245
246    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
247        return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
248                             nameType.getName(), rt.getName(), lang);
249    }
250
251    @SuppressWarnings("resource") // Closing the Scanner closes the resource
252    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
253        final String resName = createResourceName(nameType, rt, lang);
254        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
255    }
256
257    @SuppressWarnings("resource") // Closing the Scanner closes the resource
258    private static Scanner createScanner(final String lang) {
259        final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
260        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
261    }
262
263    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
264        final int suffixLength = suffix.length();
265        final int inputLength = input.length();
266
267        if (suffixLength > inputLength) {
268            return false;
269        }
270        for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
271            if (input.charAt(i) != suffix.charAt(j)) {
272                return false;
273            }
274        }
275        return true;
276    }
277
278    /**
279     * Gets rules for a combination of name type, rule type and languages.
280     *
281     * @param nameType
282     *            the NameType to consider
283     * @param rt
284     *            the RuleType to consider
285     * @param langs
286     *            the set of languages to consider
287     * @return a list of Rules that apply
288     */
289    public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
290                                         final Languages.LanguageSet langs) {
291        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
292        final List<Rule> allRules = new ArrayList<>();
293        ruleMap.values().forEach(rules -> allRules.addAll(rules));
294        return allRules;
295    }
296
297    /**
298     * Gets rules for a combination of name type, rule type and a single language.
299     *
300     * @param nameType
301     *            the NameType to consider
302     * @param rt
303     *            the RuleType to consider
304     * @param lang
305     *            the language to consider
306     * @return a list of Rules that apply
307     */
308    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
309        return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
310    }
311
312    /**
313     * Gets rules for a combination of name type, rule type and languages.
314     *
315     * @param nameType
316     *            the NameType to consider
317     * @param rt
318     *            the RuleType to consider
319     * @param langs
320     *            the set of languages to consider
321     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
322     * @since 1.9
323     */
324    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
325                                                         final Languages.LanguageSet langs) {
326        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
327                                     getInstanceMap(nameType, rt, Languages.ANY);
328    }
329
330    /**
331     * Gets rules for a combination of name type, rule type and a single language.
332     *
333     * @param nameType
334     *            the NameType to consider
335     * @param rt
336     *            the RuleType to consider
337     * @param lang
338     *            the language to consider
339     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
340     * @since 1.9
341     */
342    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
343                                                         final String lang) {
344        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
345
346        if (rules == null) {
347            throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
348                                               nameType.getName(), rt.getName(), lang));
349        }
350
351        return rules;
352    }
353
354    private static Phoneme parsePhoneme(final String ph) {
355        final int open = ph.indexOf("[");
356        if (open >= 0) {
357            if (!ph.endsWith("]")) {
358                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
359            }
360            final String before = ph.substring(0, open);
361            final String in = ph.substring(open + 1, ph.length() - 1);
362            final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
363
364            return new Phoneme(before, Languages.LanguageSet.from(langs));
365        }
366        return new Phoneme(ph, Languages.ANY_LANGUAGE);
367    }
368
369    private static PhonemeExpr parsePhonemeExpr(final String ph) {
370        if (ph.startsWith("(")) { // we have a bracketed list of options
371            if (!ph.endsWith(")")) {
372                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
373            }
374
375            final List<Phoneme> phs = new ArrayList<>();
376            final String body = ph.substring(1, ph.length() - 1);
377            for (final String part : body.split("[|]")) {
378                phs.add(parsePhoneme(part));
379            }
380            if (body.startsWith("|") || body.endsWith("|")) {
381                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
382            }
383
384            return new PhonemeList(phs);
385        }
386        return parsePhoneme(ph);
387    }
388
389    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
390        final Map<String, List<Rule>> lines = new HashMap<>();
391        int currentLine = 0;
392
393        boolean inMultilineComment = false;
394        while (scanner.hasNextLine()) {
395            currentLine++;
396            final String rawLine = scanner.nextLine();
397            String line = rawLine;
398
399            if (inMultilineComment) {
400                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
401                    inMultilineComment = false;
402                }
403            } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
404                inMultilineComment = true;
405            } else {
406                // discard comments
407                final int cmtI = line.indexOf(ResourceConstants.CMT);
408                if (cmtI >= 0) {
409                    line = line.substring(0, cmtI);
410                }
411
412                // trim leading-trailing whitespace
413                line = line.trim();
414
415                if (line.isEmpty()) {
416                    continue; // empty lines can be safely skipped
417                }
418
419                if (line.startsWith(HASH_INCLUDE)) {
420                    // include statement
421                    final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
422                    if (incl.contains(" ")) {
423                        throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
424                                                           location);
425                    }
426                    try (final Scanner hashIncludeScanner = createScanner(incl)) {
427                        lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
428                    }
429                } else {
430                    // rule
431                    final String[] parts = line.split("\\s+");
432                    if (parts.length != 4) {
433                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
434                                                           " parts: " + rawLine + " in " + location);
435                    }
436                    try {
437                        final String pat = stripQuotes(parts[0]);
438                        final String lCon = stripQuotes(parts[1]);
439                        final String rCon = stripQuotes(parts[2]);
440                        final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
441                        final int cLine = currentLine;
442                        final Rule r = new Rule(pat, lCon, rCon, ph) {
443                            private final int myLine = cLine;
444                            private final String loc = location;
445
446                            @Override
447                            public String toString() {
448                                final StringBuilder sb = new StringBuilder();
449                                sb.append("Rule");
450                                sb.append("{line=").append(myLine);
451                                sb.append(", loc='").append(loc).append('\'');
452                                sb.append(", pat='").append(pat).append('\'');
453                                sb.append(", lcon='").append(lCon).append('\'');
454                                sb.append(", rcon='").append(rCon).append('\'');
455                                sb.append('}');
456                                return sb.toString();
457                            }
458                        };
459                        final String patternKey = r.pattern.substring(0, 1);
460                        final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
461                        rules.add(r);
462                    } catch (final IllegalArgumentException e) {
463                        throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
464                                                        location, e);
465                    }
466                }
467            }
468        }
469
470        return lines;
471    }
472
473    /**
474     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
475     *
476     * @param regex
477     *            the regular expression to compile
478     * @return an RPattern that will match this regex
479     */
480    private static RPattern pattern(final String regex) {
481        final boolean startsWith = regex.startsWith("^");
482        final boolean endsWith = regex.endsWith("$");
483        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
484        final boolean boxes = content.contains("[");
485
486        if (!boxes) {
487            if (startsWith && endsWith) {
488                // exact match
489                if (content.isEmpty()) {
490                    // empty
491                    return input -> input.length() == 0;
492                }
493                return input -> input.equals(content);
494            }
495            if ((startsWith || endsWith) && content.isEmpty()) {
496                // matches every string
497                return ALL_STRINGS_RMATCHER;
498            }
499            if (startsWith) {
500                // matches from start
501                return input -> startsWith(input, content);
502            }
503            if (endsWith) {
504                // matches from start
505                return input -> endsWith(input, content);
506            }
507        } else {
508            final boolean startsWithBox = content.startsWith("[");
509            final boolean endsWithBox = content.endsWith("]");
510
511            if (startsWithBox && endsWithBox) {
512                String boxContent = content.substring(1, content.length() - 1);
513                if (!boxContent.contains("[")) {
514                    // box containing alternatives
515                    final boolean negate = boxContent.startsWith("^");
516                    if (negate) {
517                        boxContent = boxContent.substring(1);
518                    }
519                    final String bContent = boxContent;
520                    final boolean shouldMatch = !negate;
521
522                    if (startsWith && endsWith) {
523                        // exact match
524                        return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
525                    }
526                    if (startsWith) {
527                        // first char
528                        return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
529                    }
530                    if (endsWith) {
531                        // last char
532                        return input -> input.length() > 0 &&
533                               contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
534                    }
535                }
536            }
537        }
538
539        return new RPattern() {
540            final Pattern pattern = Pattern.compile(regex);
541
542            @Override
543            public boolean isMatch(final CharSequence input) {
544                final Matcher matcher = pattern.matcher(input);
545                return matcher.find();
546            }
547        };
548    }
549
550    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
551        if (prefix.length() > input.length()) {
552            return false;
553        }
554        for (int i = 0; i < prefix.length(); i++) {
555            if (input.charAt(i) != prefix.charAt(i)) {
556                return false;
557            }
558        }
559        return true;
560    }
561
562    private static String stripQuotes(String str) {
563        if (str.startsWith(DOUBLE_QUOTE)) {
564            str = str.substring(1);
565        }
566
567        if (str.endsWith(DOUBLE_QUOTE)) {
568            str = str.substring(0, str.length() - 1);
569        }
570
571        return str;
572    }
573
574    private final RPattern lContext;
575
576    private final String pattern;
577
578    private final PhonemeExpr phoneme;
579
580    private final RPattern rContext;
581
582    /**
583     * Creates a new rule.
584     *
585     * @param pattern
586     *            the pattern
587     * @param lContext
588     *            the left context
589     * @param rContext
590     *            the right context
591     * @param phoneme
592     *            the resulting phoneme
593     */
594    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
595        this.pattern = pattern;
596        this.lContext = pattern(lContext + "$");
597        this.rContext = pattern("^" + rContext);
598        this.phoneme = phoneme;
599    }
600
601    /**
602     * Gets the left context. This is a regular expression that must match to the left of the pattern.
603     *
604     * @return the left context Pattern
605     */
606    public RPattern getLContext() {
607        return this.lContext;
608    }
609
610    /**
611     * Gets the pattern. This is a string-literal that must exactly match.
612     *
613     * @return the pattern
614     */
615    public String getPattern() {
616        return this.pattern;
617    }
618
619    /**
620     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
621     *
622     * @return the phoneme
623     */
624    public PhonemeExpr getPhoneme() {
625        return this.phoneme;
626    }
627
628    /**
629     * Gets the right context. This is a regular expression that must match to the right of the pattern.
630     *
631     * @return the right context Pattern
632     */
633    public RPattern getRContext() {
634        return this.rContext;
635    }
636
637    /**
638     * Decides if the pattern and context match the input starting at a position. It is a match if the
639     * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
640     * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
641     *
642     * @param input
643     *            the input String
644     * @param i
645     *            the int position within the input
646     * @return true if the pattern and left/right context match, false otherwise
647     */
648    public boolean patternAndContextMatches(final CharSequence input, final int i) {
649        if (i < 0) {
650            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
651        }
652
653        final int patternLength = this.pattern.length();
654        final int ipl = i + patternLength;
655
656        if (ipl > input.length()) {
657            // not enough room for the pattern to match
658            return false;
659        }
660
661        // evaluate the pattern, left context and right context
662        // fail early if any of the evaluations is not successful
663        if (!input.subSequence(i, ipl).equals(this.pattern)) {
664            return false;
665        }
666        if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
667            return false;
668        }
669        return this.lContext.isMatch(input.subSequence(0, i));
670    }
671}