Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Comparator;
024import java.util.EnumMap;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import org.apache.commons.codec.Resources;
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply and a logical flag indicating if all languages must
041 * be in play. A rule matches if:
042 * </p>
043 * <ul>
044 * <li>the pattern matches at the current position</li>
045 * <li>the string up until the beginning of the pattern matches the left context</li>
046 * <li>the string from the end of the pattern matches the right context</li>
047 * <li>logical is ALL and all languages are in scope; or</li>
048 * <li>logical is any other value and at least one language is in scope</li>
049 * </ul>
050 * <p>
051 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user to explicitly construct their own.
052 * </p>
053 * <p>
054 * Rules are immutable and thread-safe.
055 * </p>
056 * <h2>Rules resources</h2>
057 * <p>
058 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named following the pattern:
059 * </p>
060 * <blockquote>/org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
061 * <p>
062 * The format of these resources is the following:
063 * </p>
064 * <ul>
065 * <li><strong>Rules:</strong> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these will be interpreted as:
066 * <ol>
067 * <li>pattern</li>
068 * <li>left context</li>
069 * <li>right context</li>
070 * <li>phoneme</li>
071 * </ol>
072 * </li>
073 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</li>
074 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a line ending
075 * in '*' and '/' is found.</li>
076 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
077 * </ul>
078 *
079 * @since 1.6
080 */
081public class Rule {
082
083    /**
084     * A phoneme.
085     */
086    public static final class Phoneme implements PhonemeExpr {
087
088        /**
089         * The Phoneme Comparator.
090         */
091        public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
092            final int o1Length = o1.phonemeText.length();
093            final int o2Length = o2.phonemeText.length();
094            for (int i = 0; i < o1Length; i++) {
095                if (i >= o2Length) {
096                    return +1;
097                }
098                final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
099                if (c != 0) {
100                    return c;
101                }
102            }
103            if (o1Length < o2Length) {
104                return -1;
105            }
106            return 0;
107        };
108        private final StringBuilder phonemeText;
109        private final Languages.LanguageSet languages;
110
111        /**
112         * Constructs a new instance.
113         *
114         * @param phonemeText The phoneme text.
115         * @param languages   A language set.
116         */
117        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
118            this.phonemeText = new StringBuilder(phonemeText);
119            this.languages = languages;
120        }
121
122        /**
123         * Constructs a new instance.
124         *
125         * @param phonemeLeft  The left phoneme text.
126         * @param phonemeRight The right phoneme text.
127         */
128        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
129            this(phonemeLeft.phonemeText, phonemeLeft.languages);
130            this.phonemeText.append(phonemeRight.phonemeText);
131        }
132
133        /**
134         * Constructs a new instance.
135         *
136         * @param phonemeLeft  The left phoneme text.
137         * @param phonemeRight The right phoneme text.
138         * @param languages    A language set.
139         */
140        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
141            this(phonemeLeft.phonemeText, languages);
142            this.phonemeText.append(phonemeRight.phonemeText);
143        }
144
145        /**
146         * Appends the sequence to the phone text.
147         *
148         * @param sequence The sequence to append.
149         * @return {@code this} instance.
150         */
151        public Phoneme append(final CharSequence sequence) {
152            this.phonemeText.append(sequence);
153            return this;
154        }
155
156        /**
157         * Gets the language set.
158         *
159         * @return the language set.
160         */
161        public Languages.LanguageSet getLanguages() {
162            return this.languages;
163        }
164
165        @Override
166        public Iterable<Phoneme> getPhonemes() {
167            return Collections.singleton(this);
168        }
169
170        /**
171         * Gets the phoneme text sequence.
172         *
173         * @return the phoneme text sequence.
174         */
175        public CharSequence getPhonemeText() {
176            return this.phonemeText;
177        }
178
179        /**
180         * Deprecated since 1.9.
181         *
182         * @param right the Phoneme to join.
183         * @return a new Phoneme.
184         * @deprecated Since 1.9
185         */
186        @Deprecated
187        public Phoneme join(final Phoneme right) {
188            return new Phoneme(phonemeText.toString() + right.phonemeText.toString(), languages.restrictTo(right.languages));
189        }
190
191        /**
192         * Returns a new Phoneme with the same text but a union of its current language set and the given one.
193         *
194         * @param lang the language set to merge.
195         * @return a new Phoneme.
196         */
197        public Phoneme mergeWithLanguage(final LanguageSet lang) {
198            return new Phoneme(phonemeText.toString(), languages.merge(lang));
199        }
200
201        @Override
202        public int size() {
203            return 1;
204        }
205
206        @Override
207        public String toString() {
208            return phonemeText.toString() + "[" + languages + "]";
209        }
210    }
211
212    /**
213     * A phoneme expression.
214     */
215    public interface PhonemeExpr {
216
217        /**
218         * Gets an iteration of phonemes.
219         *
220         * @return an iteration of phonemes.
221         */
222        Iterable<Phoneme> getPhonemes();
223
224        /**
225         * Gets the expression size in phonemes.
226         *
227         * @return the expression size in phonemes.
228         * @since 1.17.0
229         */
230        default int size() {
231            // All implementations are int-bound.
232            return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
233        }
234    }
235
236    /**
237     * A list of phonemes.
238     */
239    public static final class PhonemeList implements PhonemeExpr {
240
241        private final List<Phoneme> phonemeList;
242
243        /**
244         * Constructs a new instance.
245         *
246         * @param phonemes the phoneme list.
247         */
248        public PhonemeList(final List<Phoneme> phonemes) {
249            this.phonemeList = phonemes;
250        }
251
252        @Override
253        public List<Phoneme> getPhonemes() {
254            return phonemeList;
255        }
256
257        @Override
258        public int size() {
259            return phonemeList.size();
260        }
261    }
262
263    /**
264     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
265     */
266    public interface RPattern {
267
268        /**
269         * Tests whether the given input matches this instance.
270         *
271         * @param input the input to test.
272         * @return whether the given input matches this instance.
273         */
274        boolean isMatch(CharSequence input);
275    }
276
277    private static final String PIPE = "|";
278
279    /**
280     * Always matches.
281     */
282    public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
283
284    /**
285     * Unused.
286     *
287     * @deprecated This is unused.
288     */
289    @Deprecated
290    public static final String ALL = "ALL";
291
292    private static final String DOUBLE_QUOTE = "\"";
293    private static final String HASH_INCLUDE = "#include";
294    private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
295    private static final Pattern AROUND_PLUS = Pattern.compile("[+]");
296    private static final Pattern AROUND_PIPE = Pattern.compile("[|]");
297    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES = new EnumMap<>(NameType.class);
298
299    /**
300     * Initializes {@code RULES}.
301     */
302    static {
303        for (final NameType nameType : NameType.values()) {
304            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rtsMap = new EnumMap<>(RuleType.class);
305            for (final RuleType ruleType : RuleType.values()) {
306                final Map<String, Map<String, List<Rule>>> rsMap = new HashMap<>();
307                final Languages languages = Languages.getInstance(nameType);
308                languages.getLanguages().forEach(l -> {
309                    try (Scanner scanner = createScanner(nameType, ruleType, l)) {
310                        rsMap.put(l, parseRules(scanner, createResourceName(nameType, ruleType, l)));
311                    } catch (final IllegalStateException e) {
312                        throw new IllegalStateException("Problem processing " + createResourceName(nameType, ruleType, l), e);
313                    }
314                });
315                if (!ruleType.equals(RuleType.RULES)) {
316                    try (Scanner scanner = createScanner(nameType, ruleType, "common")) {
317                        rsMap.put("common", parseRules(scanner, createResourceName(nameType, ruleType, "common")));
318                    }
319                }
320                rtsMap.put(ruleType, Collections.unmodifiableMap(rsMap));
321            }
322            RULES.put(nameType, Collections.unmodifiableMap(rtsMap));
323        }
324    }
325
326    private static boolean contains(final CharSequence chars, final char input) {
327        return chars.chars().anyMatch(c -> c == input);
328    }
329
330    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
331        return String.format("/org/apache/commons/codec/language/bm/%s_%s_%s.txt", nameType.getName(), rt.getName(), lang);
332    }
333
334    @SuppressWarnings("resource") // Closing the Scanner closes the resource
335    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
336        final String resName = createResourceName(nameType, rt, lang);
337        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
338    }
339
340    @SuppressWarnings("resource") // Closing the Scanner closes the resource
341    private static Scanner createScanner(final String lang) {
342        final String resName = String.format("/org/apache/commons/codec/language/bm/%s.txt", lang);
343        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
344    }
345
346    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
347        final int suffixLength = suffix.length();
348        final int inputLength = input.length();
349        if (suffixLength > inputLength) {
350            return false;
351        }
352        for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
353            if (input.charAt(i) != suffix.charAt(j)) {
354                return false;
355            }
356        }
357        return true;
358    }
359
360    /**
361     * Gets rules for a combination of name type, rule type and languages.
362     *
363     * @param nameType the NameType to consider.
364     * @param rt       the RuleType to consider.
365     * @param langs    the set of languages to consider.
366     * @return a list of Rules that apply.
367     */
368    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) {
369        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
370        final List<Rule> allRules = new ArrayList<>();
371        ruleMap.values().forEach(rules -> allRules.addAll(rules));
372        return allRules;
373    }
374
375    /**
376     * Gets rules for a combination of name type, rule type and a single language.
377     *
378     * @param nameType the NameType to consider.
379     * @param rt       the RuleType to consider.
380     * @param lang     the language to consider.
381     * @return a list of Rules that apply.
382     */
383    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
384        return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
385    }
386
387    /**
388     * Gets rules for a combination of name type, rule type and languages.
389     *
390     * @param nameType the NameType to consider.
391     * @param rt       the RuleType to consider.
392     * @param langs    the set of languages to consider.
393     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern.
394     * @since 1.9
395     */
396    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final Languages.LanguageSet langs) {
397        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) : getInstanceMap(nameType, rt, Languages.ANY);
398    }
399
400    /**
401     * Gets rules for a combination of name type, rule type and a single language.
402     *
403     * @param nameType the NameType to consider.
404     * @param rt       the RuleType to consider.
405     * @param lang     the language to consider.
406     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern.
407     * @since 1.9
408     */
409    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt, final String lang) {
410        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
411        if (rules == null) {
412            throw new IllegalArgumentException(String.format("No rules found for %s, %s, '%s'.", nameType.getName(), rt.getName(), lang));
413        }
414        return rules;
415    }
416
417    private static Phoneme parsePhoneme(final String ph) {
418        final int open = ph.indexOf("[");
419        if (open >= 0) {
420            if (!ph.endsWith("]")) {
421                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
422            }
423            final String before = ph.substring(0, open);
424            final String in = ph.substring(open + 1, ph.length() - 1);
425            final Set<String> langs = new HashSet<>(Arrays.asList(AROUND_PLUS.split(in)));
426            return new Phoneme(before, Languages.LanguageSet.from(langs));
427        }
428        return new Phoneme(ph, Languages.ANY_LANGUAGE);
429    }
430
431    /*
432     * Package-private for testing only.
433     */
434    static PhonemeExpr parsePhonemeExpr(final String ph) {
435        if (ph.startsWith("(")) {
436            // we have a bracketed list of options
437            if (!ph.endsWith(")")) {
438                throw new IllegalArgumentException("Phoneme starting with '(' must end with ')'");
439            }
440            final List<Phoneme> phs = new ArrayList<>();
441            final String body = ph.substring(1, ph.length() - 1);
442            final String[] split = AROUND_PIPE.split(body);
443            for (final String part : split) {
444                phs.add(parsePhoneme(part));
445            }
446            if (split.length > 1 && split[0].length() != 0 && body.startsWith(PIPE) || split[split.length - 1].length() != 0 && body.endsWith(PIPE)) {
447                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
448            }
449            return new PhonemeList(phs);
450        }
451        return parsePhoneme(ph);
452    }
453
454    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
455        final Map<String, List<Rule>> lines = new HashMap<>();
456        int currentLine = 0;
457        boolean inMultilineComment = false;
458        while (scanner.hasNextLine()) {
459            currentLine++;
460            final String rawLine = scanner.nextLine();
461            String line = rawLine;
462            if (inMultilineComment) {
463                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
464                    inMultilineComment = false;
465                }
466            } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
467                inMultilineComment = true;
468            } else {
469                // discard comments
470                final int cmtI = line.indexOf(ResourceConstants.CMT);
471                if (cmtI >= 0) {
472                    line = line.substring(0, cmtI);
473                }
474                // trim leading-trailing whitespace
475                line = line.trim();
476                if (line.isEmpty()) {
477                    continue; // empty lines can be safely skipped
478                }
479                if (line.startsWith(HASH_INCLUDE)) {
480                    // include statement
481                    final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
482                    if (incl.contains(" ")) {
483                        throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + location);
484                    }
485                    try (Scanner hashIncludeScanner = createScanner(incl)) {
486                        lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
487                    }
488                } else {
489                    // rule
490                    final String[] parts = ResourceConstants.SPACES.split(line);
491                    if (parts.length != 4) {
492                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
493                    }
494                    try {
495                        final String pat = stripQuotes(parts[0]);
496                        final String lCon = stripQuotes(parts[1]);
497                        final String rCon = stripQuotes(parts[2]);
498                        final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
499                        final int cLine = currentLine;
500                        final Rule r = new Rule(pat, lCon, rCon, ph) {
501
502                            private final int myLine = cLine;
503                            private final String loc = location;
504
505                            @Override
506                            public String toString() {
507                                final StringBuilder sb = new StringBuilder();
508                                sb.append("Rule");
509                                sb.append("{line=").append(myLine);
510                                sb.append(", loc='").append(loc).append('\'');
511                                sb.append(", pat='").append(pat).append('\'');
512                                sb.append(", lcon='").append(lCon).append('\'');
513                                sb.append(", rcon='").append(rCon).append('\'');
514                                sb.append('}');
515                                return sb.toString();
516                            }
517                        };
518                        final String patternKey = r.pattern.substring(0, 1);
519                        final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
520                        rules.add(r);
521                    } catch (final IllegalArgumentException e) {
522                        throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
523                    }
524                }
525            }
526        }
527        return lines;
528    }
529
530    /**
531     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
532     *
533     * @param regex the regular expression to compile.
534     * @return an RPattern that will match this regex.
535     */
536    private static RPattern pattern(final String regex) {
537        final boolean startsWith = regex.startsWith("^");
538        final boolean endsWith = regex.endsWith("$");
539        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
540        final boolean boxes = content.contains("[");
541        if (!boxes) {
542            if (startsWith && endsWith) {
543                // exact match
544                if (content.isEmpty()) {
545                    // empty
546                    return input -> input.length() == 0;
547                }
548                return input -> input.equals(content);
549            }
550            if ((startsWith || endsWith) && content.isEmpty()) {
551                // matches every string
552                return ALL_STRINGS_RMATCHER;
553            }
554            if (startsWith) {
555                // matches from start
556                return input -> startsWith(input, content);
557            }
558            if (endsWith) {
559                // matches from start
560                return input -> endsWith(input, content);
561            }
562        } else {
563            final boolean startsWithBox = content.startsWith("[");
564            final boolean endsWithBox = content.endsWith("]");
565            if (startsWithBox && endsWithBox) {
566                String boxContent = content.substring(1, content.length() - 1);
567                if (!boxContent.contains("[")) {
568                    // box containing alternatives
569                    final boolean negate = boxContent.startsWith("^");
570                    if (negate) {
571                        boxContent = boxContent.substring(1);
572                    }
573                    final String bContent = boxContent;
574                    final boolean shouldMatch = !negate;
575                    if (startsWith && endsWith) {
576                        // exact match
577                        return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
578                    }
579                    if (startsWith) {
580                        // first char
581                        return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
582                    }
583                    if (endsWith) {
584                        // last char
585                        return input -> input.length() > 0 && contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
586                    }
587                }
588            }
589        }
590        return new RPattern() {
591
592            final Pattern pattern = Pattern.compile(regex);
593
594            @Override
595            public boolean isMatch(final CharSequence input) {
596                final Matcher matcher = pattern.matcher(input);
597                return matcher.find();
598            }
599        };
600    }
601
602    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
603        if (prefix.length() > input.length()) {
604            return false;
605        }
606        for (int i = 0; i < prefix.length(); i++) {
607            if (input.charAt(i) != prefix.charAt(i)) {
608                return false;
609            }
610        }
611        return true;
612    }
613
614    private static String stripQuotes(String str) {
615        if (str.startsWith(DOUBLE_QUOTE)) {
616            str = str.substring(1);
617        }
618        if (str.endsWith(DOUBLE_QUOTE)) {
619            str = str.substring(0, str.length() - 1);
620        }
621        return str;
622    }
623
624    private final RPattern lContext;
625    private final String pattern;
626    private final PhonemeExpr phoneme;
627    private final RPattern rContext;
628
629    /**
630     * Creates a new rule.
631     *
632     * @param pattern  the pattern.
633     * @param lContext the left context.
634     * @param rContext the right context.
635     * @param phoneme  the resulting phoneme.
636     */
637    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
638        this.pattern = pattern;
639        this.lContext = pattern(lContext + "$");
640        this.rContext = pattern("^" + rContext);
641        this.phoneme = phoneme;
642    }
643
644    /**
645     * Gets the left context. This is a regular expression that must match to the left of the pattern.
646     *
647     * @return the left context Pattern.
648     */
649    public RPattern getLContext() {
650        return lContext;
651    }
652
653    /**
654     * Gets the pattern. This is a string-literal that must exactly match.
655     *
656     * @return the pattern.
657     */
658    public String getPattern() {
659        return pattern;
660    }
661
662    /**
663     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
664     *
665     * @return the phoneme.
666     */
667    public PhonemeExpr getPhoneme() {
668        return phoneme;
669    }
670
671    /**
672     * Gets the right context. This is a regular expression that must match to the right of the pattern.
673     *
674     * @return the right context Pattern.
675     */
676    public RPattern getRContext() {
677        return rContext;
678    }
679
680    /**
681     * Decides if the pattern and context match the input starting at a position. It is a match if the {@code lContext} matches {@code input} up to {@code i},
682     * {@code pattern} matches at i and {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
683     *
684     * @param input the input String.
685     * @param i     the int position within the input.
686     * @return true if the pattern and left/right context match, false otherwise.
687     */
688    public boolean patternAndContextMatches(final CharSequence input, final int i) {
689        if (i < 0) {
690            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
691        }
692        final int patternLength = pattern.length();
693        final int ipl = i + patternLength;
694        if (ipl > input.length()) {
695            // not enough room for the pattern to match
696            return false;
697        }
698        // evaluate the pattern, left context and right context
699        // fail early if any of the evaluations is not successful
700        if (!input.subSequence(i, ipl).equals(pattern)) {
701            return false;
702        }
703        if (!rContext.isMatch(input.subSequence(ipl, input.length()))) {
704            return false;
705        }
706        return lContext.isMatch(input.subSequence(0, i));
707    }
708}