Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.io.InputStream;
021import java.util.ArrayList;
022import java.util.Arrays;
023import java.util.Collections;
024import java.util.Comparator;
025import java.util.EnumMap;
026import java.util.HashMap;
027import java.util.HashSet;
028import java.util.List;
029import java.util.Map;
030import java.util.Scanner;
031import java.util.Set;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
041 * and a logical flag indicating if all languages must be in play. A rule matches if:
042 * <ul>
043 * <li>the pattern matches at the current position</li>
044 * <li>the string up until the beginning of the pattern matches the left context</li>
045 * <li>the string from the end of the pattern matches the right context</li>
046 * <li>logical is ALL and all languages are in scope; or</li>
047 * <li>logical is any other value and at least one language is in scope</li>
048 * </ul>
049 * <p>
050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
051 * to explicitly construct their own.
052 * <p>
053 * Rules are immutable and thread-safe.
054 * <p>
055 * <b>Rules resources</b>
056 * <p>
057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
058 * named following the pattern:
059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
060 * <p>
061 * The format of these resources is the following:
062 * <ul>
063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
064 * will be interpreted as:
065 * <ol>
066 * <li>pattern</li>
067 * <li>left context</li>
068 * <li>right context</li>
069 * <li>phoneme</li>
070 * </ol>
071 * </li>
072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
073 * as a comment.</li>
074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
075 * all content until a line ending in '*' and '/' is found.</li>
076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
077 * </ul>
078 *
079 * @since 1.6
080 * @version $Id: Rule.java 1760691 2016-09-14 12:14:26Z jochen $
081 */
082public class Rule {
083
084    public static final class Phoneme implements PhonemeExpr {
085        public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
086            @Override
087            public int compare(final Phoneme o1, final Phoneme o2) {
088                for (int i = 0; i < o1.phonemeText.length(); i++) {
089                    if (i >= o2.phonemeText.length()) {
090                        return +1;
091                    }
092                    final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
093                    if (c != 0) {
094                        return c;
095                    }
096                }
097
098                if (o1.phonemeText.length() < o2.phonemeText.length()) {
099                    return -1;
100                }
101
102                return 0;
103            }
104        };
105
106        private final StringBuilder phonemeText;
107        private final Languages.LanguageSet languages;
108
109        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110            this.phonemeText = new StringBuilder(phonemeText);
111            this.languages = languages;
112        }
113
114        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115            this(phonemeLeft.phonemeText, phonemeLeft.languages);
116            this.phonemeText.append(phonemeRight.phonemeText);
117        }
118
119        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120            this(phonemeLeft.phonemeText, languages);
121            this.phonemeText.append(phonemeRight.phonemeText);
122        }
123
124        public Phoneme append(final CharSequence str) {
125            this.phonemeText.append(str);
126            return this;
127        }
128
129        public Languages.LanguageSet getLanguages() {
130            return this.languages;
131        }
132
133        @Override
134        public Iterable<Phoneme> getPhonemes() {
135            return Collections.singleton(this);
136        }
137
138        public CharSequence getPhonemeText() {
139            return this.phonemeText;
140        }
141
142        /**
143         * Deprecated since 1.9.
144         *
145         * @param right the Phoneme to join
146         * @return a new Phoneme
147         * @deprecated since 1.9
148         */
149        @Deprecated
150        public Phoneme join(final Phoneme right) {
151            return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
152                               this.languages.restrictTo(right.languages));
153        }
154
155        /**
156         * Returns a new Phoneme with the same text but a union of its
157         * current language set and the given one.
158         *
159         * @param lang the language set to merge
160         * @return a new Phoneme
161         */
162        public Phoneme mergeWithLanguage(final LanguageSet lang) {
163          return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
164        }
165
166        @Override
167        public String toString() {
168          return phonemeText.toString() + "[" + languages + "]";
169        }
170    }
171
172    public interface PhonemeExpr {
173        Iterable<Phoneme> getPhonemes();
174    }
175
176    public static final class PhonemeList implements PhonemeExpr {
177        private final List<Phoneme> phonemes;
178
179        public PhonemeList(final List<Phoneme> phonemes) {
180            this.phonemes = phonemes;
181        }
182
183        @Override
184        public List<Phoneme> getPhonemes() {
185            return this.phonemes;
186        }
187    }
188
189    /**
190     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
191     */
192    public interface RPattern {
193        boolean isMatch(CharSequence input);
194    }
195
196    public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
197        @Override
198        public boolean isMatch(final CharSequence input) {
199            return true;
200        }
201    };
202
203    public static final String ALL = "ALL";
204
205    private static final String DOUBLE_QUOTE = "\"";
206
207    private static final String HASH_INCLUDE = "#include";
208
209    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
210            new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
211
212    static {
213        for (final NameType s : NameType.values()) {
214            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
215                    new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);
216
217            for (final RuleType rt : RuleType.values()) {
218                final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();
219
220                final Languages ls = Languages.getInstance(s);
221                for (final String l : ls.getLanguages()) {
222                    final Scanner scanner = createScanner(s, rt, l);
223                    try {
224                        rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
225                    } catch (final IllegalStateException e) {
226                        throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
227                    } finally {
228                        scanner.close();
229                    }
230                }
231                if (!rt.equals(RuleType.RULES)) {
232                    final Scanner scanner = createScanner(s, rt, "common");
233                    try {
234                        rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
235                    } finally {
236                        scanner.close();
237                    }
238                }
239
240                rts.put(rt, Collections.unmodifiableMap(rs));
241            }
242
243            RULES.put(s, Collections.unmodifiableMap(rts));
244        }
245    }
246
247    private static boolean contains(final CharSequence chars, final char input) {
248        for (int i = 0; i < chars.length(); i++) {
249            if (chars.charAt(i) == input) {
250                return true;
251            }
252        }
253        return false;
254    }
255
256    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
257        return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
258                             nameType.getName(), rt.getName(), lang);
259    }
260
261    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
262        final String resName = createResourceName(nameType, rt, lang);
263        final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
264
265        if (rulesIS == null) {
266            throw new IllegalArgumentException("Unable to load resource: " + resName);
267        }
268
269        return new Scanner(rulesIS, ResourceConstants.ENCODING);
270    }
271
272    private static Scanner createScanner(final String lang) {
273        final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
274        final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
275
276        if (rulesIS == null) {
277            throw new IllegalArgumentException("Unable to load resource: " + resName);
278        }
279
280        return new Scanner(rulesIS, ResourceConstants.ENCODING);
281    }
282
283    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
284        if (suffix.length() > input.length()) {
285            return false;
286        }
287        for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
288            if (input.charAt(i) != suffix.charAt(j)) {
289                return false;
290            }
291        }
292        return true;
293    }
294
295    /**
296     * Gets rules for a combination of name type, rule type and languages.
297     *
298     * @param nameType
299     *            the NameType to consider
300     * @param rt
301     *            the RuleType to consider
302     * @param langs
303     *            the set of languages to consider
304     * @return a list of Rules that apply
305     */
306    public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
307                                         final Languages.LanguageSet langs) {
308        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
309        final List<Rule> allRules = new ArrayList<Rule>();
310        for (final List<Rule> rules : ruleMap.values()) {
311            allRules.addAll(rules);
312        }
313        return allRules;
314    }
315
316    /**
317     * Gets rules for a combination of name type, rule type and a single language.
318     *
319     * @param nameType
320     *            the NameType to consider
321     * @param rt
322     *            the RuleType to consider
323     * @param lang
324     *            the language to consider
325     * @return a list of Rules that apply
326     */
327    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
328        return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
329    }
330
331    /**
332     * Gets rules for a combination of name type, rule type and languages.
333     *
334     * @param nameType
335     *            the NameType to consider
336     * @param rt
337     *            the RuleType to consider
338     * @param langs
339     *            the set of languages to consider
340     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
341     * @since 1.9
342     */
343    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
344                                                         final Languages.LanguageSet langs) {
345        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
346                                     getInstanceMap(nameType, rt, Languages.ANY);
347    }
348
349    /**
350     * Gets rules for a combination of name type, rule type and a single language.
351     *
352     * @param nameType
353     *            the NameType to consider
354     * @param rt
355     *            the RuleType to consider
356     * @param lang
357     *            the language to consider
358     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
359     * @since 1.9
360     */
361    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
362                                                         final String lang) {
363        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
364
365        if (rules == null) {
366            throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
367                                               nameType.getName(), rt.getName(), lang));
368        }
369
370        return rules;
371    }
372
373    private static Phoneme parsePhoneme(final String ph) {
374        final int open = ph.indexOf("[");
375        if (open >= 0) {
376            if (!ph.endsWith("]")) {
377                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
378            }
379            final String before = ph.substring(0, open);
380            final String in = ph.substring(open + 1, ph.length() - 1);
381            final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
382
383            return new Phoneme(before, Languages.LanguageSet.from(langs));
384        }
385        return new Phoneme(ph, Languages.ANY_LANGUAGE);
386    }
387
388    private static PhonemeExpr parsePhonemeExpr(final String ph) {
389        if (ph.startsWith("(")) { // we have a bracketed list of options
390            if (!ph.endsWith(")")) {
391                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
392            }
393
394            final List<Phoneme> phs = new ArrayList<Phoneme>();
395            final String body = ph.substring(1, ph.length() - 1);
396            for (final String part : body.split("[|]")) {
397                phs.add(parsePhoneme(part));
398            }
399            if (body.startsWith("|") || body.endsWith("|")) {
400                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
401            }
402
403            return new PhonemeList(phs);
404        }
405        return parsePhoneme(ph);
406    }
407
408    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
409        final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
410        int currentLine = 0;
411
412        boolean inMultilineComment = false;
413        while (scanner.hasNextLine()) {
414            currentLine++;
415            final String rawLine = scanner.nextLine();
416            String line = rawLine;
417
418            if (inMultilineComment) {
419                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
420                    inMultilineComment = false;
421                }
422            } else {
423                if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
424                    inMultilineComment = true;
425                } else {
426                    // discard comments
427                    final int cmtI = line.indexOf(ResourceConstants.CMT);
428                    if (cmtI >= 0) {
429                        line = line.substring(0, cmtI);
430                    }
431
432                    // trim leading-trailing whitespace
433                    line = line.trim();
434
435                    if (line.length() == 0) {
436                        continue; // empty lines can be safely skipped
437                    }
438
439                    if (line.startsWith(HASH_INCLUDE)) {
440                        // include statement
441                        final String incl = line.substring(HASH_INCLUDE.length()).trim();
442                        if (incl.contains(" ")) {
443                            throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
444                                                               location);
445                        }
446                        final Scanner hashIncludeScanner = createScanner(incl);
447                        try {
448                            lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
449                        } finally {
450                            hashIncludeScanner.close();
451                        }
452                    } else {
453                        // rule
454                        final String[] parts = line.split("\\s+");
455                        if (parts.length != 4) {
456                            throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
457                                                               " parts: " + rawLine + " in " + location);
458                        }
459                        try {
460                            final String pat = stripQuotes(parts[0]);
461                            final String lCon = stripQuotes(parts[1]);
462                            final String rCon = stripQuotes(parts[2]);
463                            final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
464                            final int cLine = currentLine;
465                            final Rule r = new Rule(pat, lCon, rCon, ph) {
466                                private final int myLine = cLine;
467                                private final String loc = location;
468
469                                @Override
470                                public String toString() {
471                                    final StringBuilder sb = new StringBuilder();
472                                    sb.append("Rule");
473                                    sb.append("{line=").append(myLine);
474                                    sb.append(", loc='").append(loc).append('\'');
475                                    sb.append(", pat='").append(pat).append('\'');
476                                    sb.append(", lcon='").append(lCon).append('\'');
477                                    sb.append(", rcon='").append(rCon).append('\'');
478                                    sb.append('}');
479                                    return sb.toString();
480                                }
481                            };
482                            final String patternKey = r.pattern.substring(0,1);
483                            List<Rule> rules = lines.get(patternKey);
484                            if (rules == null) {
485                                rules = new ArrayList<Rule>();
486                                lines.put(patternKey, rules);
487                            }
488                            rules.add(r);
489                        } catch (final IllegalArgumentException e) {
490                            throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
491                                                            location, e);
492                        }
493                    }
494                }
495            }
496        }
497
498        return lines;
499    }
500
501    /**
502     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
503     *
504     * @param regex
505     *            the regular expression to compile
506     * @return an RPattern that will match this regex
507     */
508    private static RPattern pattern(final String regex) {
509        final boolean startsWith = regex.startsWith("^");
510        final boolean endsWith = regex.endsWith("$");
511        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
512        final boolean boxes = content.contains("[");
513
514        if (!boxes) {
515            if (startsWith && endsWith) {
516                // exact match
517                if (content.length() == 0) {
518                    // empty
519                    return new RPattern() {
520                        @Override
521                        public boolean isMatch(final CharSequence input) {
522                            return input.length() == 0;
523                        }
524                    };
525                }
526                return new RPattern() {
527                    @Override
528                    public boolean isMatch(final CharSequence input) {
529                        return input.equals(content);
530                    }
531                };
532            } else if ((startsWith || endsWith) && content.length() == 0) {
533                // matches every string
534                return ALL_STRINGS_RMATCHER;
535            } else if (startsWith) {
536                // matches from start
537                return new RPattern() {
538                    @Override
539                    public boolean isMatch(final CharSequence input) {
540                        return startsWith(input, content);
541                    }
542                };
543            } else if (endsWith) {
544                // matches from start
545                return new RPattern() {
546                    @Override
547                    public boolean isMatch(final CharSequence input) {
548                        return endsWith(input, content);
549                    }
550                };
551            }
552        } else {
553            final boolean startsWithBox = content.startsWith("[");
554            final boolean endsWithBox = content.endsWith("]");
555
556            if (startsWithBox && endsWithBox) {
557                String boxContent = content.substring(1, content.length() - 1);
558                if (!boxContent.contains("[")) {
559                    // box containing alternatives
560                    final boolean negate = boxContent.startsWith("^");
561                    if (negate) {
562                        boxContent = boxContent.substring(1);
563                    }
564                    final String bContent = boxContent;
565                    final boolean shouldMatch = !negate;
566
567                    if (startsWith && endsWith) {
568                        // exact match
569                        return new RPattern() {
570                            @Override
571                            public boolean isMatch(final CharSequence input) {
572                                return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
573                            }
574                        };
575                    } else if (startsWith) {
576                        // first char
577                        return new RPattern() {
578                            @Override
579                            public boolean isMatch(final CharSequence input) {
580                                return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
581                            }
582                        };
583                    } else if (endsWith) {
584                        // last char
585                        return new RPattern() {
586                            @Override
587                            public boolean isMatch(final CharSequence input) {
588                                return input.length() > 0 &&
589                                       contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
590                            }
591                        };
592                    }
593                }
594            }
595        }
596
597        return new RPattern() {
598            Pattern pattern = Pattern.compile(regex);
599
600            @Override
601            public boolean isMatch(final CharSequence input) {
602                final Matcher matcher = pattern.matcher(input);
603                return matcher.find();
604            }
605        };
606    }
607
608    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
609        if (prefix.length() > input.length()) {
610            return false;
611        }
612        for (int i = 0; i < prefix.length(); i++) {
613            if (input.charAt(i) != prefix.charAt(i)) {
614                return false;
615            }
616        }
617        return true;
618    }
619
620    private static String stripQuotes(String str) {
621        if (str.startsWith(DOUBLE_QUOTE)) {
622            str = str.substring(1);
623        }
624
625        if (str.endsWith(DOUBLE_QUOTE)) {
626            str = str.substring(0, str.length() - 1);
627        }
628
629        return str;
630    }
631
632    private final RPattern lContext;
633
634    private final String pattern;
635
636    private final PhonemeExpr phoneme;
637
638    private final RPattern rContext;
639
640    /**
641     * Creates a new rule.
642     *
643     * @param pattern
644     *            the pattern
645     * @param lContext
646     *            the left context
647     * @param rContext
648     *            the right context
649     * @param phoneme
650     *            the resulting phoneme
651     */
652    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
653        this.pattern = pattern;
654        this.lContext = pattern(lContext + "$");
655        this.rContext = pattern("^" + rContext);
656        this.phoneme = phoneme;
657    }
658
659    /**
660     * Gets the left context. This is a regular expression that must match to the left of the pattern.
661     *
662     * @return the left context Pattern
663     */
664    public RPattern getLContext() {
665        return this.lContext;
666    }
667
668    /**
669     * Gets the pattern. This is a string-literal that must exactly match.
670     *
671     * @return the pattern
672     */
673    public String getPattern() {
674        return this.pattern;
675    }
676
677    /**
678     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
679     *
680     * @return the phoneme
681     */
682    public PhonemeExpr getPhoneme() {
683        return this.phoneme;
684    }
685
686    /**
687     * Gets the right context. This is a regular expression that must match to the right of the pattern.
688     *
689     * @return the right context Pattern
690     */
691    public RPattern getRContext() {
692        return this.rContext;
693    }
694
695    /**
696     * Decides if the pattern and context match the input starting at a position. It is a match if the
697     * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
698     * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
699     *
700     * @param input
701     *            the input String
702     * @param i
703     *            the int position within the input
704     * @return true if the pattern and left/right context match, false otherwise
705     */
706    public boolean patternAndContextMatches(final CharSequence input, final int i) {
707        if (i < 0) {
708            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
709        }
710
711        final int patternLength = this.pattern.length();
712        final int ipl = i + patternLength;
713
714        if (ipl > input.length()) {
715            // not enough room for the pattern to match
716            return false;
717        }
718
719        // evaluate the pattern, left context and right context
720        // fail early if any of the evaluations is not successful
721        if (!input.subSequence(i, ipl).equals(this.pattern)) {
722            return false;
723        } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
724            return false;
725        }
726        return this.lContext.isMatch(input.subSequence(0, i));
727    }
728}