Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Comparator;
024import java.util.EnumMap;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Scanner;
030import java.util.Set;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import org.apache.commons.codec.Resources;
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
041 * and a logical flag indicating if all languages must be in play. A rule matches if:
042 * <ul>
043 * <li>the pattern matches at the current position</li>
044 * <li>the string up until the beginning of the pattern matches the left context</li>
045 * <li>the string from the end of the pattern matches the right context</li>
046 * <li>logical is ALL and all languages are in scope; or</li>
047 * <li>logical is any other value and at least one language is in scope</li>
048 * </ul>
049 * <p>
050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
051 * to explicitly construct their own.
052 * <p>
053 * Rules are immutable and thread-safe.
054 * <p>
055 * <b>Rules resources</b>
056 * <p>
057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
058 * named following the pattern:
059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
060 * <p>
061 * The format of these resources is the following:
062 * <ul>
063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
064 * will be interpreted as:
065 * <ol>
066 * <li>pattern</li>
067 * <li>left context</li>
068 * <li>right context</li>
069 * <li>phoneme</li>
070 * </ol>
071 * </li>
072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
073 * as a comment.</li>
074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
075 * all content until a line ending in '*' and '/' is found.</li>
076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
077 * </ul>
078 *
079 * @since 1.6
080 */
081public class Rule {
082
083    public static final class Phoneme implements PhonemeExpr {
084        public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
085            @Override
086            public int compare(final Phoneme o1, final Phoneme o2) {
087                for (int i = 0; i < o1.phonemeText.length(); i++) {
088                    if (i >= o2.phonemeText.length()) {
089                        return +1;
090                    }
091                    final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
092                    if (c != 0) {
093                        return c;
094                    }
095                }
096
097                if (o1.phonemeText.length() < o2.phonemeText.length()) {
098                    return -1;
099                }
100
101                return 0;
102            }
103        };
104
105        private final StringBuilder phonemeText;
106        private final Languages.LanguageSet languages;
107
108        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
109            this.phonemeText = new StringBuilder(phonemeText);
110            this.languages = languages;
111        }
112
113        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
114            this(phonemeLeft.phonemeText, phonemeLeft.languages);
115            this.phonemeText.append(phonemeRight.phonemeText);
116        }
117
118        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
119            this(phonemeLeft.phonemeText, languages);
120            this.phonemeText.append(phonemeRight.phonemeText);
121        }
122
123        public Phoneme append(final CharSequence str) {
124            this.phonemeText.append(str);
125            return this;
126        }
127
128        public Languages.LanguageSet getLanguages() {
129            return this.languages;
130        }
131
132        @Override
133        public Iterable<Phoneme> getPhonemes() {
134            return Collections.singleton(this);
135        }
136
137        public CharSequence getPhonemeText() {
138            return this.phonemeText;
139        }
140
141        /**
142         * Deprecated since 1.9.
143         *
144         * @param right the Phoneme to join
145         * @return a new Phoneme
146         * @deprecated since 1.9
147         */
148        @Deprecated
149        public Phoneme join(final Phoneme right) {
150            return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
151                               this.languages.restrictTo(right.languages));
152        }
153
154        /**
155         * Returns a new Phoneme with the same text but a union of its
156         * current language set and the given one.
157         *
158         * @param lang the language set to merge
159         * @return a new Phoneme
160         */
161        public Phoneme mergeWithLanguage(final LanguageSet lang) {
162          return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
163        }
164
165        @Override
166        public String toString() {
167          return phonemeText.toString() + "[" + languages + "]";
168        }
169    }
170
171    public interface PhonemeExpr {
172        Iterable<Phoneme> getPhonemes();
173    }
174
175    public static final class PhonemeList implements PhonemeExpr {
176        private final List<Phoneme> phonemes;
177
178        public PhonemeList(final List<Phoneme> phonemes) {
179            this.phonemes = phonemes;
180        }
181
182        @Override
183        public List<Phoneme> getPhonemes() {
184            return this.phonemes;
185        }
186    }
187
188    /**
189     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
190     */
191    public interface RPattern {
192        boolean isMatch(CharSequence input);
193    }
194
195    public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
196        @Override
197        public boolean isMatch(final CharSequence input) {
198            return true;
199        }
200    };
201
202    public static final String ALL = "ALL";
203
204    private static final String DOUBLE_QUOTE = "\"";
205
206    private static final String HASH_INCLUDE = "#include";
207
208    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
209            new EnumMap<>(NameType.class);
210
211    static {
212        for (final NameType s : NameType.values()) {
213            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
214                    new EnumMap<>(RuleType.class);
215
216            for (final RuleType rt : RuleType.values()) {
217                final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
218
219                final Languages ls = Languages.getInstance(s);
220                for (final String l : ls.getLanguages()) {
221                    try (final Scanner scanner = createScanner(s, rt, l)) {
222                        rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
223                    } catch (final IllegalStateException e) {
224                        throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
225                    }
226                }
227                if (!rt.equals(RuleType.RULES)) {
228                    try (final Scanner scanner = createScanner(s, rt, "common")) {
229                        rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
230                    }
231                }
232
233                rts.put(rt, Collections.unmodifiableMap(rs));
234            }
235
236            RULES.put(s, Collections.unmodifiableMap(rts));
237        }
238    }
239
240    private static boolean contains(final CharSequence chars, final char input) {
241        for (int i = 0; i < chars.length(); i++) {
242            if (chars.charAt(i) == input) {
243                return true;
244            }
245        }
246        return false;
247    }
248
249    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
250        return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
251                             nameType.getName(), rt.getName(), lang);
252    }
253
254    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
255        final String resName = createResourceName(nameType, rt, lang);
256        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
257    }
258
259    private static Scanner createScanner(final String lang) {
260        final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
261        return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
262    }
263
264    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
265        if (suffix.length() > input.length()) {
266            return false;
267        }
268        for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
269            if (input.charAt(i) != suffix.charAt(j)) {
270                return false;
271            }
272        }
273        return true;
274    }
275
276    /**
277     * Gets rules for a combination of name type, rule type and languages.
278     *
279     * @param nameType
280     *            the NameType to consider
281     * @param rt
282     *            the RuleType to consider
283     * @param langs
284     *            the set of languages to consider
285     * @return a list of Rules that apply
286     */
287    public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
288                                         final Languages.LanguageSet langs) {
289        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
290        final List<Rule> allRules = new ArrayList<>();
291        for (final List<Rule> rules : ruleMap.values()) {
292            allRules.addAll(rules);
293        }
294        return allRules;
295    }
296
297    /**
298     * Gets rules for a combination of name type, rule type and a single language.
299     *
300     * @param nameType
301     *            the NameType to consider
302     * @param rt
303     *            the RuleType to consider
304     * @param lang
305     *            the language to consider
306     * @return a list of Rules that apply
307     */
308    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
309        return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
310    }
311
312    /**
313     * Gets rules for a combination of name type, rule type and languages.
314     *
315     * @param nameType
316     *            the NameType to consider
317     * @param rt
318     *            the RuleType to consider
319     * @param langs
320     *            the set of languages to consider
321     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
322     * @since 1.9
323     */
324    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
325                                                         final Languages.LanguageSet langs) {
326        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
327                                     getInstanceMap(nameType, rt, Languages.ANY);
328    }
329
330    /**
331     * Gets rules for a combination of name type, rule type and a single language.
332     *
333     * @param nameType
334     *            the NameType to consider
335     * @param rt
336     *            the RuleType to consider
337     * @param lang
338     *            the language to consider
339     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
340     * @since 1.9
341     */
342    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
343                                                         final String lang) {
344        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
345
346        if (rules == null) {
347            throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
348                                               nameType.getName(), rt.getName(), lang));
349        }
350
351        return rules;
352    }
353
354    private static Phoneme parsePhoneme(final String ph) {
355        final int open = ph.indexOf("[");
356        if (open >= 0) {
357            if (!ph.endsWith("]")) {
358                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
359            }
360            final String before = ph.substring(0, open);
361            final String in = ph.substring(open + 1, ph.length() - 1);
362            final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
363
364            return new Phoneme(before, Languages.LanguageSet.from(langs));
365        }
366        return new Phoneme(ph, Languages.ANY_LANGUAGE);
367    }
368
369    private static PhonemeExpr parsePhonemeExpr(final String ph) {
370        if (ph.startsWith("(")) { // we have a bracketed list of options
371            if (!ph.endsWith(")")) {
372                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
373            }
374
375            final List<Phoneme> phs = new ArrayList<>();
376            final String body = ph.substring(1, ph.length() - 1);
377            for (final String part : body.split("[|]")) {
378                phs.add(parsePhoneme(part));
379            }
380            if (body.startsWith("|") || body.endsWith("|")) {
381                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
382            }
383
384            return new PhonemeList(phs);
385        }
386        return parsePhoneme(ph);
387    }
388
389    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
390        final Map<String, List<Rule>> lines = new HashMap<>();
391        int currentLine = 0;
392
393        boolean inMultilineComment = false;
394        while (scanner.hasNextLine()) {
395            currentLine++;
396            final String rawLine = scanner.nextLine();
397            String line = rawLine;
398
399            if (inMultilineComment) {
400                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
401                    inMultilineComment = false;
402                }
403            } else {
404                if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
405                    inMultilineComment = true;
406                } else {
407                    // discard comments
408                    final int cmtI = line.indexOf(ResourceConstants.CMT);
409                    if (cmtI >= 0) {
410                        line = line.substring(0, cmtI);
411                    }
412
413                    // trim leading-trailing whitespace
414                    line = line.trim();
415
416                    if (line.length() == 0) {
417                        continue; // empty lines can be safely skipped
418                    }
419
420                    if (line.startsWith(HASH_INCLUDE)) {
421                        // include statement
422                        final String incl = line.substring(HASH_INCLUDE.length()).trim();
423                        if (incl.contains(" ")) {
424                            throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
425                                                               location);
426                        }
427                        try (final Scanner hashIncludeScanner = createScanner(incl)) {
428                            lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
429                        }
430                    } else {
431                        // rule
432                        final String[] parts = line.split("\\s+");
433                        if (parts.length != 4) {
434                            throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
435                                                               " parts: " + rawLine + " in " + location);
436                        }
437                        try {
438                            final String pat = stripQuotes(parts[0]);
439                            final String lCon = stripQuotes(parts[1]);
440                            final String rCon = stripQuotes(parts[2]);
441                            final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
442                            final int cLine = currentLine;
443                            final Rule r = new Rule(pat, lCon, rCon, ph) {
444                                private final int myLine = cLine;
445                                private final String loc = location;
446
447                                @Override
448                                public String toString() {
449                                    final StringBuilder sb = new StringBuilder();
450                                    sb.append("Rule");
451                                    sb.append("{line=").append(myLine);
452                                    sb.append(", loc='").append(loc).append('\'');
453                                    sb.append(", pat='").append(pat).append('\'');
454                                    sb.append(", lcon='").append(lCon).append('\'');
455                                    sb.append(", rcon='").append(rCon).append('\'');
456                                    sb.append('}');
457                                    return sb.toString();
458                                }
459                            };
460                            final String patternKey = r.pattern.substring(0,1);
461                            List<Rule> rules = lines.get(patternKey);
462                            if (rules == null) {
463                                rules = new ArrayList<>();
464                                lines.put(patternKey, rules);
465                            }
466                            rules.add(r);
467                        } catch (final IllegalArgumentException e) {
468                            throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
469                                                            location, e);
470                        }
471                    }
472                }
473            }
474        }
475
476        return lines;
477    }
478
479    /**
480     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
481     *
482     * @param regex
483     *            the regular expression to compile
484     * @return an RPattern that will match this regex
485     */
486    private static RPattern pattern(final String regex) {
487        final boolean startsWith = regex.startsWith("^");
488        final boolean endsWith = regex.endsWith("$");
489        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
490        final boolean boxes = content.contains("[");
491
492        if (!boxes) {
493            if (startsWith && endsWith) {
494                // exact match
495                if (content.length() == 0) {
496                    // empty
497                    return new RPattern() {
498                        @Override
499                        public boolean isMatch(final CharSequence input) {
500                            return input.length() == 0;
501                        }
502                    };
503                }
504                return new RPattern() {
505                    @Override
506                    public boolean isMatch(final CharSequence input) {
507                        return input.equals(content);
508                    }
509                };
510            } else if ((startsWith || endsWith) && content.length() == 0) {
511                // matches every string
512                return ALL_STRINGS_RMATCHER;
513            } else if (startsWith) {
514                // matches from start
515                return new RPattern() {
516                    @Override
517                    public boolean isMatch(final CharSequence input) {
518                        return startsWith(input, content);
519                    }
520                };
521            } else if (endsWith) {
522                // matches from start
523                return new RPattern() {
524                    @Override
525                    public boolean isMatch(final CharSequence input) {
526                        return endsWith(input, content);
527                    }
528                };
529            }
530        } else {
531            final boolean startsWithBox = content.startsWith("[");
532            final boolean endsWithBox = content.endsWith("]");
533
534            if (startsWithBox && endsWithBox) {
535                String boxContent = content.substring(1, content.length() - 1);
536                if (!boxContent.contains("[")) {
537                    // box containing alternatives
538                    final boolean negate = boxContent.startsWith("^");
539                    if (negate) {
540                        boxContent = boxContent.substring(1);
541                    }
542                    final String bContent = boxContent;
543                    final boolean shouldMatch = !negate;
544
545                    if (startsWith && endsWith) {
546                        // exact match
547                        return new RPattern() {
548                            @Override
549                            public boolean isMatch(final CharSequence input) {
550                                return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
551                            }
552                        };
553                    } else if (startsWith) {
554                        // first char
555                        return new RPattern() {
556                            @Override
557                            public boolean isMatch(final CharSequence input) {
558                                return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
559                            }
560                        };
561                    } else if (endsWith) {
562                        // last char
563                        return new RPattern() {
564                            @Override
565                            public boolean isMatch(final CharSequence input) {
566                                return input.length() > 0 &&
567                                       contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
568                            }
569                        };
570                    }
571                }
572            }
573        }
574
575        return new RPattern() {
576            Pattern pattern = Pattern.compile(regex);
577
578            @Override
579            public boolean isMatch(final CharSequence input) {
580                final Matcher matcher = pattern.matcher(input);
581                return matcher.find();
582            }
583        };
584    }
585
586    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
587        if (prefix.length() > input.length()) {
588            return false;
589        }
590        for (int i = 0; i < prefix.length(); i++) {
591            if (input.charAt(i) != prefix.charAt(i)) {
592                return false;
593            }
594        }
595        return true;
596    }
597
598    private static String stripQuotes(String str) {
599        if (str.startsWith(DOUBLE_QUOTE)) {
600            str = str.substring(1);
601        }
602
603        if (str.endsWith(DOUBLE_QUOTE)) {
604            str = str.substring(0, str.length() - 1);
605        }
606
607        return str;
608    }
609
610    private final RPattern lContext;
611
612    private final String pattern;
613
614    private final PhonemeExpr phoneme;
615
616    private final RPattern rContext;
617
618    /**
619     * Creates a new rule.
620     *
621     * @param pattern
622     *            the pattern
623     * @param lContext
624     *            the left context
625     * @param rContext
626     *            the right context
627     * @param phoneme
628     *            the resulting phoneme
629     */
630    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
631        this.pattern = pattern;
632        this.lContext = pattern(lContext + "$");
633        this.rContext = pattern("^" + rContext);
634        this.phoneme = phoneme;
635    }
636
637    /**
638     * Gets the left context. This is a regular expression that must match to the left of the pattern.
639     *
640     * @return the left context Pattern
641     */
642    public RPattern getLContext() {
643        return this.lContext;
644    }
645
646    /**
647     * Gets the pattern. This is a string-literal that must exactly match.
648     *
649     * @return the pattern
650     */
651    public String getPattern() {
652        return this.pattern;
653    }
654
655    /**
656     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
657     *
658     * @return the phoneme
659     */
660    public PhonemeExpr getPhoneme() {
661        return this.phoneme;
662    }
663
664    /**
665     * Gets the right context. This is a regular expression that must match to the right of the pattern.
666     *
667     * @return the right context Pattern
668     */
669    public RPattern getRContext() {
670        return this.rContext;
671    }
672
673    /**
674     * Decides if the pattern and context match the input starting at a position. It is a match if the
675     * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
676     * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
677     *
678     * @param input
679     *            the input String
680     * @param i
681     *            the int position within the input
682     * @return true if the pattern and left/right context match, false otherwise
683     */
684    public boolean patternAndContextMatches(final CharSequence input, final int i) {
685        if (i < 0) {
686            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
687        }
688
689        final int patternLength = this.pattern.length();
690        final int ipl = i + patternLength;
691
692        if (ipl > input.length()) {
693            // not enough room for the pattern to match
694            return false;
695        }
696
697        // evaluate the pattern, left context and right context
698        // fail early if any of the evaluations is not successful
699        if (!input.subSequence(i, ipl).equals(this.pattern)) {
700            return false;
701        } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
702            return false;
703        }
704        return this.lContext.isMatch(input.subSequence(0, i));
705    }
706}