001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import java.io.InputStream;
021import java.util.ArrayList;
022import java.util.Arrays;
023import java.util.Collections;
024import java.util.Comparator;
025import java.util.EnumMap;
026import java.util.HashMap;
027import java.util.HashSet;
028import java.util.List;
029import java.util.Map;
030import java.util.Scanner;
031import java.util.Set;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import org.apache.commons.codec.language.bm.Languages.LanguageSet;
036
037/**
038 * A phoneme rule.
039 * <p>
040 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
041 * and a logical flag indicating if all languages must be in play. A rule matches if:
042 * <ul>
043 * <li>the pattern matches at the current position</li>
044 * <li>the string up until the beginning of the pattern matches the left context</li>
045 * <li>the string from the end of the pattern matches the right context</li>
046 * <li>logical is ALL and all languages are in scope; or</li>
047 * <li>logical is any other value and at least one language is in scope</li>
048 * </ul>
049 * <p>
050 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
051 * to explicitly construct their own.
052 * <p>
053 * Rules are immutable and thread-safe.
054 * <p>
055 * <b>Rules resources</b>
056 * <p>
057 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
058 * named following the pattern:
059 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
060 * <p>
061 * The format of these resources is the following:
062 * <ul>
063 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
064 * will be interpreted as:
065 * <ol>
066 * <li>pattern</li>
067 * <li>left context</li>
068 * <li>right context</li>
069 * <li>phoneme</li>
070 * </ol>
071 * </li>
072 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
073 * as a comment.</li>
074 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
075 * all content until a line ending in '*' and '/' is found.</li>
076 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
077 * </ul>
078 *
079 * @since 1.6
080 * @version $Id: Rule.Phoneme.html 891688 2013-12-24 20:49:46Z ggregory $
081 */
082public class Rule {
083
084    public static final class Phoneme implements PhonemeExpr {
085        public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
086            @Override
087            public int compare(final Phoneme o1, final Phoneme o2) {
088                for (int i = 0; i < o1.phonemeText.length(); i++) {
089                    if (i >= o2.phonemeText.length()) {
090                        return +1;
091                    }
092                    final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
093                    if (c != 0) {
094                        return c;
095                    }
096                }
097
098                if (o1.phonemeText.length() < o2.phonemeText.length()) {
099                    return -1;
100                }
101
102                return 0;
103            }
104        };
105
106        private final StringBuilder phonemeText;
107        private final Languages.LanguageSet languages;
108
109        public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110            this.phonemeText = new StringBuilder(phonemeText);
111            this.languages = languages;
112        }
113
114        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115            this(phonemeLeft.phonemeText, phonemeLeft.languages);
116            this.phonemeText.append(phonemeRight.phonemeText);
117        }
118
119        public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120            this(phonemeLeft.phonemeText, languages);
121            this.phonemeText.append(phonemeRight.phonemeText);
122        }
123
124        public Phoneme append(final CharSequence str) {
125            this.phonemeText.append(str);
126            return this;
127        }
128
129        public Languages.LanguageSet getLanguages() {
130            return this.languages;
131        }
132
133        @Override
134        public Iterable<Phoneme> getPhonemes() {
135            return Collections.singleton(this);
136        }
137
138        public CharSequence getPhonemeText() {
139            return this.phonemeText;
140        }
141
142        /**
143         * @deprecated since 1.9
144         */
145        @Deprecated
146        public Phoneme join(final Phoneme right) {
147            return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
148                               this.languages.restrictTo(right.languages));
149        }
150    }
151
152    public interface PhonemeExpr {
153        Iterable<Phoneme> getPhonemes();
154    }
155
156    public static final class PhonemeList implements PhonemeExpr {
157        private final List<Phoneme> phonemes;
158
159        public PhonemeList(final List<Phoneme> phonemes) {
160            this.phonemes = phonemes;
161        }
162
163        @Override
164        public List<Phoneme> getPhonemes() {
165            return this.phonemes;
166        }
167    }
168
169    /**
170     * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
171     */
172    public interface RPattern {
173        boolean isMatch(CharSequence input);
174    }
175
176    public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
177        @Override
178        public boolean isMatch(final CharSequence input) {
179            return true;
180        }
181    };
182
183    public static final String ALL = "ALL";
184
185    private static final String DOUBLE_QUOTE = "\"";
186
187    private static final String HASH_INCLUDE = "#include";
188
189    private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
190            new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
191
192    static {
193        for (final NameType s : NameType.values()) {
194            final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
195                    new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);
196
197            for (final RuleType rt : RuleType.values()) {
198                final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();
199
200                final Languages ls = Languages.getInstance(s);
201                for (final String l : ls.getLanguages()) {
202                    try {
203                        rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
204                    } catch (final IllegalStateException e) {
205                        throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
206                    }
207                }
208                if (!rt.equals(RuleType.RULES)) {
209                    rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
210                }
211
212                rts.put(rt, Collections.unmodifiableMap(rs));
213            }
214
215            RULES.put(s, Collections.unmodifiableMap(rts));
216        }
217    }
218
219    private static boolean contains(final CharSequence chars, final char input) {
220        for (int i = 0; i < chars.length(); i++) {
221            if (chars.charAt(i) == input) {
222                return true;
223            }
224        }
225        return false;
226    }
227
228    private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
229        return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
230                             nameType.getName(), rt.getName(), lang);
231    }
232
233    private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
234        final String resName = createResourceName(nameType, rt, lang);
235        final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
236
237        if (rulesIS == null) {
238            throw new IllegalArgumentException("Unable to load resource: " + resName);
239        }
240
241        return new Scanner(rulesIS, ResourceConstants.ENCODING);
242    }
243
244    private static Scanner createScanner(final String lang) {
245        final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
246        final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
247
248        if (rulesIS == null) {
249            throw new IllegalArgumentException("Unable to load resource: " + resName);
250        }
251
252        return new Scanner(rulesIS, ResourceConstants.ENCODING);
253    }
254
255    private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
256        if (suffix.length() > input.length()) {
257            return false;
258        }
259        for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
260            if (input.charAt(i) != suffix.charAt(j)) {
261                return false;
262            }
263        }
264        return true;
265    }
266
267    /**
268     * Gets rules for a combination of name type, rule type and languages.
269     *
270     * @param nameType
271     *            the NameType to consider
272     * @param rt
273     *            the RuleType to consider
274     * @param langs
275     *            the set of languages to consider
276     * @return a list of Rules that apply
277     */
278    public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
279                                         final Languages.LanguageSet langs) {
280        final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
281        final List<Rule> allRules = new ArrayList<Rule>();
282        for (final List<Rule> rules : ruleMap.values()) {
283            allRules.addAll(rules);
284        }
285        return allRules;
286    }
287
288    /**
289     * Gets rules for a combination of name type, rule type and a single language.
290     *
291     * @param nameType
292     *            the NameType to consider
293     * @param rt
294     *            the RuleType to consider
295     * @param lang
296     *            the language to consider
297     * @return a list of Rules that apply
298     */
299    public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
300        return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
301    }
302
303    /**
304     * Gets rules for a combination of name type, rule type and languages.
305     *
306     * @param nameType
307     *            the NameType to consider
308     * @param rt
309     *            the RuleType to consider
310     * @param langs
311     *            the set of languages to consider
312     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
313     * @since 1.9
314     */
315    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
316                                                         final Languages.LanguageSet langs) {
317        return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
318                                     getInstanceMap(nameType, rt, Languages.ANY);
319    }
320
321    /**
322     * Gets rules for a combination of name type, rule type and a single language.
323     *
324     * @param nameType
325     *            the NameType to consider
326     * @param rt
327     *            the RuleType to consider
328     * @param lang
329     *            the language to consider
330     * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
331     * @since 1.9
332     */
333    public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
334                                                         final String lang) {
335        final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
336
337        if (rules == null) {
338            throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
339                                               nameType.getName(), rt.getName(), lang));
340        }
341
342        return rules;
343    }
344
345    private static Phoneme parsePhoneme(final String ph) {
346        final int open = ph.indexOf("[");
347        if (open >= 0) {
348            if (!ph.endsWith("]")) {
349                throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
350            }
351            final String before = ph.substring(0, open);
352            final String in = ph.substring(open + 1, ph.length() - 1);
353            final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
354
355            return new Phoneme(before, Languages.LanguageSet.from(langs));
356        } else {
357            return new Phoneme(ph, Languages.ANY_LANGUAGE);
358        }
359    }
360
361    private static PhonemeExpr parsePhonemeExpr(final String ph) {
362        if (ph.startsWith("(")) { // we have a bracketed list of options
363            if (!ph.endsWith(")")) {
364                throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
365            }
366
367            final List<Phoneme> phs = new ArrayList<Phoneme>();
368            final String body = ph.substring(1, ph.length() - 1);
369            for (final String part : body.split("[|]")) {
370                phs.add(parsePhoneme(part));
371            }
372            if (body.startsWith("|") || body.endsWith("|")) {
373                phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
374            }
375
376            return new PhonemeList(phs);
377        } else {
378            return parsePhoneme(ph);
379        }
380    }
381
382    private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
383        final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
384        int currentLine = 0;
385
386        boolean inMultilineComment = false;
387        while (scanner.hasNextLine()) {
388            currentLine++;
389            final String rawLine = scanner.nextLine();
390            String line = rawLine;
391
392            if (inMultilineComment) {
393                if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
394                    inMultilineComment = false;
395                }
396            } else {
397                if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
398                    inMultilineComment = true;
399                } else {
400                    // discard comments
401                    final int cmtI = line.indexOf(ResourceConstants.CMT);
402                    if (cmtI >= 0) {
403                        line = line.substring(0, cmtI);
404                    }
405
406                    // trim leading-trailing whitespace
407                    line = line.trim();
408
409                    if (line.length() == 0) {
410                        continue; // empty lines can be safely skipped
411                    }
412
413                    if (line.startsWith(HASH_INCLUDE)) {
414                        // include statement
415                        final String incl = line.substring(HASH_INCLUDE.length()).trim();
416                        if (incl.contains(" ")) {
417                            throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
418                                                               location);
419                        } else {
420                            lines.putAll(parseRules(createScanner(incl), location + "->" + incl));
421                        }
422                    } else {
423                        // rule
424                        final String[] parts = line.split("\\s+");
425                        if (parts.length != 4) {
426                            throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
427                                                               " parts: " + rawLine + " in " + location);
428                        } else {
429                            try {
430                                final String pat = stripQuotes(parts[0]);
431                                final String lCon = stripQuotes(parts[1]);
432                                final String rCon = stripQuotes(parts[2]);
433                                final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
434                                final int cLine = currentLine;
435                                final Rule r = new Rule(pat, lCon, rCon, ph) {
436                                    private final int myLine = cLine;
437                                    private final String loc = location;
438
439                                    @Override
440                                    public String toString() {
441                                        final StringBuilder sb = new StringBuilder();
442                                        sb.append("Rule");
443                                        sb.append("{line=").append(myLine);
444                                        sb.append(", loc='").append(loc).append('\'');
445                                        sb.append('}');
446                                        return sb.toString();
447                                    }
448                                };
449                                final String patternKey = r.pattern.substring(0,1);
450                                List<Rule> rules = lines.get(patternKey);
451                                if (rules == null) {
452                                    rules = new ArrayList<Rule>();
453                                    lines.put(patternKey, rules);
454                                }
455                                rules.add(r);
456                            } catch (final IllegalArgumentException e) {
457                                throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
458                                                                location, e);
459                            }
460                        }
461                    }
462                }
463            }
464        }
465
466        return lines;
467    }
468
469    /**
470     * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
471     *
472     * @param regex
473     *            the regular expression to compile
474     * @return an RPattern that will match this regex
475     */
476    private static RPattern pattern(final String regex) {
477        final boolean startsWith = regex.startsWith("^");
478        final boolean endsWith = regex.endsWith("$");
479        final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
480        final boolean boxes = content.contains("[");
481
482        if (!boxes) {
483            if (startsWith && endsWith) {
484                // exact match
485                if (content.length() == 0) {
486                    // empty
487                    return new RPattern() {
488                        @Override
489                        public boolean isMatch(final CharSequence input) {
490                            return input.length() == 0;
491                        }
492                    };
493                } else {
494                    return new RPattern() {
495                        @Override
496                        public boolean isMatch(final CharSequence input) {
497                            return input.equals(content);
498                        }
499                    };
500                }
501            } else if ((startsWith || endsWith) && content.length() == 0) {
502                // matches every string
503                return ALL_STRINGS_RMATCHER;
504            } else if (startsWith) {
505                // matches from start
506                return new RPattern() {
507                    @Override
508                    public boolean isMatch(final CharSequence input) {
509                        return startsWith(input, content);
510                    }
511                };
512            } else if (endsWith) {
513                // matches from start
514                return new RPattern() {
515                    @Override
516                    public boolean isMatch(final CharSequence input) {
517                        return endsWith(input, content);
518                    }
519                };
520            }
521        } else {
522            final boolean startsWithBox = content.startsWith("[");
523            final boolean endsWithBox = content.endsWith("]");
524
525            if (startsWithBox && endsWithBox) {
526                String boxContent = content.substring(1, content.length() - 1);
527                if (!boxContent.contains("[")) {
528                    // box containing alternatives
529                    final boolean negate = boxContent.startsWith("^");
530                    if (negate) {
531                        boxContent = boxContent.substring(1);
532                    }
533                    final String bContent = boxContent;
534                    final boolean shouldMatch = !negate;
535
536                    if (startsWith && endsWith) {
537                        // exact match
538                        return new RPattern() {
539                            @Override
540                            public boolean isMatch(final CharSequence input) {
541                                return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
542                            }
543                        };
544                    } else if (startsWith) {
545                        // first char
546                        return new RPattern() {
547                            @Override
548                            public boolean isMatch(final CharSequence input) {
549                                return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
550                            }
551                        };
552                    } else if (endsWith) {
553                        // last char
554                        return new RPattern() {
555                            @Override
556                            public boolean isMatch(final CharSequence input) {
557                                return input.length() > 0 &&
558                                       contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
559                            }
560                        };
561                    }
562                }
563            }
564        }
565
566        return new RPattern() {
567            Pattern pattern = Pattern.compile(regex);
568
569            @Override
570            public boolean isMatch(final CharSequence input) {
571                final Matcher matcher = pattern.matcher(input);
572                return matcher.find();
573            }
574        };
575    }
576
577    private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
578        if (prefix.length() > input.length()) {
579            return false;
580        }
581        for (int i = 0; i < prefix.length(); i++) {
582            if (input.charAt(i) != prefix.charAt(i)) {
583                return false;
584            }
585        }
586        return true;
587    }
588
589    private static String stripQuotes(String str) {
590        if (str.startsWith(DOUBLE_QUOTE)) {
591            str = str.substring(1);
592        }
593
594        if (str.endsWith(DOUBLE_QUOTE)) {
595            str = str.substring(0, str.length() - 1);
596        }
597
598        return str;
599    }
600
601    private final RPattern lContext;
602
603    private final String pattern;
604
605    private final PhonemeExpr phoneme;
606
607    private final RPattern rContext;
608
609    /**
610     * Creates a new rule.
611     *
612     * @param pattern
613     *            the pattern
614     * @param lContext
615     *            the left context
616     * @param rContext
617     *            the right context
618     * @param phoneme
619     *            the resulting phoneme
620     */
621    public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
622        this.pattern = pattern;
623        this.lContext = pattern(lContext + "$");
624        this.rContext = pattern("^" + rContext);
625        this.phoneme = phoneme;
626    }
627
628    /**
629     * Gets the left context. This is a regular expression that must match to the left of the pattern.
630     *
631     * @return the left context Pattern
632     */
633    public RPattern getLContext() {
634        return this.lContext;
635    }
636
637    /**
638     * Gets the pattern. This is a string-literal that must exactly match.
639     *
640     * @return the pattern
641     */
642    public String getPattern() {
643        return this.pattern;
644    }
645
646    /**
647     * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
648     *
649     * @return the phoneme
650     */
651    public PhonemeExpr getPhoneme() {
652        return this.phoneme;
653    }
654
655    /**
656     * Gets the right context. This is a regular expression that must match to the right of the pattern.
657     *
658     * @return the right context Pattern
659     */
660    public RPattern getRContext() {
661        return this.rContext;
662    }
663
664    /**
665     * Decides if the pattern and context match the input starting at a position. It is a match if the
666     * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
667     * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
668     *
669     * @param input
670     *            the input String
671     * @param i
672     *            the int position within the input
673     * @return true if the pattern and left/right context match, false otherwise
674     */
675    public boolean patternAndContextMatches(final CharSequence input, final int i) {
676        if (i < 0) {
677            throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
678        }
679
680        final int patternLength = this.pattern.length();
681        final int ipl = i + patternLength;
682
683        if (ipl > input.length()) {
684            // not enough room for the pattern to match
685            return false;
686        }
687
688        // evaluate the pattern, left context and right context
689        // fail early if any of the evaluations is not successful
690        if (!input.subSequence(i, ipl).equals(this.pattern)) {
691            return false;
692        } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
693            return false;
694        }
695        return this.lContext.isMatch(input.subSequence(0, i));
696    }
697}