001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language.bm;
019    
020    import java.io.InputStream;
021    import java.util.ArrayList;
022    import java.util.Arrays;
023    import java.util.Collections;
024    import java.util.Comparator;
025    import java.util.EnumMap;
026    import java.util.HashMap;
027    import java.util.HashSet;
028    import java.util.List;
029    import java.util.Map;
030    import java.util.Scanner;
031    import java.util.Set;
032    import java.util.regex.Matcher;
033    import java.util.regex.Pattern;
034    
035    /**
036     * A phoneme rule.
037     * <p>
038     * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
039     * and a logical flag indicating if all languages must be in play. A rule matches if:
040     * <ul>
041     * <li>the pattern matches at the current position</li>
042     * <li>the string up until the beginning of the pattern matches the left context</li>
043     * <li>the string from the end of the pattern matches the right context</li>
044     * <li>logical is ALL and all languages are in scope; or</li>
045     * <li>logical is any other value and at least one language is in scope</li>
046     * </ul>
047     * <p>
048     * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
049     * to explicitly construct their own.
050     * <p>
051     * Rules are immutable and thread-safe.
052     * <p>
053     * <b>Rules resources</b>
054     * <p>
055     * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
056     * named following the pattern:
057     * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
058     * <p>
059     * The format of these resources is the following:
060     * <ul>
061     * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
062     * will be interpreted as:
063     * <ol>
064     * <li>pattern</li>
065     * <li>left context</li>
066     * <li>right context</li>
067     * <li>phoneme</li>
068     * </ol>
069     * </li>
070     * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
071     * as a comment.</li>
072     * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
073     * all content until a line ending in '*' and '/' is found.</li>
074     * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
075     * </ul>
076     *
077     * @since 1.6
078     * @version $Id: Rule.Phoneme.html 889935 2013-12-11 05:05:13Z ggregory $
079     */
080    public class Rule {
081    
082        public static final class Phoneme implements PhonemeExpr {
083            public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
084                @Override
085                public int compare(Phoneme o1, Phoneme o2) {
086                    for (int i = 0; i < o1.phonemeText.length(); i++) {
087                        if (i >= o2.phonemeText.length()) {
088                            return +1;
089                        }
090                        int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
091                        if (c != 0) {
092                            return c;
093                        }
094                    }
095    
096                    if (o1.phonemeText.length() < o2.phonemeText.length()) {
097                        return -1;
098                    }
099    
100                    return 0;
101                }
102            };
103    
104            private final CharSequence phonemeText;
105            private final Languages.LanguageSet languages;
106    
107            public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) {
108                this.phonemeText = phonemeText;
109                this.languages = languages;
110            }
111    
112            public Phoneme append(CharSequence str) {
113                return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114            }
115    
116            public Languages.LanguageSet getLanguages() {
117                return this.languages;
118            }
119    
120            @Override
121            public Iterable<Phoneme> getPhonemes() {
122                return Collections.singleton(this);
123            }
124    
125            public CharSequence getPhonemeText() {
126                return this.phonemeText;
127            }
128    
129            public Phoneme join(Phoneme right) {
130                return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131                                   this.languages.restrictTo(right.languages));
132            }
133        }
134    
135        public interface PhonemeExpr {
136            Iterable<Phoneme> getPhonemes();
137        }
138    
139        public static final class PhonemeList implements PhonemeExpr {
140            private final List<Phoneme> phonemes;
141    
142            public PhonemeList(List<Phoneme> phonemes) {
143                this.phonemes = phonemes;
144            }
145    
146            @Override
147            public List<Phoneme> getPhonemes() {
148                return this.phonemes;
149            }
150        }
151    
152        /**
153         * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154         */
155        public static interface RPattern {
156            boolean isMatch(CharSequence input);
157        }
158    
159        public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160            @Override
161            public boolean isMatch(CharSequence input) {
162                return true;
163            }
164        };
165    
166        public static final String ALL = "ALL";
167    
168        private static final String DOUBLE_QUOTE = "\"";
169    
170        private static final String HASH_INCLUDE = "#include";
171    
172        private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173                new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174    
175        static {
176            for (NameType s : NameType.values()) {
177                Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
178    
179                for (RuleType rt : RuleType.values()) {
180                    Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
181    
182                    Languages ls = Languages.getInstance(s);
183                    for (String l : ls.getLanguages()) {
184                        try {
185                            rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
186                        } catch (IllegalStateException e) {
187                            throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
188                        }
189                    }
190                    if (!rt.equals(RuleType.RULES)) {
191                        rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
192                    }
193    
194                    rts.put(rt, Collections.unmodifiableMap(rs));
195                }
196    
197                RULES.put(s, Collections.unmodifiableMap(rts));
198            }
199        }
200    
201        private static boolean contains(CharSequence chars, char input) {
202            for (int i = 0; i < chars.length(); i++) {
203                if (chars.charAt(i) == input) {
204                    return true;
205                }
206            }
207            return false;
208        }
209    
210        private static String createResourceName(NameType nameType, RuleType rt, String lang) {
211            return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
212                                 nameType.getName(), rt.getName(), lang);
213        }
214    
215        private static Scanner createScanner(NameType nameType, RuleType rt, String lang) {
216            String resName = createResourceName(nameType, rt, lang);
217            InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
218    
219            if (rulesIS == null) {
220                throw new IllegalArgumentException("Unable to load resource: " + resName);
221            }
222    
223            return new Scanner(rulesIS, ResourceConstants.ENCODING);
224        }
225    
226        private static Scanner createScanner(String lang) {
227            String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
228            InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
229    
230            if (rulesIS == null) {
231                throw new IllegalArgumentException("Unable to load resource: " + resName);
232            }
233    
234            return new Scanner(rulesIS, ResourceConstants.ENCODING);
235        }
236    
237        private static boolean endsWith(CharSequence input, CharSequence suffix) {
238            if (suffix.length() > input.length()) {
239                return false;
240            }
241            for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
242                if (input.charAt(i) != suffix.charAt(j)) {
243                    return false;
244                }
245            }
246            return true;
247        }
248    
249        /**
250         * Gets rules for a combination of name type, rule type and languages.
251         *
252         * @param nameType
253         *            the NameType to consider
254         * @param rt
255         *            the RuleType to consider
256         * @param langs
257         *            the set of languages to consider
258         * @return a list of Rules that apply
259         */
260        public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) {
261            return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
262                                         getInstance(nameType, rt, Languages.ANY);
263        }
264    
265        /**
266         * Gets rules for a combination of name type, rule type and a single language.
267         *
268         * @param nameType
269         *            the NameType to consider
270         * @param rt
271         *            the RuleType to consider
272         * @param lang
273         *            the language to consider
274         * @return a list rules for a combination of name type, rule type and a single language.
275         */
276        public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) {
277            List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
278    
279            if (rules == null) {
280                throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
281                                                   nameType.getName(), rt.getName(), lang));
282            }
283    
284            return rules;
285        }
286    
287        private static Phoneme parsePhoneme(String ph) {
288            int open = ph.indexOf("[");
289            if (open >= 0) {
290                if (!ph.endsWith("]")) {
291                    throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
292                }
293                String before = ph.substring(0, open);
294                String in = ph.substring(open + 1, ph.length() - 1);
295                Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
296    
297                return new Phoneme(before, Languages.LanguageSet.from(langs));
298            } else {
299                return new Phoneme(ph, Languages.ANY_LANGUAGE);
300            }
301        }
302    
303        private static PhonemeExpr parsePhonemeExpr(String ph) {
304            if (ph.startsWith("(")) { // we have a bracketed list of options
305                if (!ph.endsWith(")")) {
306                    throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
307                }
308    
309                List<Phoneme> phs = new ArrayList<Phoneme>();
310                String body = ph.substring(1, ph.length() - 1);
311                for (String part : body.split("[|]")) {
312                    phs.add(parsePhoneme(part));
313                }
314                if (body.startsWith("|") || body.endsWith("|")) {
315                    phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
316                }
317    
318                return new PhonemeList(phs);
319            } else {
320                return parsePhoneme(ph);
321            }
322        }
323    
324        private static List<Rule> parseRules(final Scanner scanner, final String location) {
325            List<Rule> lines = new ArrayList<Rule>();
326            int currentLine = 0;
327    
328            boolean inMultilineComment = false;
329            while (scanner.hasNextLine()) {
330                currentLine++;
331                String rawLine = scanner.nextLine();
332                String line = rawLine;
333    
334                if (inMultilineComment) {
335                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
336                        inMultilineComment = false;
337                    }
338                } else {
339                    if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
340                        inMultilineComment = true;
341                    } else {
342                        // discard comments
343                        int cmtI = line.indexOf(ResourceConstants.CMT);
344                        if (cmtI >= 0) {
345                            line = line.substring(0, cmtI);
346                        }
347    
348                        // trim leading-trailing whitespace
349                        line = line.trim();
350    
351                        if (line.length() == 0) {
352                            continue; // empty lines can be safely skipped
353                        }
354    
355                        if (line.startsWith(HASH_INCLUDE)) {
356                            // include statement
357                            String incl = line.substring(HASH_INCLUDE.length()).trim();
358                            if (incl.contains(" ")) {
359                                throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
360                                                                   location);
361                            } else {
362                                lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
363                            }
364                        } else {
365                            // rule
366                            String[] parts = line.split("\\s+");
367                            if (parts.length != 4) {
368                                throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
369                                                                   " parts: " + rawLine + " in " + location);
370                            } else {
371                                try {
372                                    String pat = stripQuotes(parts[0]);
373                                    String lCon = stripQuotes(parts[1]);
374                                    String rCon = stripQuotes(parts[2]);
375                                    PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
376                                    final int cLine = currentLine;
377                                    Rule r = new Rule(pat, lCon, rCon, ph) {
378                                        private final int myLine = cLine;
379                                        private final String loc = location;
380    
381                                        @Override
382                                        public String toString() {
383                                            final StringBuilder sb = new StringBuilder();
384                                            sb.append("Rule");
385                                            sb.append("{line=").append(myLine);
386                                            sb.append(", loc='").append(loc).append('\'');
387                                            sb.append('}');
388                                            return sb.toString();
389                                        }
390                                    };
391                                    lines.add(r);
392                                } catch (IllegalArgumentException e) {
393                                    throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
394                                                                    location, e);
395                                }
396                            }
397                        }
398                    }
399                }
400            }
401    
402            return lines;
403        }
404    
405        /**
406         * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
407         *
408         * @param regex
409         *            the regular expression to compile
410         * @return an RPattern that will match this regex
411         */
412        private static RPattern pattern(final String regex) {
413            boolean startsWith = regex.startsWith("^");
414            boolean endsWith = regex.endsWith("$");
415            final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
416            boolean boxes = content.contains("[");
417    
418            if (!boxes) {
419                if (startsWith && endsWith) {
420                    // exact match
421                    if (content.length() == 0) {
422                        // empty
423                        return new RPattern() {
424                            @Override
425                            public boolean isMatch(CharSequence input) {
426                                return input.length() == 0;
427                            }
428                        };
429                    } else {
430                        return new RPattern() {
431                            @Override
432                            public boolean isMatch(CharSequence input) {
433                                return input.equals(content);
434                            }
435                        };
436                    }
437                } else if ((startsWith || endsWith) && content.length() == 0) {
438                    // matches every string
439                    return ALL_STRINGS_RMATCHER;
440                } else if (startsWith) {
441                    // matches from start
442                    return new RPattern() {
443                        @Override
444                        public boolean isMatch(CharSequence input) {
445                            return startsWith(input, content);
446                        }
447                    };
448                } else if (endsWith) {
449                    // matches from start
450                    return new RPattern() {
451                        @Override
452                        public boolean isMatch(CharSequence input) {
453                            return endsWith(input, content);
454                        }
455                    };
456                }
457            } else {
458                boolean startsWithBox = content.startsWith("[");
459                boolean endsWithBox = content.endsWith("]");
460    
461                if (startsWithBox && endsWithBox) {
462                    String boxContent = content.substring(1, content.length() - 1);
463                    if (!boxContent.contains("[")) {
464                        // box containing alternatives
465                        boolean negate = boxContent.startsWith("^");
466                        if (negate) {
467                            boxContent = boxContent.substring(1);
468                        }
469                        final String bContent = boxContent;
470                        final boolean shouldMatch = !negate;
471    
472                        if (startsWith && endsWith) {
473                            // exact match
474                            return new RPattern() {
475                                @Override
476                                public boolean isMatch(CharSequence input) {
477                                    return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
478                                }
479                            };
480                        } else if (startsWith) {
481                            // first char
482                            return new RPattern() {
483                                @Override
484                                public boolean isMatch(CharSequence input) {
485                                    return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
486                                }
487                            };
488                        } else if (endsWith) {
489                            // last char
490                            return new RPattern() {
491                                @Override
492                                public boolean isMatch(CharSequence input) {
493                                    return input.length() > 0 &&
494                                           contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
495                                }
496                            };
497                        }
498                    }
499                }
500            }
501    
502            return new RPattern() {
503                Pattern pattern = Pattern.compile(regex);
504    
505                @Override
506                public boolean isMatch(CharSequence input) {
507                    Matcher matcher = pattern.matcher(input);
508                    return matcher.find();
509                }
510            };
511        }
512    
513        private static boolean startsWith(CharSequence input, CharSequence prefix) {
514            if (prefix.length() > input.length()) {
515                return false;
516            }
517            for (int i = 0; i < prefix.length(); i++) {
518                if (input.charAt(i) != prefix.charAt(i)) {
519                    return false;
520                }
521            }
522            return true;
523        }
524    
525        private static String stripQuotes(String str) {
526            if (str.startsWith(DOUBLE_QUOTE)) {
527                str = str.substring(1);
528            }
529    
530            if (str.endsWith(DOUBLE_QUOTE)) {
531                str = str.substring(0, str.length() - 1);
532            }
533    
534            return str;
535        }
536    
537        private final RPattern lContext;
538    
539        private final String pattern;
540    
541        private final PhonemeExpr phoneme;
542    
543        private final RPattern rContext;
544    
545        /**
546         * Creates a new rule.
547         *
548         * @param pattern
549         *            the pattern
550         * @param lContext
551         *            the left context
552         * @param rContext
553         *            the right context
554         * @param phoneme
555         *            the resulting phoneme
556         */
557        public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) {
558            this.pattern = pattern;
559            this.lContext = pattern(lContext + "$");
560            this.rContext = pattern("^" + rContext);
561            this.phoneme = phoneme;
562        }
563    
564        /**
565         * Gets the left context. This is a regular expression that must match to the left of the pattern.
566         *
567         * @return the left context Pattern
568         */
569        public RPattern getLContext() {
570            return this.lContext;
571        }
572    
573        /**
574         * Gets the pattern. This is a string-literal that must exactly match.
575         *
576         * @return the pattern
577         */
578        public String getPattern() {
579            return this.pattern;
580        }
581    
582        /**
583         * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
584         *
585         * @return the phoneme
586         */
587        public PhonemeExpr getPhoneme() {
588            return this.phoneme;
589        }
590    
591        /**
592         * Gets the right context. This is a regular expression that must match to the right of the pattern.
593         *
594         * @return the right context Pattern
595         */
596        public RPattern getRContext() {
597            return this.rContext;
598        }
599    
600        /**
601         * Decides if the pattern and context match the input starting at a position. It is a match if the
602         * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
603         * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
604         *
605         * @param input
606         *            the input String
607         * @param i
608         *            the int position within the input
609         * @return true if the pattern and left/right context match, false otherwise
610         */
611        public boolean patternAndContextMatches(CharSequence input, int i) {
612            if (i < 0) {
613                throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
614            }
615    
616            int patternLength = this.pattern.length();
617            int ipl = i + patternLength;
618    
619            if (ipl > input.length()) {
620                // not enough room for the pattern to match
621                return false;
622            }
623    
624            // evaluate the pattern, left context and right context
625            // fail early if any of the evaluations is not successful
626            if (!input.subSequence(i, ipl).equals(this.pattern)) {
627                return false;
628            } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
629                return false;
630            }
631            return this.lContext.isMatch(input.subSequence(0, i));
632        }
633    }