001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language.bm;
019    
020    import java.io.InputStream;
021    import java.util.ArrayList;
022    import java.util.Arrays;
023    import java.util.Collections;
024    import java.util.Comparator;
025    import java.util.EnumMap;
026    import java.util.HashMap;
027    import java.util.HashSet;
028    import java.util.List;
029    import java.util.Map;
030    import java.util.Scanner;
031    import java.util.Set;
032    import java.util.regex.Matcher;
033    import java.util.regex.Pattern;
034    
035    /**
036     * A phoneme rule.
037     * <p>
038     * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
039     * and a logical flag indicating if all languages must be in play. A rule matches if:
040     * <ul>
041     * <li>the pattern matches at the current position</li>
042     * <li>the string up until the beginning of the pattern matches the left context</li>
043     * <li>the string from the end of the pattern matches the right context</li>
044     * <li>logical is ALL and all languages are in scope; or</li>
045     * <li>logical is any other value and at least one language is in scope</li>
046     * </ul>
047     * <p>
048     * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
049     * to explicitly construct their own.
050     * <p>
051     * Rules are immutable and thread-safe.
052     * <p>
053     * <b>Rules resources</b>
054     * <p>
055     * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
056     * named following the pattern:
057     * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
058     * <p>
059     * The format of these resources is the following:
060     * <ul>
061     * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
062     * will be interpreted as:
063     * <ol>
064     * <li>pattern</li>
065     * <li>left context</li>
066     * <li>right context</li>
067     * <li>phoneme</li>
068     * </ol>
069     * </li>
070     * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
071     * as a comment.</li>
072     * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
073     * all content until a line ending in '*' and '/' is found.</li>
074     * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
075     * </ul>
076     *
077     * @since 1.6
078     * @version $Id: Rule.RPattern.html 889935 2013-12-11 05:05:13Z ggregory $
079     */
080    public class Rule {
081    
082        public static final class Phoneme implements PhonemeExpr {
083            public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
084                @Override
085                public int compare(final Phoneme o1, final Phoneme o2) {
086                    for (int i = 0; i < o1.phonemeText.length(); i++) {
087                        if (i >= o2.phonemeText.length()) {
088                            return +1;
089                        }
090                        final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
091                        if (c != 0) {
092                            return c;
093                        }
094                    }
095    
096                    if (o1.phonemeText.length() < o2.phonemeText.length()) {
097                        return -1;
098                    }
099    
100                    return 0;
101                }
102            };
103    
104            private final CharSequence phonemeText;
105            private final Languages.LanguageSet languages;
106    
107            public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
108                this.phonemeText = phonemeText;
109                this.languages = languages;
110            }
111    
112            public Phoneme append(final CharSequence str) {
113                return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114            }
115    
116            public Languages.LanguageSet getLanguages() {
117                return this.languages;
118            }
119    
120            @Override
121            public Iterable<Phoneme> getPhonemes() {
122                return Collections.singleton(this);
123            }
124    
125            public CharSequence getPhonemeText() {
126                return this.phonemeText;
127            }
128    
129            public Phoneme join(final Phoneme right) {
130                return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131                                   this.languages.restrictTo(right.languages));
132            }
133        }
134    
135        public interface PhonemeExpr {
136            Iterable<Phoneme> getPhonemes();
137        }
138    
139        public static final class PhonemeList implements PhonemeExpr {
140            private final List<Phoneme> phonemes;
141    
142            public PhonemeList(final List<Phoneme> phonemes) {
143                this.phonemes = phonemes;
144            }
145    
146            @Override
147            public List<Phoneme> getPhonemes() {
148                return this.phonemes;
149            }
150        }
151    
152        /**
153         * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154         */
155        public static interface RPattern {
156            boolean isMatch(CharSequence input);
157        }
158    
159        public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160            @Override
161            public boolean isMatch(final CharSequence input) {
162                return true;
163            }
164        };
165    
166        public static final String ALL = "ALL";
167    
168        private static final String DOUBLE_QUOTE = "\"";
169    
170        private static final String HASH_INCLUDE = "#include";
171    
172        private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173                new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174    
175        static {
176            for (final NameType s : NameType.values()) {
177                final Map<RuleType, Map<String, List<Rule>>> rts =
178                        new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
179    
180                for (final RuleType rt : RuleType.values()) {
181                    final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
182    
183                    final Languages ls = Languages.getInstance(s);
184                    for (final String l : ls.getLanguages()) {
185                        try {
186                            rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
187                        } catch (final IllegalStateException e) {
188                            throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
189                        }
190                    }
191                    if (!rt.equals(RuleType.RULES)) {
192                        rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
193                    }
194    
195                    rts.put(rt, Collections.unmodifiableMap(rs));
196                }
197    
198                RULES.put(s, Collections.unmodifiableMap(rts));
199            }
200        }
201    
202        private static boolean contains(final CharSequence chars, final char input) {
203            for (int i = 0; i < chars.length(); i++) {
204                if (chars.charAt(i) == input) {
205                    return true;
206                }
207            }
208            return false;
209        }
210    
211        private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
212            return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
213                                 nameType.getName(), rt.getName(), lang);
214        }
215    
216        private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
217            final String resName = createResourceName(nameType, rt, lang);
218            final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
219    
220            if (rulesIS == null) {
221                throw new IllegalArgumentException("Unable to load resource: " + resName);
222            }
223    
224            return new Scanner(rulesIS, ResourceConstants.ENCODING);
225        }
226    
227        private static Scanner createScanner(final String lang) {
228            final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
229            final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
230    
231            if (rulesIS == null) {
232                throw new IllegalArgumentException("Unable to load resource: " + resName);
233            }
234    
235            return new Scanner(rulesIS, ResourceConstants.ENCODING);
236        }
237    
238        private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
239            if (suffix.length() > input.length()) {
240                return false;
241            }
242            for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
243                if (input.charAt(i) != suffix.charAt(j)) {
244                    return false;
245                }
246            }
247            return true;
248        }
249    
250        /**
251         * Gets rules for a combination of name type, rule type and languages.
252         *
253         * @param nameType
254         *            the NameType to consider
255         * @param rt
256         *            the RuleType to consider
257         * @param langs
258         *            the set of languages to consider
259         * @return a list of Rules that apply
260         */
261        public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
262                                             final Languages.LanguageSet langs) {
263            return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
264                                         getInstance(nameType, rt, Languages.ANY);
265        }
266    
267        /**
268         * Gets rules for a combination of name type, rule type and a single language.
269         *
270         * @param nameType
271         *            the NameType to consider
272         * @param rt
273         *            the RuleType to consider
274         * @param lang
275         *            the language to consider
276         * @return a list rules for a combination of name type, rule type and a single language.
277         */
278        public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
279            final List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
280    
281            if (rules == null) {
282                throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
283                                                   nameType.getName(), rt.getName(), lang));
284            }
285    
286            return rules;
287        }
288    
289        private static Phoneme parsePhoneme(final String ph) {
290            final int open = ph.indexOf("[");
291            if (open >= 0) {
292                if (!ph.endsWith("]")) {
293                    throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
294                }
295                final String before = ph.substring(0, open);
296                final String in = ph.substring(open + 1, ph.length() - 1);
297                final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
298    
299                return new Phoneme(before, Languages.LanguageSet.from(langs));
300            } else {
301                return new Phoneme(ph, Languages.ANY_LANGUAGE);
302            }
303        }
304    
305        private static PhonemeExpr parsePhonemeExpr(final String ph) {
306            if (ph.startsWith("(")) { // we have a bracketed list of options
307                if (!ph.endsWith(")")) {
308                    throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
309                }
310    
311                final List<Phoneme> phs = new ArrayList<Phoneme>();
312                final String body = ph.substring(1, ph.length() - 1);
313                for (final String part : body.split("[|]")) {
314                    phs.add(parsePhoneme(part));
315                }
316                if (body.startsWith("|") || body.endsWith("|")) {
317                    phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
318                }
319    
320                return new PhonemeList(phs);
321            } else {
322                return parsePhoneme(ph);
323            }
324        }
325    
326        private static List<Rule> parseRules(final Scanner scanner, final String location) {
327            final List<Rule> lines = new ArrayList<Rule>();
328            int currentLine = 0;
329    
330            boolean inMultilineComment = false;
331            while (scanner.hasNextLine()) {
332                currentLine++;
333                final String rawLine = scanner.nextLine();
334                String line = rawLine;
335    
336                if (inMultilineComment) {
337                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
338                        inMultilineComment = false;
339                    }
340                } else {
341                    if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
342                        inMultilineComment = true;
343                    } else {
344                        // discard comments
345                        final int cmtI = line.indexOf(ResourceConstants.CMT);
346                        if (cmtI >= 0) {
347                            line = line.substring(0, cmtI);
348                        }
349    
350                        // trim leading-trailing whitespace
351                        line = line.trim();
352    
353                        if (line.length() == 0) {
354                            continue; // empty lines can be safely skipped
355                        }
356    
357                        if (line.startsWith(HASH_INCLUDE)) {
358                            // include statement
359                            final String incl = line.substring(HASH_INCLUDE.length()).trim();
360                            if (incl.contains(" ")) {
361                                throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
362                                                                   location);
363                            } else {
364                                lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
365                            }
366                        } else {
367                            // rule
368                            final String[] parts = line.split("\\s+");
369                            if (parts.length != 4) {
370                                throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
371                                                                   " parts: " + rawLine + " in " + location);
372                            } else {
373                                try {
374                                    final String pat = stripQuotes(parts[0]);
375                                    final String lCon = stripQuotes(parts[1]);
376                                    final String rCon = stripQuotes(parts[2]);
377                                    final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
378                                    final int cLine = currentLine;
379                                    final Rule r = new Rule(pat, lCon, rCon, ph) {
380                                        private final int myLine = cLine;
381                                        private final String loc = location;
382    
383                                        @Override
384                                        public String toString() {
385                                            final StringBuilder sb = new StringBuilder();
386                                            sb.append("Rule");
387                                            sb.append("{line=").append(myLine);
388                                            sb.append(", loc='").append(loc).append('\'');
389                                            sb.append('}');
390                                            return sb.toString();
391                                        }
392                                    };
393                                    lines.add(r);
394                                } catch (final IllegalArgumentException e) {
395                                    throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
396                                                                    location, e);
397                                }
398                            }
399                        }
400                    }
401                }
402            }
403    
404            return lines;
405        }
406    
407        /**
408         * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
409         *
410         * @param regex
411         *            the regular expression to compile
412         * @return an RPattern that will match this regex
413         */
414        private static RPattern pattern(final String regex) {
415            final boolean startsWith = regex.startsWith("^");
416            final boolean endsWith = regex.endsWith("$");
417            final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
418            final boolean boxes = content.contains("[");
419    
420            if (!boxes) {
421                if (startsWith && endsWith) {
422                    // exact match
423                    if (content.length() == 0) {
424                        // empty
425                        return new RPattern() {
426                            @Override
427                            public boolean isMatch(final CharSequence input) {
428                                return input.length() == 0;
429                            }
430                        };
431                    } else {
432                        return new RPattern() {
433                            @Override
434                            public boolean isMatch(final CharSequence input) {
435                                return input.equals(content);
436                            }
437                        };
438                    }
439                } else if ((startsWith || endsWith) && content.length() == 0) {
440                    // matches every string
441                    return ALL_STRINGS_RMATCHER;
442                } else if (startsWith) {
443                    // matches from start
444                    return new RPattern() {
445                        @Override
446                        public boolean isMatch(final CharSequence input) {
447                            return startsWith(input, content);
448                        }
449                    };
450                } else if (endsWith) {
451                    // matches from start
452                    return new RPattern() {
453                        @Override
454                        public boolean isMatch(final CharSequence input) {
455                            return endsWith(input, content);
456                        }
457                    };
458                }
459            } else {
460                final boolean startsWithBox = content.startsWith("[");
461                final boolean endsWithBox = content.endsWith("]");
462    
463                if (startsWithBox && endsWithBox) {
464                    String boxContent = content.substring(1, content.length() - 1);
465                    if (!boxContent.contains("[")) {
466                        // box containing alternatives
467                        final boolean negate = boxContent.startsWith("^");
468                        if (negate) {
469                            boxContent = boxContent.substring(1);
470                        }
471                        final String bContent = boxContent;
472                        final boolean shouldMatch = !negate;
473    
474                        if (startsWith && endsWith) {
475                            // exact match
476                            return new RPattern() {
477                                @Override
478                                public boolean isMatch(final CharSequence input) {
479                                    return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
480                                }
481                            };
482                        } else if (startsWith) {
483                            // first char
484                            return new RPattern() {
485                                @Override
486                                public boolean isMatch(final CharSequence input) {
487                                    return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
488                                }
489                            };
490                        } else if (endsWith) {
491                            // last char
492                            return new RPattern() {
493                                @Override
494                                public boolean isMatch(final CharSequence input) {
495                                    return input.length() > 0 &&
496                                           contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
497                                }
498                            };
499                        }
500                    }
501                }
502            }
503    
504            return new RPattern() {
505                Pattern pattern = Pattern.compile(regex);
506    
507                @Override
508                public boolean isMatch(final CharSequence input) {
509                    final Matcher matcher = pattern.matcher(input);
510                    return matcher.find();
511                }
512            };
513        }
514    
515        private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
516            if (prefix.length() > input.length()) {
517                return false;
518            }
519            for (int i = 0; i < prefix.length(); i++) {
520                if (input.charAt(i) != prefix.charAt(i)) {
521                    return false;
522                }
523            }
524            return true;
525        }
526    
527        private static String stripQuotes(String str) {
528            if (str.startsWith(DOUBLE_QUOTE)) {
529                str = str.substring(1);
530            }
531    
532            if (str.endsWith(DOUBLE_QUOTE)) {
533                str = str.substring(0, str.length() - 1);
534            }
535    
536            return str;
537        }
538    
539        private final RPattern lContext;
540    
541        private final String pattern;
542    
543        private final PhonemeExpr phoneme;
544    
545        private final RPattern rContext;
546    
547        /**
548         * Creates a new rule.
549         *
550         * @param pattern
551         *            the pattern
552         * @param lContext
553         *            the left context
554         * @param rContext
555         *            the right context
556         * @param phoneme
557         *            the resulting phoneme
558         */
559        public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
560            this.pattern = pattern;
561            this.lContext = pattern(lContext + "$");
562            this.rContext = pattern("^" + rContext);
563            this.phoneme = phoneme;
564        }
565    
566        /**
567         * Gets the left context. This is a regular expression that must match to the left of the pattern.
568         *
569         * @return the left context Pattern
570         */
571        public RPattern getLContext() {
572            return this.lContext;
573        }
574    
575        /**
576         * Gets the pattern. This is a string-literal that must exactly match.
577         *
578         * @return the pattern
579         */
580        public String getPattern() {
581            return this.pattern;
582        }
583    
584        /**
585         * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
586         *
587         * @return the phoneme
588         */
589        public PhonemeExpr getPhoneme() {
590            return this.phoneme;
591        }
592    
593        /**
594         * Gets the right context. This is a regular expression that must match to the right of the pattern.
595         *
596         * @return the right context Pattern
597         */
598        public RPattern getRContext() {
599            return this.rContext;
600        }
601    
602        /**
603         * Decides if the pattern and context match the input starting at a position. It is a match if the
604         * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
605         * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
606         *
607         * @param input
608         *            the input String
609         * @param i
610         *            the int position within the input
611         * @return true if the pattern and left/right context match, false otherwise
612         */
613        public boolean patternAndContextMatches(final CharSequence input, final int i) {
614            if (i < 0) {
615                throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
616            }
617    
618            final int patternLength = this.pattern.length();
619            final int ipl = i + patternLength;
620    
621            if (ipl > input.length()) {
622                // not enough room for the pattern to match
623                return false;
624            }
625    
626            // evaluate the pattern, left context and right context
627            // fail early if any of the evaluations is not successful
628            if (!input.subSequence(i, ipl).equals(this.pattern)) {
629                return false;
630            } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
631                return false;
632            }
633            return this.lContext.isMatch(input.subSequence(0, i));
634        }
635    }