View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.io.InputStream;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.EnumMap;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Scanner;
31  import java.util.Set;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36  
37  /**
38   * A phoneme rule.
39   * <p>
40   * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
41   * and a logical flag indicating if all languages must be in play. A rule matches if:
42   * <ul>
43   * <li>the pattern matches at the current position</li>
44   * <li>the string up until the beginning of the pattern matches the left context</li>
45   * <li>the string from the end of the pattern matches the right context</li>
46   * <li>logical is ALL and all languages are in scope; or</li>
47   * <li>logical is any other value and at least one language is in scope</li>
48   * </ul>
49   * <p>
50   * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
51   * to explicitly construct their own.
52   * <p>
53   * Rules are immutable and thread-safe.
54   * <p>
55   * <b>Rules resources</b>
56   * <p>
57   * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
58   * named following the pattern:
59   * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
60   * <p>
61   * The format of these resources is the following:
62   * <ul>
63   * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
64   * will be interpreted as:
65   * <ol>
66   * <li>pattern</li>
67   * <li>left context</li>
68   * <li>right context</li>
69   * <li>phoneme</li>
70   * </ol>
71   * </li>
72   * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
73   * as a comment.</li>
74   * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
75   * all content until a line ending in '*' and '/' is found.</li>
76   * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
77   * </ul>
78   *
79   * @since 1.6
80   * @version $Id: Rule.java 1760691 2016-09-14 12:14:26Z jochen $
81   */
82  public class Rule {
83  
84      public static final class Phoneme implements PhonemeExpr {
85          public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
86              @Override
87              public int compare(final Phoneme o1, final Phoneme o2) {
88                  for (int i = 0; i < o1.phonemeText.length(); i++) {
89                      if (i >= o2.phonemeText.length()) {
90                          return +1;
91                      }
92                      final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
93                      if (c != 0) {
94                          return c;
95                      }
96                  }
97  
98                  if (o1.phonemeText.length() < o2.phonemeText.length()) {
99                      return -1;
100                 }
101 
102                 return 0;
103             }
104         };
105 
106         private final StringBuilder phonemeText;
107         private final Languages.LanguageSet languages;
108 
109         public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110             this.phonemeText = new StringBuilder(phonemeText);
111             this.languages = languages;
112         }
113 
114         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115             this(phonemeLeft.phonemeText, phonemeLeft.languages);
116             this.phonemeText.append(phonemeRight.phonemeText);
117         }
118 
119         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120             this(phonemeLeft.phonemeText, languages);
121             this.phonemeText.append(phonemeRight.phonemeText);
122         }
123 
124         public Phoneme append(final CharSequence str) {
125             this.phonemeText.append(str);
126             return this;
127         }
128 
129         public Languages.LanguageSet getLanguages() {
130             return this.languages;
131         }
132 
133         @Override
134         public Iterable<Phoneme> getPhonemes() {
135             return Collections.singleton(this);
136         }
137 
138         public CharSequence getPhonemeText() {
139             return this.phonemeText;
140         }
141 
142         /**
143          * Deprecated since 1.9.
144          *
145          * @param right the Phoneme to join
146          * @return a new Phoneme
147          * @deprecated since 1.9
148          */
149         @Deprecated
150         public Phoneme join(final Phoneme right) {
151             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
152                                this.languages.restrictTo(right.languages));
153         }
154 
155         /**
156          * Returns a new Phoneme with the same text but a union of its
157          * current language set and the given one.
158          *
159          * @param lang the language set to merge
160          * @return a new Phoneme
161          */
162         public Phoneme mergeWithLanguage(final LanguageSet lang) {
163           return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
164         }
165 
166         @Override
167         public String toString() {
168           return phonemeText.toString() + "[" + languages + "]";
169         }
170     }
171 
172     public interface PhonemeExpr {
173         Iterable<Phoneme> getPhonemes();
174     }
175 
176     public static final class PhonemeList implements PhonemeExpr {
177         private final List<Phoneme> phonemes;
178 
179         public PhonemeList(final List<Phoneme> phonemes) {
180             this.phonemes = phonemes;
181         }
182 
183         @Override
184         public List<Phoneme> getPhonemes() {
185             return this.phonemes;
186         }
187     }
188 
189     /**
190      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
191      */
192     public interface RPattern {
193         boolean isMatch(CharSequence input);
194     }
195 
196     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
197         @Override
198         public boolean isMatch(final CharSequence input) {
199             return true;
200         }
201     };
202 
203     public static final String ALL = "ALL";
204 
205     private static final String DOUBLE_QUOTE = "\"";
206 
207     private static final String HASH_INCLUDE = "#include";
208 
209     private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
210             new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
211 
212     static {
213         for (final NameType s : NameType.values()) {
214             final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
215                     new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);
216 
217             for (final RuleType rt : RuleType.values()) {
218                 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();
219 
220                 final Languages ls = Languages.getInstance(s);
221                 for (final String l : ls.getLanguages()) {
222                     final Scanner scanner = createScanner(s, rt, l);
223                     try {
224                         rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
225                     } catch (final IllegalStateException e) {
226                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
227                     } finally {
228                         scanner.close();
229                     }
230                 }
231                 if (!rt.equals(RuleType.RULES)) {
232                     final Scanner scanner = createScanner(s, rt, "common");
233                     try {
234                         rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
235                     } finally {
236                         scanner.close();
237                     }
238                 }
239 
240                 rts.put(rt, Collections.unmodifiableMap(rs));
241             }
242 
243             RULES.put(s, Collections.unmodifiableMap(rts));
244         }
245     }
246 
247     private static boolean contains(final CharSequence chars, final char input) {
248         for (int i = 0; i < chars.length(); i++) {
249             if (chars.charAt(i) == input) {
250                 return true;
251             }
252         }
253         return false;
254     }
255 
256     private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
257         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
258                              nameType.getName(), rt.getName(), lang);
259     }
260 
261     private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
262         final String resName = createResourceName(nameType, rt, lang);
263         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
264 
265         if (rulesIS == null) {
266             throw new IllegalArgumentException("Unable to load resource: " + resName);
267         }
268 
269         return new Scanner(rulesIS, ResourceConstants.ENCODING);
270     }
271 
272     private static Scanner createScanner(final String lang) {
273         final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
274         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
275 
276         if (rulesIS == null) {
277             throw new IllegalArgumentException("Unable to load resource: " + resName);
278         }
279 
280         return new Scanner(rulesIS, ResourceConstants.ENCODING);
281     }
282 
283     private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
284         if (suffix.length() > input.length()) {
285             return false;
286         }
287         for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
288             if (input.charAt(i) != suffix.charAt(j)) {
289                 return false;
290             }
291         }
292         return true;
293     }
294 
295     /**
296      * Gets rules for a combination of name type, rule type and languages.
297      *
298      * @param nameType
299      *            the NameType to consider
300      * @param rt
301      *            the RuleType to consider
302      * @param langs
303      *            the set of languages to consider
304      * @return a list of Rules that apply
305      */
306     public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
307                                          final Languages.LanguageSet langs) {
308         final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
309         final List<Rule> allRules = new ArrayList<Rule>();
310         for (final List<Rule> rules : ruleMap.values()) {
311             allRules.addAll(rules);
312         }
313         return allRules;
314     }
315 
316     /**
317      * Gets rules for a combination of name type, rule type and a single language.
318      *
319      * @param nameType
320      *            the NameType to consider
321      * @param rt
322      *            the RuleType to consider
323      * @param lang
324      *            the language to consider
325      * @return a list of Rules that apply
326      */
327     public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
328         return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
329     }
330 
331     /**
332      * Gets rules for a combination of name type, rule type and languages.
333      *
334      * @param nameType
335      *            the NameType to consider
336      * @param rt
337      *            the RuleType to consider
338      * @param langs
339      *            the set of languages to consider
340      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
341      * @since 1.9
342      */
343     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
344                                                          final Languages.LanguageSet langs) {
345         return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
346                                      getInstanceMap(nameType, rt, Languages.ANY);
347     }
348 
349     /**
350      * Gets rules for a combination of name type, rule type and a single language.
351      *
352      * @param nameType
353      *            the NameType to consider
354      * @param rt
355      *            the RuleType to consider
356      * @param lang
357      *            the language to consider
358      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
359      * @since 1.9
360      */
361     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
362                                                          final String lang) {
363         final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
364 
365         if (rules == null) {
366             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
367                                                nameType.getName(), rt.getName(), lang));
368         }
369 
370         return rules;
371     }
372 
373     private static Phoneme parsePhoneme(final String ph) {
374         final int open = ph.indexOf("[");
375         if (open >= 0) {
376             if (!ph.endsWith("]")) {
377                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
378             }
379             final String before = ph.substring(0, open);
380             final String in = ph.substring(open + 1, ph.length() - 1);
381             final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
382 
383             return new Phoneme(before, Languages.LanguageSet.from(langs));
384         }
385         return new Phoneme(ph, Languages.ANY_LANGUAGE);
386     }
387 
388     private static PhonemeExpr parsePhonemeExpr(final String ph) {
389         if (ph.startsWith("(")) { // we have a bracketed list of options
390             if (!ph.endsWith(")")) {
391                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
392             }
393 
394             final List<Phoneme> phs = new ArrayList<Phoneme>();
395             final String body = ph.substring(1, ph.length() - 1);
396             for (final String part : body.split("[|]")) {
397                 phs.add(parsePhoneme(part));
398             }
399             if (body.startsWith("|") || body.endsWith("|")) {
400                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
401             }
402 
403             return new PhonemeList(phs);
404         }
405         return parsePhoneme(ph);
406     }
407 
408     private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
409         final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
410         int currentLine = 0;
411 
412         boolean inMultilineComment = false;
413         while (scanner.hasNextLine()) {
414             currentLine++;
415             final String rawLine = scanner.nextLine();
416             String line = rawLine;
417 
418             if (inMultilineComment) {
419                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
420                     inMultilineComment = false;
421                 }
422             } else {
423                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
424                     inMultilineComment = true;
425                 } else {
426                     // discard comments
427                     final int cmtI = line.indexOf(ResourceConstants.CMT);
428                     if (cmtI >= 0) {
429                         line = line.substring(0, cmtI);
430                     }
431 
432                     // trim leading-trailing whitespace
433                     line = line.trim();
434 
435                     if (line.length() == 0) {
436                         continue; // empty lines can be safely skipped
437                     }
438 
439                     if (line.startsWith(HASH_INCLUDE)) {
440                         // include statement
441                         final String incl = line.substring(HASH_INCLUDE.length()).trim();
442                         if (incl.contains(" ")) {
443                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
444                                                                location);
445                         }
446                         final Scanner hashIncludeScanner = createScanner(incl);
447                         try {
448                             lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
449                         } finally {
450                             hashIncludeScanner.close();
451                         }
452                     } else {
453                         // rule
454                         final String[] parts = line.split("\\s+");
455                         if (parts.length != 4) {
456                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
457                                                                " parts: " + rawLine + " in " + location);
458                         }
459                         try {
460                             final String pat = stripQuotes(parts[0]);
461                             final String lCon = stripQuotes(parts[1]);
462                             final String rCon = stripQuotes(parts[2]);
463                             final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
464                             final int cLine = currentLine;
465                             final Rule r = new Rule(pat, lCon, rCon, ph) {
466                                 private final int myLine = cLine;
467                                 private final String loc = location;
468 
469                                 @Override
470                                 public String toString() {
471                                     final StringBuilder sb = new StringBuilder();
472                                     sb.append("Rule");
473                                     sb.append("{line=").append(myLine);
474                                     sb.append(", loc='").append(loc).append('\'');
475                                     sb.append(", pat='").append(pat).append('\'');
476                                     sb.append(", lcon='").append(lCon).append('\'');
477                                     sb.append(", rcon='").append(rCon).append('\'');
478                                     sb.append('}');
479                                     return sb.toString();
480                                 }
481                             };
482                             final String patternKey = r.pattern.substring(0,1);
483                             List<Rule> rules = lines.get(patternKey);
484                             if (rules == null) {
485                                 rules = new ArrayList<Rule>();
486                                 lines.put(patternKey, rules);
487                             }
488                             rules.add(r);
489                         } catch (final IllegalArgumentException e) {
490                             throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
491                                                             location, e);
492                         }
493                     }
494                 }
495             }
496         }
497 
498         return lines;
499     }
500 
501     /**
502      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
503      *
504      * @param regex
505      *            the regular expression to compile
506      * @return an RPattern that will match this regex
507      */
508     private static RPattern pattern(final String regex) {
509         final boolean startsWith = regex.startsWith("^");
510         final boolean endsWith = regex.endsWith("$");
511         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
512         final boolean boxes = content.contains("[");
513 
514         if (!boxes) {
515             if (startsWith && endsWith) {
516                 // exact match
517                 if (content.length() == 0) {
518                     // empty
519                     return new RPattern() {
520                         @Override
521                         public boolean isMatch(final CharSequence input) {
522                             return input.length() == 0;
523                         }
524                     };
525                 }
526                 return new RPattern() {
527                     @Override
528                     public boolean isMatch(final CharSequence input) {
529                         return input.equals(content);
530                     }
531                 };
532             } else if ((startsWith || endsWith) && content.length() == 0) {
533                 // matches every string
534                 return ALL_STRINGS_RMATCHER;
535             } else if (startsWith) {
536                 // matches from start
537                 return new RPattern() {
538                     @Override
539                     public boolean isMatch(final CharSequence input) {
540                         return startsWith(input, content);
541                     }
542                 };
543             } else if (endsWith) {
544                 // matches from start
545                 return new RPattern() {
546                     @Override
547                     public boolean isMatch(final CharSequence input) {
548                         return endsWith(input, content);
549                     }
550                 };
551             }
552         } else {
553             final boolean startsWithBox = content.startsWith("[");
554             final boolean endsWithBox = content.endsWith("]");
555 
556             if (startsWithBox && endsWithBox) {
557                 String boxContent = content.substring(1, content.length() - 1);
558                 if (!boxContent.contains("[")) {
559                     // box containing alternatives
560                     final boolean negate = boxContent.startsWith("^");
561                     if (negate) {
562                         boxContent = boxContent.substring(1);
563                     }
564                     final String bContent = boxContent;
565                     final boolean shouldMatch = !negate;
566 
567                     if (startsWith && endsWith) {
568                         // exact match
569                         return new RPattern() {
570                             @Override
571                             public boolean isMatch(final CharSequence input) {
572                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
573                             }
574                         };
575                     } else if (startsWith) {
576                         // first char
577                         return new RPattern() {
578                             @Override
579                             public boolean isMatch(final CharSequence input) {
580                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
581                             }
582                         };
583                     } else if (endsWith) {
584                         // last char
585                         return new RPattern() {
586                             @Override
587                             public boolean isMatch(final CharSequence input) {
588                                 return input.length() > 0 &&
589                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
590                             }
591                         };
592                     }
593                 }
594             }
595         }
596 
597         return new RPattern() {
598             Pattern pattern = Pattern.compile(regex);
599 
600             @Override
601             public boolean isMatch(final CharSequence input) {
602                 final Matcher matcher = pattern.matcher(input);
603                 return matcher.find();
604             }
605         };
606     }
607 
608     private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
609         if (prefix.length() > input.length()) {
610             return false;
611         }
612         for (int i = 0; i < prefix.length(); i++) {
613             if (input.charAt(i) != prefix.charAt(i)) {
614                 return false;
615             }
616         }
617         return true;
618     }
619 
620     private static String stripQuotes(String str) {
621         if (str.startsWith(DOUBLE_QUOTE)) {
622             str = str.substring(1);
623         }
624 
625         if (str.endsWith(DOUBLE_QUOTE)) {
626             str = str.substring(0, str.length() - 1);
627         }
628 
629         return str;
630     }
631 
632     private final RPattern lContext;
633 
634     private final String pattern;
635 
636     private final PhonemeExpr phoneme;
637 
638     private final RPattern rContext;
639 
640     /**
641      * Creates a new rule.
642      *
643      * @param pattern
644      *            the pattern
645      * @param lContext
646      *            the left context
647      * @param rContext
648      *            the right context
649      * @param phoneme
650      *            the resulting phoneme
651      */
652     public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
653         this.pattern = pattern;
654         this.lContext = pattern(lContext + "$");
655         this.rContext = pattern("^" + rContext);
656         this.phoneme = phoneme;
657     }
658 
659     /**
660      * Gets the left context. This is a regular expression that must match to the left of the pattern.
661      *
662      * @return the left context Pattern
663      */
664     public RPattern getLContext() {
665         return this.lContext;
666     }
667 
668     /**
669      * Gets the pattern. This is a string-literal that must exactly match.
670      *
671      * @return the pattern
672      */
673     public String getPattern() {
674         return this.pattern;
675     }
676 
677     /**
678      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
679      *
680      * @return the phoneme
681      */
682     public PhonemeExpr getPhoneme() {
683         return this.phoneme;
684     }
685 
686     /**
687      * Gets the right context. This is a regular expression that must match to the right of the pattern.
688      *
689      * @return the right context Pattern
690      */
691     public RPattern getRContext() {
692         return this.rContext;
693     }
694 
695     /**
696      * Decides if the pattern and context match the input starting at a position. It is a match if the
697      * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
698      * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
699      *
700      * @param input
701      *            the input String
702      * @param i
703      *            the int position within the input
704      * @return true if the pattern and left/right context match, false otherwise
705      */
706     public boolean patternAndContextMatches(final CharSequence input, final int i) {
707         if (i < 0) {
708             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
709         }
710 
711         final int patternLength = this.pattern.length();
712         final int ipl = i + patternLength;
713 
714         if (ipl > input.length()) {
715             // not enough room for the pattern to match
716             return false;
717         }
718 
719         // evaluate the pattern, left context and right context
720         // fail early if any of the evaluations is not successful
721         if (!input.subSequence(i, ipl).equals(this.pattern)) {
722             return false;
723         } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
724             return false;
725         }
726         return this.lContext.isMatch(input.subSequence(0, i));
727     }
728 }