View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.io.InputStream;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.EnumMap;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Scanner;
31  import java.util.Set;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36  
37  /**
38   * A phoneme rule.
39   * <p>
40   * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
41   * and a logical flag indicating if all languages must be in play. A rule matches if:
42   * <ul>
43   * <li>the pattern matches at the current position</li>
44   * <li>the string up until the beginning of the pattern matches the left context</li>
45   * <li>the string from the end of the pattern matches the right context</li>
46   * <li>logical is ALL and all languages are in scope; or</li>
47   * <li>logical is any other value and at least one language is in scope</li>
48   * </ul>
49   * <p>
50   * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
51   * to explicitly construct their own.
52   * <p>
53   * Rules are immutable and thread-safe.
54   * <p>
55   * <b>Rules resources</b>
56   * <p>
57   * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
58   * named following the pattern:
59   * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
60   * <p>
61   * The format of these resources is the following:
62   * <ul>
63   * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
64   * will be interpreted as:
65   * <ol>
66   * <li>pattern</li>
67   * <li>left context</li>
68   * <li>right context</li>
69   * <li>phoneme</li>
70   * </ol>
71   * </li>
72   * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
73   * as a comment.</li>
74   * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
75   * all content until a line ending in '*' and '/' is found.</li>
76   * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
77   * </ul>
78   *
79   * @since 1.6
80   * @version $Id: Rule.html 891688 2013-12-24 20:49:46Z ggregory $
81   */
82  public class Rule {
83  
84      public static final class Phoneme implements PhonemeExpr {
85          public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
86              @Override
87              public int compare(final Phoneme o1, final Phoneme o2) {
88                  for (int i = 0; i < o1.phonemeText.length(); i++) {
89                      if (i >= o2.phonemeText.length()) {
90                          return +1;
91                      }
92                      final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
93                      if (c != 0) {
94                          return c;
95                      }
96                  }
97  
98                  if (o1.phonemeText.length() < o2.phonemeText.length()) {
99                      return -1;
100                 }
101 
102                 return 0;
103             }
104         };
105 
106         private final StringBuilder phonemeText;
107         private final Languages.LanguageSet languages;
108 
109         public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
110             this.phonemeText = new StringBuilder(phonemeText);
111             this.languages = languages;
112         }
113 
114         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
115             this(phonemeLeft.phonemeText, phonemeLeft.languages);
116             this.phonemeText.append(phonemeRight.phonemeText);
117         }
118 
119         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
120             this(phonemeLeft.phonemeText, languages);
121             this.phonemeText.append(phonemeRight.phonemeText);
122         }
123 
124         public Phoneme append(final CharSequence str) {
125             this.phonemeText.append(str);
126             return this;
127         }
128 
129         public Languages.LanguageSet getLanguages() {
130             return this.languages;
131         }
132 
133         @Override
134         public Iterable<Phoneme> getPhonemes() {
135             return Collections.singleton(this);
136         }
137 
138         public CharSequence getPhonemeText() {
139             return this.phonemeText;
140         }
141 
142         /**
143          * @deprecated since 1.9
144          */
145         @Deprecated
146         public Phoneme join(final Phoneme right) {
147             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
148                                this.languages.restrictTo(right.languages));
149         }
150     }
151 
152     public interface PhonemeExpr {
153         Iterable<Phoneme> getPhonemes();
154     }
155 
156     public static final class PhonemeList implements PhonemeExpr {
157         private final List<Phoneme> phonemes;
158 
159         public PhonemeList(final List<Phoneme> phonemes) {
160             this.phonemes = phonemes;
161         }
162 
163         @Override
164         public List<Phoneme> getPhonemes() {
165             return this.phonemes;
166         }
167     }
168 
169     /**
170      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
171      */
172     public interface RPattern {
173         boolean isMatch(CharSequence input);
174     }
175 
176     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
177         @Override
178         public boolean isMatch(final CharSequence input) {
179             return true;
180         }
181     };
182 
183     public static final String ALL = "ALL";
184 
185     private static final String DOUBLE_QUOTE = "\"";
186 
187     private static final String HASH_INCLUDE = "#include";
188 
189     private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
190             new EnumMap<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>>(NameType.class);
191 
192     static {
193         for (final NameType s : NameType.values()) {
194             final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
195                     new EnumMap<RuleType, Map<String, Map<String, List<Rule>>>>(RuleType.class);
196 
197             for (final RuleType rt : RuleType.values()) {
198                 final Map<String, Map<String, List<Rule>>> rs = new HashMap<String, Map<String, List<Rule>>>();
199 
200                 final Languages ls = Languages.getInstance(s);
201                 for (final String l : ls.getLanguages()) {
202                     try {
203                         rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
204                     } catch (final IllegalStateException e) {
205                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
206                     }
207                 }
208                 if (!rt.equals(RuleType.RULES)) {
209                     rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
210                 }
211 
212                 rts.put(rt, Collections.unmodifiableMap(rs));
213             }
214 
215             RULES.put(s, Collections.unmodifiableMap(rts));
216         }
217     }
218 
219     private static boolean contains(final CharSequence chars, final char input) {
220         for (int i = 0; i < chars.length(); i++) {
221             if (chars.charAt(i) == input) {
222                 return true;
223             }
224         }
225         return false;
226     }
227 
228     private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
229         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
230                              nameType.getName(), rt.getName(), lang);
231     }
232 
233     private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
234         final String resName = createResourceName(nameType, rt, lang);
235         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
236 
237         if (rulesIS == null) {
238             throw new IllegalArgumentException("Unable to load resource: " + resName);
239         }
240 
241         return new Scanner(rulesIS, ResourceConstants.ENCODING);
242     }
243 
244     private static Scanner createScanner(final String lang) {
245         final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
246         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
247 
248         if (rulesIS == null) {
249             throw new IllegalArgumentException("Unable to load resource: " + resName);
250         }
251 
252         return new Scanner(rulesIS, ResourceConstants.ENCODING);
253     }
254 
255     private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
256         if (suffix.length() > input.length()) {
257             return false;
258         }
259         for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
260             if (input.charAt(i) != suffix.charAt(j)) {
261                 return false;
262             }
263         }
264         return true;
265     }
266 
267     /**
268      * Gets rules for a combination of name type, rule type and languages.
269      *
270      * @param nameType
271      *            the NameType to consider
272      * @param rt
273      *            the RuleType to consider
274      * @param langs
275      *            the set of languages to consider
276      * @return a list of Rules that apply
277      */
278     public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
279                                          final Languages.LanguageSet langs) {
280         final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
281         final List<Rule> allRules = new ArrayList<Rule>();
282         for (final List<Rule> rules : ruleMap.values()) {
283             allRules.addAll(rules);
284         }
285         return allRules;
286     }
287 
288     /**
289      * Gets rules for a combination of name type, rule type and a single language.
290      *
291      * @param nameType
292      *            the NameType to consider
293      * @param rt
294      *            the RuleType to consider
295      * @param lang
296      *            the language to consider
297      * @return a list of Rules that apply
298      */
299     public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
300         return getInstance(nameType, rt, LanguageSet.from(new HashSet<String>(Arrays.asList(lang))));
301     }
302 
303     /**
304      * Gets rules for a combination of name type, rule type and languages.
305      *
306      * @param nameType
307      *            the NameType to consider
308      * @param rt
309      *            the RuleType to consider
310      * @param langs
311      *            the set of languages to consider
312      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
313      * @since 1.9
314      */
315     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
316                                                          final Languages.LanguageSet langs) {
317         return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
318                                      getInstanceMap(nameType, rt, Languages.ANY);
319     }
320 
321     /**
322      * Gets rules for a combination of name type, rule type and a single language.
323      *
324      * @param nameType
325      *            the NameType to consider
326      * @param rt
327      *            the RuleType to consider
328      * @param lang
329      *            the language to consider
330      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
331      * @since 1.9
332      */
333     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
334                                                          final String lang) {
335         final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
336 
337         if (rules == null) {
338             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
339                                                nameType.getName(), rt.getName(), lang));
340         }
341 
342         return rules;
343     }
344 
345     private static Phoneme parsePhoneme(final String ph) {
346         final int open = ph.indexOf("[");
347         if (open >= 0) {
348             if (!ph.endsWith("]")) {
349                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
350             }
351             final String before = ph.substring(0, open);
352             final String in = ph.substring(open + 1, ph.length() - 1);
353             final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
354 
355             return new Phoneme(before, Languages.LanguageSet.from(langs));
356         } else {
357             return new Phoneme(ph, Languages.ANY_LANGUAGE);
358         }
359     }
360 
361     private static PhonemeExpr parsePhonemeExpr(final String ph) {
362         if (ph.startsWith("(")) { // we have a bracketed list of options
363             if (!ph.endsWith(")")) {
364                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
365             }
366 
367             final List<Phoneme> phs = new ArrayList<Phoneme>();
368             final String body = ph.substring(1, ph.length() - 1);
369             for (final String part : body.split("[|]")) {
370                 phs.add(parsePhoneme(part));
371             }
372             if (body.startsWith("|") || body.endsWith("|")) {
373                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
374             }
375 
376             return new PhonemeList(phs);
377         } else {
378             return parsePhoneme(ph);
379         }
380     }
381 
382     private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
383         final Map<String, List<Rule>> lines = new HashMap<String, List<Rule>>();
384         int currentLine = 0;
385 
386         boolean inMultilineComment = false;
387         while (scanner.hasNextLine()) {
388             currentLine++;
389             final String rawLine = scanner.nextLine();
390             String line = rawLine;
391 
392             if (inMultilineComment) {
393                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
394                     inMultilineComment = false;
395                 }
396             } else {
397                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
398                     inMultilineComment = true;
399                 } else {
400                     // discard comments
401                     final int cmtI = line.indexOf(ResourceConstants.CMT);
402                     if (cmtI >= 0) {
403                         line = line.substring(0, cmtI);
404                     }
405 
406                     // trim leading-trailing whitespace
407                     line = line.trim();
408 
409                     if (line.length() == 0) {
410                         continue; // empty lines can be safely skipped
411                     }
412 
413                     if (line.startsWith(HASH_INCLUDE)) {
414                         // include statement
415                         final String incl = line.substring(HASH_INCLUDE.length()).trim();
416                         if (incl.contains(" ")) {
417                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
418                                                                location);
419                         } else {
420                             lines.putAll(parseRules(createScanner(incl), location + "->" + incl));
421                         }
422                     } else {
423                         // rule
424                         final String[] parts = line.split("\\s+");
425                         if (parts.length != 4) {
426                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
427                                                                " parts: " + rawLine + " in " + location);
428                         } else {
429                             try {
430                                 final String pat = stripQuotes(parts[0]);
431                                 final String lCon = stripQuotes(parts[1]);
432                                 final String rCon = stripQuotes(parts[2]);
433                                 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
434                                 final int cLine = currentLine;
435                                 final Rule r = new Rule(pat, lCon, rCon, ph) {
436                                     private final int myLine = cLine;
437                                     private final String loc = location;
438 
439                                     @Override
440                                     public String toString() {
441                                         final StringBuilder sb = new StringBuilder();
442                                         sb.append("Rule");
443                                         sb.append("{line=").append(myLine);
444                                         sb.append(", loc='").append(loc).append('\'');
445                                         sb.append('}');
446                                         return sb.toString();
447                                     }
448                                 };
449                                 final String patternKey = r.pattern.substring(0,1);
450                                 List<Rule> rules = lines.get(patternKey);
451                                 if (rules == null) {
452                                     rules = new ArrayList<Rule>();
453                                     lines.put(patternKey, rules);
454                                 }
455                                 rules.add(r);
456                             } catch (final IllegalArgumentException e) {
457                                 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
458                                                                 location, e);
459                             }
460                         }
461                     }
462                 }
463             }
464         }
465 
466         return lines;
467     }
468 
469     /**
470      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
471      *
472      * @param regex
473      *            the regular expression to compile
474      * @return an RPattern that will match this regex
475      */
476     private static RPattern pattern(final String regex) {
477         final boolean startsWith = regex.startsWith("^");
478         final boolean endsWith = regex.endsWith("$");
479         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
480         final boolean boxes = content.contains("[");
481 
482         if (!boxes) {
483             if (startsWith && endsWith) {
484                 // exact match
485                 if (content.length() == 0) {
486                     // empty
487                     return new RPattern() {
488                         @Override
489                         public boolean isMatch(final CharSequence input) {
490                             return input.length() == 0;
491                         }
492                     };
493                 } else {
494                     return new RPattern() {
495                         @Override
496                         public boolean isMatch(final CharSequence input) {
497                             return input.equals(content);
498                         }
499                     };
500                 }
501             } else if ((startsWith || endsWith) && content.length() == 0) {
502                 // matches every string
503                 return ALL_STRINGS_RMATCHER;
504             } else if (startsWith) {
505                 // matches from start
506                 return new RPattern() {
507                     @Override
508                     public boolean isMatch(final CharSequence input) {
509                         return startsWith(input, content);
510                     }
511                 };
512             } else if (endsWith) {
513                 // matches from start
514                 return new RPattern() {
515                     @Override
516                     public boolean isMatch(final CharSequence input) {
517                         return endsWith(input, content);
518                     }
519                 };
520             }
521         } else {
522             final boolean startsWithBox = content.startsWith("[");
523             final boolean endsWithBox = content.endsWith("]");
524 
525             if (startsWithBox && endsWithBox) {
526                 String boxContent = content.substring(1, content.length() - 1);
527                 if (!boxContent.contains("[")) {
528                     // box containing alternatives
529                     final boolean negate = boxContent.startsWith("^");
530                     if (negate) {
531                         boxContent = boxContent.substring(1);
532                     }
533                     final String bContent = boxContent;
534                     final boolean shouldMatch = !negate;
535 
536                     if (startsWith && endsWith) {
537                         // exact match
538                         return new RPattern() {
539                             @Override
540                             public boolean isMatch(final CharSequence input) {
541                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
542                             }
543                         };
544                     } else if (startsWith) {
545                         // first char
546                         return new RPattern() {
547                             @Override
548                             public boolean isMatch(final CharSequence input) {
549                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
550                             }
551                         };
552                     } else if (endsWith) {
553                         // last char
554                         return new RPattern() {
555                             @Override
556                             public boolean isMatch(final CharSequence input) {
557                                 return input.length() > 0 &&
558                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
559                             }
560                         };
561                     }
562                 }
563             }
564         }
565 
566         return new RPattern() {
567             Pattern pattern = Pattern.compile(regex);
568 
569             @Override
570             public boolean isMatch(final CharSequence input) {
571                 final Matcher matcher = pattern.matcher(input);
572                 return matcher.find();
573             }
574         };
575     }
576 
577     private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
578         if (prefix.length() > input.length()) {
579             return false;
580         }
581         for (int i = 0; i < prefix.length(); i++) {
582             if (input.charAt(i) != prefix.charAt(i)) {
583                 return false;
584             }
585         }
586         return true;
587     }
588 
589     private static String stripQuotes(String str) {
590         if (str.startsWith(DOUBLE_QUOTE)) {
591             str = str.substring(1);
592         }
593 
594         if (str.endsWith(DOUBLE_QUOTE)) {
595             str = str.substring(0, str.length() - 1);
596         }
597 
598         return str;
599     }
600 
601     private final RPattern lContext;
602 
603     private final String pattern;
604 
605     private final PhonemeExpr phoneme;
606 
607     private final RPattern rContext;
608 
609     /**
610      * Creates a new rule.
611      *
612      * @param pattern
613      *            the pattern
614      * @param lContext
615      *            the left context
616      * @param rContext
617      *            the right context
618      * @param phoneme
619      *            the resulting phoneme
620      */
621     public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
622         this.pattern = pattern;
623         this.lContext = pattern(lContext + "$");
624         this.rContext = pattern("^" + rContext);
625         this.phoneme = phoneme;
626     }
627 
628     /**
629      * Gets the left context. This is a regular expression that must match to the left of the pattern.
630      *
631      * @return the left context Pattern
632      */
633     public RPattern getLContext() {
634         return this.lContext;
635     }
636 
637     /**
638      * Gets the pattern. This is a string-literal that must exactly match.
639      *
640      * @return the pattern
641      */
642     public String getPattern() {
643         return this.pattern;
644     }
645 
646     /**
647      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
648      *
649      * @return the phoneme
650      */
651     public PhonemeExpr getPhoneme() {
652         return this.phoneme;
653     }
654 
655     /**
656      * Gets the right context. This is a regular expression that must match to the right of the pattern.
657      *
658      * @return the right context Pattern
659      */
660     public RPattern getRContext() {
661         return this.rContext;
662     }
663 
664     /**
665      * Decides if the pattern and context match the input starting at a position. It is a match if the
666      * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
667      * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
668      *
669      * @param input
670      *            the input String
671      * @param i
672      *            the int position within the input
673      * @return true if the pattern and left/right context match, false otherwise
674      */
675     public boolean patternAndContextMatches(final CharSequence input, final int i) {
676         if (i < 0) {
677             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
678         }
679 
680         final int patternLength = this.pattern.length();
681         final int ipl = i + patternLength;
682 
683         if (ipl > input.length()) {
684             // not enough room for the pattern to match
685             return false;
686         }
687 
688         // evaluate the pattern, left context and right context
689         // fail early if any of the evaluations is not successful
690         if (!input.subSequence(i, ipl).equals(this.pattern)) {
691             return false;
692         } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
693             return false;
694         }
695         return this.lContext.isMatch(input.subSequence(0, i));
696     }
697 }