View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Collections;
23  import java.util.Comparator;
24  import java.util.EnumMap;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Scanner;
30  import java.util.Set;
31  import java.util.regex.Matcher;
32  import java.util.regex.Pattern;
33  
34  import org.apache.commons.codec.Resources;
35  import org.apache.commons.codec.language.bm.Languages.LanguageSet;
36  
37  /**
38   * A phoneme rule.
39   * <p>
40   * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
41   * and a logical flag indicating if all languages must be in play. A rule matches if:
42   * </p>
43   * <ul>
44   * <li>the pattern matches at the current position</li>
45   * <li>the string up until the beginning of the pattern matches the left context</li>
46   * <li>the string from the end of the pattern matches the right context</li>
47   * <li>logical is ALL and all languages are in scope; or</li>
48   * <li>logical is any other value and at least one language is in scope</li>
49   * </ul>
50   * <p>
51   * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
52   * to explicitly construct their own.
53   * </p>
54   * <p>
55   * Rules are immutable and thread-safe.
56   * </p>
57   * <h2>Rules resources</h2>
58   * <p>
59   * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
60   * named following the pattern:
61   * </p>
62   * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
63   * <p>
64   * The format of these resources is the following:
65   * </p>
66   * <ul>
67   * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
68   * will be interpreted as:
69   * <ol>
70   * <li>pattern</li>
71   * <li>left context</li>
72   * <li>right context</li>
73   * <li>phoneme</li>
74   * </ol>
75   * </li>
76   * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
77   * as a comment.</li>
78   * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
79   * all content until a line ending in '*' and '/' is found.</li>
80   * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
81   * </ul>
82   *
83   * @since 1.6
84   */
85  public class Rule {
86  
87      public static final class Phoneme implements PhonemeExpr {
88  
89          public static final Comparator<Phoneme> COMPARATOR = (o1, o2) -> {
90              final int o1Length = o1.phonemeText.length();
91              final int o2Length = o2.phonemeText.length();
92              for (int i = 0; i < o1Length; i++) {
93                  if (i >= o2Length) {
94                      return +1;
95                  }
96                  final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
97                  if (c != 0) {
98                      return c;
99                  }
100             }
101 
102             if (o1Length < o2Length) {
103                 return -1;
104             }
105 
106             return 0;
107         };
108         private final StringBuilder phonemeText;
109 
110         private final Languages.LanguageSet languages;
111 
112         public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
113             this.phonemeText = new StringBuilder(phonemeText);
114             this.languages = languages;
115         }
116 
117         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight) {
118             this(phonemeLeft.phonemeText, phonemeLeft.languages);
119             this.phonemeText.append(phonemeRight.phonemeText);
120         }
121 
122         public Phoneme(final Phoneme phonemeLeft, final Phoneme phonemeRight, final Languages.LanguageSet languages) {
123             this(phonemeLeft.phonemeText, languages);
124             this.phonemeText.append(phonemeRight.phonemeText);
125         }
126 
127         public Phoneme append(final CharSequence str) {
128             this.phonemeText.append(str);
129             return this;
130         }
131 
132         public Languages.LanguageSet getLanguages() {
133             return this.languages;
134         }
135 
136         @Override
137         public Iterable<Phoneme> getPhonemes() {
138             return Collections.singleton(this);
139         }
140 
141         public CharSequence getPhonemeText() {
142             return this.phonemeText;
143         }
144 
145         /**
146          * Deprecated since 1.9.
147          *
148          * @param right the Phoneme to join
149          * @return a new Phoneme
150          * @deprecated since 1.9
151          */
152         @Deprecated
153         public Phoneme join(final Phoneme right) {
154             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
155                                this.languages.restrictTo(right.languages));
156         }
157 
158         /**
159          * Returns a new Phoneme with the same text but a union of its
160          * current language set and the given one.
161          *
162          * @param lang the language set to merge
163          * @return a new Phoneme
164          */
165         public Phoneme mergeWithLanguage(final LanguageSet lang) {
166           return new Phoneme(this.phonemeText.toString(), this.languages.merge(lang));
167         }
168 
169         @Override
170         public int size() {
171             return 1;
172         }
173 
174         @Override
175         public String toString() {
176           return phonemeText.toString() + "[" + languages + "]";
177         }
178     }
179 
180     public interface PhonemeExpr {
181         Iterable<Phoneme> getPhonemes();
182 
183         /**
184          * Gets the expression size in phonemes.
185          *
186          * @return the expression size in phonemes.
187          * @since 1.17.0
188          */
189         default int size() {
190             // All implementations are int-bound.
191             return (int) Math.min(getPhonemes().spliterator().getExactSizeIfKnown(), Integer.MAX_VALUE);
192         }
193     }
194 
195     public static final class PhonemeList implements PhonemeExpr {
196 
197         private final List<Phoneme> phonemeList;
198 
199         public PhonemeList(final List<Phoneme> phonemes) {
200             this.phonemeList = phonemes;
201         }
202 
203         @Override
204         public List<Phoneme> getPhonemes() {
205             return phonemeList;
206         }
207 
208         @Override
209         public int size() {
210             return phonemeList.size();
211         }
212     }
213 
214     /**
215      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
216      */
217     public interface RPattern {
218         boolean isMatch(CharSequence input);
219     }
220 
221     public static final RPattern ALL_STRINGS_RMATCHER = input -> true;
222 
223     public static final String ALL = "ALL";
224 
225     private static final String DOUBLE_QUOTE = "\"";
226 
227     private static final String HASH_INCLUDE = "#include";
228 
229     private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
230 
231     private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
232             new EnumMap<>(NameType.class);
233 
234     static {
235         for (final NameType s : NameType.values()) {
236             final Map<RuleType, Map<String, Map<String, List<Rule>>>> rts =
237                     new EnumMap<>(RuleType.class);
238 
239             for (final RuleType rt : RuleType.values()) {
240                 final Map<String, Map<String, List<Rule>>> rs = new HashMap<>();
241 
242                 final Languages ls = Languages.getInstance(s);
243                 ls.getLanguages().forEach(l -> {
244                     try (final Scanner scanner = createScanner(s, rt, l)) {
245                         rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
246                     } catch (final IllegalStateException e) {
247                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
248                     }
249                 });
250                 if (!rt.equals(RuleType.RULES)) {
251                     try (final Scanner scanner = createScanner(s, rt, "common")) {
252                         rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
253                     }
254                 }
255 
256                 rts.put(rt, Collections.unmodifiableMap(rs));
257             }
258 
259             RULES.put(s, Collections.unmodifiableMap(rts));
260         }
261     }
262 
263     private static boolean contains(final CharSequence chars, final char input) {
264         return chars.chars().anyMatch(c -> c == input);
265     }
266 
267     private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
268         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
269                              nameType.getName(), rt.getName(), lang);
270     }
271 
272     @SuppressWarnings("resource") // Closing the Scanner closes the resource
273     private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
274         final String resName = createResourceName(nameType, rt, lang);
275         return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
276     }
277 
278     @SuppressWarnings("resource") // Closing the Scanner closes the resource
279     private static Scanner createScanner(final String lang) {
280         final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
281         return new Scanner(Resources.getInputStream(resName), ResourceConstants.ENCODING);
282     }
283 
284     private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
285         final int suffixLength = suffix.length();
286         final int inputLength = input.length();
287 
288         if (suffixLength > inputLength) {
289             return false;
290         }
291         for (int i = inputLength - 1, j = suffixLength - 1; j >= 0; i--, j--) {
292             if (input.charAt(i) != suffix.charAt(j)) {
293                 return false;
294             }
295         }
296         return true;
297     }
298 
299     /**
300      * Gets rules for a combination of name type, rule type and languages.
301      *
302      * @param nameType
303      *            the NameType to consider
304      * @param rt
305      *            the RuleType to consider
306      * @param langs
307      *            the set of languages to consider
308      * @return a list of Rules that apply
309      */
310     public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
311                                          final Languages.LanguageSet langs) {
312         final Map<String, List<Rule>> ruleMap = getInstanceMap(nameType, rt, langs);
313         final List<Rule> allRules = new ArrayList<>();
314         ruleMap.values().forEach(rules -> allRules.addAll(rules));
315         return allRules;
316     }
317 
318     /**
319      * Gets rules for a combination of name type, rule type and a single language.
320      *
321      * @param nameType
322      *            the NameType to consider
323      * @param rt
324      *            the RuleType to consider
325      * @param lang
326      *            the language to consider
327      * @return a list of Rules that apply
328      */
329     public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
330         return getInstance(nameType, rt, LanguageSet.from(new HashSet<>(Arrays.asList(lang))));
331     }
332 
333     /**
334      * Gets rules for a combination of name type, rule type and languages.
335      *
336      * @param nameType
337      *            the NameType to consider
338      * @param rt
339      *            the RuleType to consider
340      * @param langs
341      *            the set of languages to consider
342      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
343      * @since 1.9
344      */
345     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
346                                                          final Languages.LanguageSet langs) {
347         return langs.isSingleton() ? getInstanceMap(nameType, rt, langs.getAny()) :
348                                      getInstanceMap(nameType, rt, Languages.ANY);
349     }
350 
351     /**
352      * Gets rules for a combination of name type, rule type and a single language.
353      *
354      * @param nameType
355      *            the NameType to consider
356      * @param rt
357      *            the RuleType to consider
358      * @param lang
359      *            the language to consider
360      * @return a map containing all Rules that apply, grouped by the first character of the rule pattern
361      * @since 1.9
362      */
363     public static Map<String, List<Rule>> getInstanceMap(final NameType nameType, final RuleType rt,
364                                                          final String lang) {
365         final Map<String, List<Rule>> rules = RULES.get(nameType).get(rt).get(lang);
366 
367         if (rules == null) {
368             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
369                                                nameType.getName(), rt.getName(), lang));
370         }
371 
372         return rules;
373     }
374 
375     private static Phoneme parsePhoneme(final String ph) {
376         final int open = ph.indexOf("[");
377         if (open >= 0) {
378             if (!ph.endsWith("]")) {
379                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
380             }
381             final String before = ph.substring(0, open);
382             final String in = ph.substring(open + 1, ph.length() - 1);
383             final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
384 
385             return new Phoneme(before, Languages.LanguageSet.from(langs));
386         }
387         return new Phoneme(ph, Languages.ANY_LANGUAGE);
388     }
389 
390     private static PhonemeExpr parsePhonemeExpr(final String ph) {
391         if (ph.startsWith("(")) { // we have a bracketed list of options
392             if (!ph.endsWith(")")) {
393                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
394             }
395 
396             final List<Phoneme> phs = new ArrayList<>();
397             final String body = ph.substring(1, ph.length() - 1);
398             for (final String part : body.split("[|]")) {
399                 phs.add(parsePhoneme(part));
400             }
401             if (body.startsWith("|") || body.endsWith("|")) {
402                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
403             }
404 
405             return new PhonemeList(phs);
406         }
407         return parsePhoneme(ph);
408     }
409 
410     private static Map<String, List<Rule>> parseRules(final Scanner scanner, final String location) {
411         final Map<String, List<Rule>> lines = new HashMap<>();
412         int currentLine = 0;
413 
414         boolean inMultilineComment = false;
415         while (scanner.hasNextLine()) {
416             currentLine++;
417             final String rawLine = scanner.nextLine();
418             String line = rawLine;
419 
420             if (inMultilineComment) {
421                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
422                     inMultilineComment = false;
423                 }
424             } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
425                 inMultilineComment = true;
426             } else {
427                 // discard comments
428                 final int cmtI = line.indexOf(ResourceConstants.CMT);
429                 if (cmtI >= 0) {
430                     line = line.substring(0, cmtI);
431                 }
432 
433                 // trim leading-trailing whitespace
434                 line = line.trim();
435 
436                 if (line.isEmpty()) {
437                     continue; // empty lines can be safely skipped
438                 }
439 
440                 if (line.startsWith(HASH_INCLUDE)) {
441                     // include statement
442                     final String incl = line.substring(HASH_INCLUDE_LENGTH).trim();
443                     if (incl.contains(" ")) {
444                         throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
445                                                            location);
446                     }
447                     try (final Scanner hashIncludeScanner = createScanner(incl)) {
448                         lines.putAll(parseRules(hashIncludeScanner, location + "->" + incl));
449                     }
450                 } else {
451                     // rule
452                     final String[] parts = line.split("\\s+");
453                     if (parts.length != 4) {
454                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
455                                                            " parts: " + rawLine + " in " + location);
456                     }
457                     try {
458                         final String pat = stripQuotes(parts[0]);
459                         final String lCon = stripQuotes(parts[1]);
460                         final String rCon = stripQuotes(parts[2]);
461                         final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
462                         final int cLine = currentLine;
463                         final Rule r = new Rule(pat, lCon, rCon, ph) {
464                             private final int myLine = cLine;
465                             private final String loc = location;
466 
467                             @Override
468                             public String toString() {
469                                 final StringBuilder sb = new StringBuilder();
470                                 sb.append("Rule");
471                                 sb.append("{line=").append(myLine);
472                                 sb.append(", loc='").append(loc).append('\'');
473                                 sb.append(", pat='").append(pat).append('\'');
474                                 sb.append(", lcon='").append(lCon).append('\'');
475                                 sb.append(", rcon='").append(rCon).append('\'');
476                                 sb.append('}');
477                                 return sb.toString();
478                             }
479                         };
480                         final String patternKey = r.pattern.substring(0, 1);
481                         final List<Rule> rules = lines.computeIfAbsent(patternKey, k -> new ArrayList<>());
482                         rules.add(r);
483                     } catch (final IllegalArgumentException e) {
484                         throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
485                                                         location, e);
486                     }
487                 }
488             }
489         }
490 
491         return lines;
492     }
493 
494     /**
495      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
496      *
497      * @param regex
498      *            the regular expression to compile
499      * @return an RPattern that will match this regex
500      */
501     private static RPattern pattern(final String regex) {
502         final boolean startsWith = regex.startsWith("^");
503         final boolean endsWith = regex.endsWith("$");
504         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
505         final boolean boxes = content.contains("[");
506 
507         if (!boxes) {
508             if (startsWith && endsWith) {
509                 // exact match
510                 if (content.isEmpty()) {
511                     // empty
512                     return input -> input.length() == 0;
513                 }
514                 return input -> input.equals(content);
515             }
516             if ((startsWith || endsWith) && content.isEmpty()) {
517                 // matches every string
518                 return ALL_STRINGS_RMATCHER;
519             }
520             if (startsWith) {
521                 // matches from start
522                 return input -> startsWith(input, content);
523             }
524             if (endsWith) {
525                 // matches from start
526                 return input -> endsWith(input, content);
527             }
528         } else {
529             final boolean startsWithBox = content.startsWith("[");
530             final boolean endsWithBox = content.endsWith("]");
531 
532             if (startsWithBox && endsWithBox) {
533                 String boxContent = content.substring(1, content.length() - 1);
534                 if (!boxContent.contains("[")) {
535                     // box containing alternatives
536                     final boolean negate = boxContent.startsWith("^");
537                     if (negate) {
538                         boxContent = boxContent.substring(1);
539                     }
540                     final String bContent = boxContent;
541                     final boolean shouldMatch = !negate;
542 
543                     if (startsWith && endsWith) {
544                         // exact match
545                         return input -> input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
546                     }
547                     if (startsWith) {
548                         // first char
549                         return input -> input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
550                     }
551                     if (endsWith) {
552                         // last char
553                         return input -> input.length() > 0 &&
554                                contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
555                     }
556                 }
557             }
558         }
559 
560         return new RPattern() {
561             final Pattern pattern = Pattern.compile(regex);
562 
563             @Override
564             public boolean isMatch(final CharSequence input) {
565                 final Matcher matcher = pattern.matcher(input);
566                 return matcher.find();
567             }
568         };
569     }
570 
571     private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
572         if (prefix.length() > input.length()) {
573             return false;
574         }
575         for (int i = 0; i < prefix.length(); i++) {
576             if (input.charAt(i) != prefix.charAt(i)) {
577                 return false;
578             }
579         }
580         return true;
581     }
582 
583     private static String stripQuotes(String str) {
584         if (str.startsWith(DOUBLE_QUOTE)) {
585             str = str.substring(1);
586         }
587 
588         if (str.endsWith(DOUBLE_QUOTE)) {
589             str = str.substring(0, str.length() - 1);
590         }
591 
592         return str;
593     }
594 
595     private final RPattern lContext;
596 
597     private final String pattern;
598 
599     private final PhonemeExpr phoneme;
600 
601     private final RPattern rContext;
602 
603     /**
604      * Creates a new rule.
605      *
606      * @param pattern
607      *            the pattern
608      * @param lContext
609      *            the left context
610      * @param rContext
611      *            the right context
612      * @param phoneme
613      *            the resulting phoneme
614      */
615     public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
616         this.pattern = pattern;
617         this.lContext = pattern(lContext + "$");
618         this.rContext = pattern("^" + rContext);
619         this.phoneme = phoneme;
620     }
621 
622     /**
623      * Gets the left context. This is a regular expression that must match to the left of the pattern.
624      *
625      * @return the left context Pattern
626      */
627     public RPattern getLContext() {
628         return this.lContext;
629     }
630 
631     /**
632      * Gets the pattern. This is a string-literal that must exactly match.
633      *
634      * @return the pattern
635      */
636     public String getPattern() {
637         return this.pattern;
638     }
639 
640     /**
641      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
642      *
643      * @return the phoneme
644      */
645     public PhonemeExpr getPhoneme() {
646         return this.phoneme;
647     }
648 
649     /**
650      * Gets the right context. This is a regular expression that must match to the right of the pattern.
651      *
652      * @return the right context Pattern
653      */
654     public RPattern getRContext() {
655         return this.rContext;
656     }
657 
658     /**
659      * Decides if the pattern and context match the input starting at a position. It is a match if the
660      * {@code lContext} matches {@code input} up to {@code i}, {@code pattern} matches at i and
661      * {@code rContext} matches from the end of the match of {@code pattern} to the end of {@code input}.
662      *
663      * @param input
664      *            the input String
665      * @param i
666      *            the int position within the input
667      * @return true if the pattern and left/right context match, false otherwise
668      */
669     public boolean patternAndContextMatches(final CharSequence input, final int i) {
670         if (i < 0) {
671             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
672         }
673 
674         final int patternLength = this.pattern.length();
675         final int ipl = i + patternLength;
676 
677         if (ipl > input.length()) {
678             // not enough room for the pattern to match
679             return false;
680         }
681 
682         // evaluate the pattern, left context and right context
683         // fail early if any of the evaluations is not successful
684         if (!input.subSequence(i, ipl).equals(this.pattern)) {
685             return false;
686         }
687         if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
688             return false;
689         }
690         return this.lContext.isMatch(input.subSequence(0, i));
691     }
692 }