View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.io.InputStream;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.EnumMap;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Scanner;
31  import java.util.Set;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  /**
36   * A phoneme rule.
37   * <p>
38   * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
39   * and a logical flag indicating if all languages must be in play. A rule matches if:
40   * <ul>
41   * <li>the pattern matches at the current position</li>
42   * <li>the string up until the beginning of the pattern matches the left context</li>
43   * <li>the string from the end of the pattern matches the right context</li>
44   * <li>logical is ALL and all languages are in scope; or</li>
45   * <li>logical is any other value and at least one language is in scope</li>
46   * </ul>
47   * <p>
48   * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
49   * to explicitly construct their own.
50   * <p>
51   * Rules are immutable and thread-safe.
52   * <p>
53   * <b>Rules resources</b>
54   * <p>
55   * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
56   * named following the pattern:
57   * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
58   * <p>
59   * The format of these resources is the following:
60   * <ul>
61   * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
62   * will be interpreted as:
63   * <ol>
64   * <li>pattern</li>
65   * <li>left context</li>
66   * <li>right context</li>
67   * <li>phoneme</li>
68   * </ol>
69   * </li>
70   * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
71   * as a comment.</li>
72   * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
73   * all content until a line ending in '*' and '/' is found.</li>
74   * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
75   * </ul>
76   *
77   * @since 1.6
78   * @version $Id: Rule.html 889935 2013-12-11 05:05:13Z ggregory $
79   */
80  public class Rule {
81  
82      public static final class Phoneme implements PhonemeExpr {
83          public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
84              @Override
85              public int compare(Phoneme o1, Phoneme o2) {
86                  for (int i = 0; i < o1.phonemeText.length(); i++) {
87                      if (i >= o2.phonemeText.length()) {
88                          return +1;
89                      }
90                      int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
91                      if (c != 0) {
92                          return c;
93                      }
94                  }
95  
96                  if (o1.phonemeText.length() < o2.phonemeText.length()) {
97                      return -1;
98                  }
99  
100                 return 0;
101             }
102         };
103 
104         private final CharSequence phonemeText;
105         private final Languages.LanguageSet languages;
106 
107         public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) {
108             this.phonemeText = phonemeText;
109             this.languages = languages;
110         }
111 
112         public Phoneme append(CharSequence str) {
113             return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114         }
115 
116         public Languages.LanguageSet getLanguages() {
117             return this.languages;
118         }
119 
120         @Override
121         public Iterable<Phoneme> getPhonemes() {
122             return Collections.singleton(this);
123         }
124 
125         public CharSequence getPhonemeText() {
126             return this.phonemeText;
127         }
128 
129         public Phoneme join(Phoneme right) {
130             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131                                this.languages.restrictTo(right.languages));
132         }
133     }
134 
135     public interface PhonemeExpr {
136         Iterable<Phoneme> getPhonemes();
137     }
138 
139     public static final class PhonemeList implements PhonemeExpr {
140         private final List<Phoneme> phonemes;
141 
142         public PhonemeList(List<Phoneme> phonemes) {
143             this.phonemes = phonemes;
144         }
145 
146         @Override
147         public List<Phoneme> getPhonemes() {
148             return this.phonemes;
149         }
150     }
151 
152     /**
153      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154      */
155     public static interface RPattern {
156         boolean isMatch(CharSequence input);
157     }
158 
159     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160         @Override
161         public boolean isMatch(CharSequence input) {
162             return true;
163         }
164     };
165 
166     public static final String ALL = "ALL";
167 
168     private static final String DOUBLE_QUOTE = "\"";
169 
170     private static final String HASH_INCLUDE = "#include";
171 
172     private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173             new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174 
175     static {
176         for (NameType s : NameType.values()) {
177             Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
178 
179             for (RuleType rt : RuleType.values()) {
180                 Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
181 
182                 Languages ls = Languages.getInstance(s);
183                 for (String l : ls.getLanguages()) {
184                     try {
185                         rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
186                     } catch (IllegalStateException e) {
187                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
188                     }
189                 }
190                 if (!rt.equals(RuleType.RULES)) {
191                     rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
192                 }
193 
194                 rts.put(rt, Collections.unmodifiableMap(rs));
195             }
196 
197             RULES.put(s, Collections.unmodifiableMap(rts));
198         }
199     }
200 
201     private static boolean contains(CharSequence chars, char input) {
202         for (int i = 0; i < chars.length(); i++) {
203             if (chars.charAt(i) == input) {
204                 return true;
205             }
206         }
207         return false;
208     }
209 
210     private static String createResourceName(NameType nameType, RuleType rt, String lang) {
211         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
212                              nameType.getName(), rt.getName(), lang);
213     }
214 
215     private static Scanner createScanner(NameType nameType, RuleType rt, String lang) {
216         String resName = createResourceName(nameType, rt, lang);
217         InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
218 
219         if (rulesIS == null) {
220             throw new IllegalArgumentException("Unable to load resource: " + resName);
221         }
222 
223         return new Scanner(rulesIS, ResourceConstants.ENCODING);
224     }
225 
226     private static Scanner createScanner(String lang) {
227         String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
228         InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
229 
230         if (rulesIS == null) {
231             throw new IllegalArgumentException("Unable to load resource: " + resName);
232         }
233 
234         return new Scanner(rulesIS, ResourceConstants.ENCODING);
235     }
236 
237     private static boolean endsWith(CharSequence input, CharSequence suffix) {
238         if (suffix.length() > input.length()) {
239             return false;
240         }
241         for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
242             if (input.charAt(i) != suffix.charAt(j)) {
243                 return false;
244             }
245         }
246         return true;
247     }
248 
249     /**
250      * Gets rules for a combination of name type, rule type and languages.
251      *
252      * @param nameType
253      *            the NameType to consider
254      * @param rt
255      *            the RuleType to consider
256      * @param langs
257      *            the set of languages to consider
258      * @return a list of Rules that apply
259      */
260     public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) {
261         return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
262                                      getInstance(nameType, rt, Languages.ANY);
263     }
264 
265     /**
266      * Gets rules for a combination of name type, rule type and a single language.
267      *
268      * @param nameType
269      *            the NameType to consider
270      * @param rt
271      *            the RuleType to consider
272      * @param lang
273      *            the language to consider
274      * @return a list rules for a combination of name type, rule type and a single language.
275      */
276     public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) {
277         List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
278 
279         if (rules == null) {
280             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
281                                                nameType.getName(), rt.getName(), lang));
282         }
283 
284         return rules;
285     }
286 
287     private static Phoneme parsePhoneme(String ph) {
288         int open = ph.indexOf("[");
289         if (open >= 0) {
290             if (!ph.endsWith("]")) {
291                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
292             }
293             String before = ph.substring(0, open);
294             String in = ph.substring(open + 1, ph.length() - 1);
295             Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
296 
297             return new Phoneme(before, Languages.LanguageSet.from(langs));
298         } else {
299             return new Phoneme(ph, Languages.ANY_LANGUAGE);
300         }
301     }
302 
303     private static PhonemeExpr parsePhonemeExpr(String ph) {
304         if (ph.startsWith("(")) { // we have a bracketed list of options
305             if (!ph.endsWith(")")) {
306                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
307             }
308 
309             List<Phoneme> phs = new ArrayList<Phoneme>();
310             String body = ph.substring(1, ph.length() - 1);
311             for (String part : body.split("[|]")) {
312                 phs.add(parsePhoneme(part));
313             }
314             if (body.startsWith("|") || body.endsWith("|")) {
315                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
316             }
317 
318             return new PhonemeList(phs);
319         } else {
320             return parsePhoneme(ph);
321         }
322     }
323 
324     private static List<Rule> parseRules(final Scanner scanner, final String location) {
325         List<Rule> lines = new ArrayList<Rule>();
326         int currentLine = 0;
327 
328         boolean inMultilineComment = false;
329         while (scanner.hasNextLine()) {
330             currentLine++;
331             String rawLine = scanner.nextLine();
332             String line = rawLine;
333 
334             if (inMultilineComment) {
335                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
336                     inMultilineComment = false;
337                 }
338             } else {
339                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
340                     inMultilineComment = true;
341                 } else {
342                     // discard comments
343                     int cmtI = line.indexOf(ResourceConstants.CMT);
344                     if (cmtI >= 0) {
345                         line = line.substring(0, cmtI);
346                     }
347 
348                     // trim leading-trailing whitespace
349                     line = line.trim();
350 
351                     if (line.length() == 0) {
352                         continue; // empty lines can be safely skipped
353                     }
354 
355                     if (line.startsWith(HASH_INCLUDE)) {
356                         // include statement
357                         String incl = line.substring(HASH_INCLUDE.length()).trim();
358                         if (incl.contains(" ")) {
359                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
360                                                                location);
361                         } else {
362                             lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
363                         }
364                     } else {
365                         // rule
366                         String[] parts = line.split("\\s+");
367                         if (parts.length != 4) {
368                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
369                                                                " parts: " + rawLine + " in " + location);
370                         } else {
371                             try {
372                                 String pat = stripQuotes(parts[0]);
373                                 String lCon = stripQuotes(parts[1]);
374                                 String rCon = stripQuotes(parts[2]);
375                                 PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
376                                 final int cLine = currentLine;
377                                 Rule r = new Rule(pat, lCon, rCon, ph) {
378                                     private final int myLine = cLine;
379                                     private final String loc = location;
380 
381                                     @Override
382                                     public String toString() {
383                                         final StringBuilder sb = new StringBuilder();
384                                         sb.append("Rule");
385                                         sb.append("{line=").append(myLine);
386                                         sb.append(", loc='").append(loc).append('\'');
387                                         sb.append('}');
388                                         return sb.toString();
389                                     }
390                                 };
391                                 lines.add(r);
392                             } catch (IllegalArgumentException e) {
393                                 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
394                                                                 location, e);
395                             }
396                         }
397                     }
398                 }
399             }
400         }
401 
402         return lines;
403     }
404 
405     /**
406      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
407      *
408      * @param regex
409      *            the regular expression to compile
410      * @return an RPattern that will match this regex
411      */
412     private static RPattern pattern(final String regex) {
413         boolean startsWith = regex.startsWith("^");
414         boolean endsWith = regex.endsWith("$");
415         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
416         boolean boxes = content.contains("[");
417 
418         if (!boxes) {
419             if (startsWith && endsWith) {
420                 // exact match
421                 if (content.length() == 0) {
422                     // empty
423                     return new RPattern() {
424                         @Override
425                         public boolean isMatch(CharSequence input) {
426                             return input.length() == 0;
427                         }
428                     };
429                 } else {
430                     return new RPattern() {
431                         @Override
432                         public boolean isMatch(CharSequence input) {
433                             return input.equals(content);
434                         }
435                     };
436                 }
437             } else if ((startsWith || endsWith) && content.length() == 0) {
438                 // matches every string
439                 return ALL_STRINGS_RMATCHER;
440             } else if (startsWith) {
441                 // matches from start
442                 return new RPattern() {
443                     @Override
444                     public boolean isMatch(CharSequence input) {
445                         return startsWith(input, content);
446                     }
447                 };
448             } else if (endsWith) {
449                 // matches from start
450                 return new RPattern() {
451                     @Override
452                     public boolean isMatch(CharSequence input) {
453                         return endsWith(input, content);
454                     }
455                 };
456             }
457         } else {
458             boolean startsWithBox = content.startsWith("[");
459             boolean endsWithBox = content.endsWith("]");
460 
461             if (startsWithBox && endsWithBox) {
462                 String boxContent = content.substring(1, content.length() - 1);
463                 if (!boxContent.contains("[")) {
464                     // box containing alternatives
465                     boolean negate = boxContent.startsWith("^");
466                     if (negate) {
467                         boxContent = boxContent.substring(1);
468                     }
469                     final String bContent = boxContent;
470                     final boolean shouldMatch = !negate;
471 
472                     if (startsWith && endsWith) {
473                         // exact match
474                         return new RPattern() {
475                             @Override
476                             public boolean isMatch(CharSequence input) {
477                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
478                             }
479                         };
480                     } else if (startsWith) {
481                         // first char
482                         return new RPattern() {
483                             @Override
484                             public boolean isMatch(CharSequence input) {
485                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
486                             }
487                         };
488                     } else if (endsWith) {
489                         // last char
490                         return new RPattern() {
491                             @Override
492                             public boolean isMatch(CharSequence input) {
493                                 return input.length() > 0 &&
494                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
495                             }
496                         };
497                     }
498                 }
499             }
500         }
501 
502         return new RPattern() {
503             Pattern pattern = Pattern.compile(regex);
504 
505             @Override
506             public boolean isMatch(CharSequence input) {
507                 Matcher matcher = pattern.matcher(input);
508                 return matcher.find();
509             }
510         };
511     }
512 
513     private static boolean startsWith(CharSequence input, CharSequence prefix) {
514         if (prefix.length() > input.length()) {
515             return false;
516         }
517         for (int i = 0; i < prefix.length(); i++) {
518             if (input.charAt(i) != prefix.charAt(i)) {
519                 return false;
520             }
521         }
522         return true;
523     }
524 
525     private static String stripQuotes(String str) {
526         if (str.startsWith(DOUBLE_QUOTE)) {
527             str = str.substring(1);
528         }
529 
530         if (str.endsWith(DOUBLE_QUOTE)) {
531             str = str.substring(0, str.length() - 1);
532         }
533 
534         return str;
535     }
536 
537     private final RPattern lContext;
538 
539     private final String pattern;
540 
541     private final PhonemeExpr phoneme;
542 
543     private final RPattern rContext;
544 
545     /**
546      * Creates a new rule.
547      *
548      * @param pattern
549      *            the pattern
550      * @param lContext
551      *            the left context
552      * @param rContext
553      *            the right context
554      * @param phoneme
555      *            the resulting phoneme
556      */
557     public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) {
558         this.pattern = pattern;
559         this.lContext = pattern(lContext + "$");
560         this.rContext = pattern("^" + rContext);
561         this.phoneme = phoneme;
562     }
563 
564     /**
565      * Gets the left context. This is a regular expression that must match to the left of the pattern.
566      *
567      * @return the left context Pattern
568      */
569     public RPattern getLContext() {
570         return this.lContext;
571     }
572 
573     /**
574      * Gets the pattern. This is a string-literal that must exactly match.
575      *
576      * @return the pattern
577      */
578     public String getPattern() {
579         return this.pattern;
580     }
581 
582     /**
583      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
584      *
585      * @return the phoneme
586      */
587     public PhonemeExpr getPhoneme() {
588         return this.phoneme;
589     }
590 
591     /**
592      * Gets the right context. This is a regular expression that must match to the right of the pattern.
593      *
594      * @return the right context Pattern
595      */
596     public RPattern getRContext() {
597         return this.rContext;
598     }
599 
600     /**
601      * Decides if the pattern and context match the input starting at a position. It is a match if the
602      * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
603      * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
604      *
605      * @param input
606      *            the input String
607      * @param i
608      *            the int position within the input
609      * @return true if the pattern and left/right context match, false otherwise
610      */
611     public boolean patternAndContextMatches(CharSequence input, int i) {
612         if (i < 0) {
613             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
614         }
615 
616         int patternLength = this.pattern.length();
617         int ipl = i + patternLength;
618 
619         if (ipl > input.length()) {
620             // not enough room for the pattern to match
621             return false;
622         }
623 
624         // evaluate the pattern, left context and right context
625         // fail early if any of the evaluations is not successful
626         if (!input.subSequence(i, ipl).equals(this.pattern)) {
627             return false;
628         } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
629             return false;
630         }
631         return this.lContext.isMatch(input.subSequence(0, i));
632     }
633 }