View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.io.InputStream;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.EnumMap;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Scanner;
31  import java.util.Set;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  /**
36   * A phoneme rule.
37   * <p>
38   * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
39   * and a logical flag indicating if all languages must be in play. A rule matches if:
40   * <ul>
41   * <li>the pattern matches at the current position</li>
42   * <li>the string up until the beginning of the pattern matches the left context</li>
43   * <li>the string from the end of the pattern matches the right context</li>
44   * <li>logical is ALL and all languages are in scope; or</li>
45   * <li>logical is any other value and at least one language is in scope</li>
46   * </ul>
47   * <p>
48   * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
49   * to explicitly construct their own.
50   * <p>
51   * Rules are immutable and thread-safe.
52   * <p>
53   * <b>Rules resources</b>
54   * <p>
55   * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
56   * named following the pattern:
57   * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
58   * <p>
59   * The format of these resources is the following:
60   * <ul>
61   * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
62   * will be interpreted as:
63   * <ol>
64   * <li>pattern</li>
65   * <li>left context</li>
66   * <li>right context</li>
67   * <li>phoneme</li>
68   * </ol>
69   * </li>
70   * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
71   * as a comment.</li>
72   * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
73   * all content until a line ending in '*' and '/' is found.</li>
74   * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
75   * </ul>
76   *
77   * @since 1.6
78   * @version $Id: Rule.html 889935 2013-12-11 05:05:13Z ggregory $
79   */
80  public class Rule {
81  
82      public static final class Phoneme implements PhonemeExpr {
83          public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
84              @Override
85              public int compare(final Phoneme o1, final Phoneme o2) {
86                  for (int i = 0; i < o1.phonemeText.length(); i++) {
87                      if (i >= o2.phonemeText.length()) {
88                          return +1;
89                      }
90                      final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
91                      if (c != 0) {
92                          return c;
93                      }
94                  }
95  
96                  if (o1.phonemeText.length() < o2.phonemeText.length()) {
97                      return -1;
98                  }
99  
100                 return 0;
101             }
102         };
103 
104         private final CharSequence phonemeText;
105         private final Languages.LanguageSet languages;
106 
107         public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
108             this.phonemeText = phonemeText;
109             this.languages = languages;
110         }
111 
112         public Phoneme append(final CharSequence str) {
113             return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114         }
115 
116         public Languages.LanguageSet getLanguages() {
117             return this.languages;
118         }
119 
120         @Override
121         public Iterable<Phoneme> getPhonemes() {
122             return Collections.singleton(this);
123         }
124 
125         public CharSequence getPhonemeText() {
126             return this.phonemeText;
127         }
128 
129         public Phoneme join(final Phoneme right) {
130             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131                                this.languages.restrictTo(right.languages));
132         }
133     }
134 
135     public interface PhonemeExpr {
136         Iterable<Phoneme> getPhonemes();
137     }
138 
139     public static final class PhonemeList implements PhonemeExpr {
140         private final List<Phoneme> phonemes;
141 
142         public PhonemeList(final List<Phoneme> phonemes) {
143             this.phonemes = phonemes;
144         }
145 
146         @Override
147         public List<Phoneme> getPhonemes() {
148             return this.phonemes;
149         }
150     }
151 
152     /**
153      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154      */
155     public static interface RPattern {
156         boolean isMatch(CharSequence input);
157     }
158 
159     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160         @Override
161         public boolean isMatch(final CharSequence input) {
162             return true;
163         }
164     };
165 
166     public static final String ALL = "ALL";
167 
168     private static final String DOUBLE_QUOTE = "\"";
169 
170     private static final String HASH_INCLUDE = "#include";
171 
172     private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173             new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174 
175     static {
176         for (final NameType s : NameType.values()) {
177             final Map<RuleType, Map<String, List<Rule>>> rts =
178                     new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
179 
180             for (final RuleType rt : RuleType.values()) {
181                 final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
182 
183                 final Languages ls = Languages.getInstance(s);
184                 for (final String l : ls.getLanguages()) {
185                     try {
186                         rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
187                     } catch (final IllegalStateException e) {
188                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
189                     }
190                 }
191                 if (!rt.equals(RuleType.RULES)) {
192                     rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
193                 }
194 
195                 rts.put(rt, Collections.unmodifiableMap(rs));
196             }
197 
198             RULES.put(s, Collections.unmodifiableMap(rts));
199         }
200     }
201 
202     private static boolean contains(final CharSequence chars, final char input) {
203         for (int i = 0; i < chars.length(); i++) {
204             if (chars.charAt(i) == input) {
205                 return true;
206             }
207         }
208         return false;
209     }
210 
211     private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
212         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
213                              nameType.getName(), rt.getName(), lang);
214     }
215 
216     private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
217         final String resName = createResourceName(nameType, rt, lang);
218         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
219 
220         if (rulesIS == null) {
221             throw new IllegalArgumentException("Unable to load resource: " + resName);
222         }
223 
224         return new Scanner(rulesIS, ResourceConstants.ENCODING);
225     }
226 
227     private static Scanner createScanner(final String lang) {
228         final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
229         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
230 
231         if (rulesIS == null) {
232             throw new IllegalArgumentException("Unable to load resource: " + resName);
233         }
234 
235         return new Scanner(rulesIS, ResourceConstants.ENCODING);
236     }
237 
238     private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
239         if (suffix.length() > input.length()) {
240             return false;
241         }
242         for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
243             if (input.charAt(i) != suffix.charAt(j)) {
244                 return false;
245             }
246         }
247         return true;
248     }
249 
250     /**
251      * Gets rules for a combination of name type, rule type and languages.
252      *
253      * @param nameType
254      *            the NameType to consider
255      * @param rt
256      *            the RuleType to consider
257      * @param langs
258      *            the set of languages to consider
259      * @return a list of Rules that apply
260      */
261     public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
262                                          final Languages.LanguageSet langs) {
263         return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
264                                      getInstance(nameType, rt, Languages.ANY);
265     }
266 
267     /**
268      * Gets rules for a combination of name type, rule type and a single language.
269      *
270      * @param nameType
271      *            the NameType to consider
272      * @param rt
273      *            the RuleType to consider
274      * @param lang
275      *            the language to consider
276      * @return a list rules for a combination of name type, rule type and a single language.
277      */
278     public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
279         final List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
280 
281         if (rules == null) {
282             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
283                                                nameType.getName(), rt.getName(), lang));
284         }
285 
286         return rules;
287     }
288 
289     private static Phoneme parsePhoneme(final String ph) {
290         final int open = ph.indexOf("[");
291         if (open >= 0) {
292             if (!ph.endsWith("]")) {
293                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
294             }
295             final String before = ph.substring(0, open);
296             final String in = ph.substring(open + 1, ph.length() - 1);
297             final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
298 
299             return new Phoneme(before, Languages.LanguageSet.from(langs));
300         } else {
301             return new Phoneme(ph, Languages.ANY_LANGUAGE);
302         }
303     }
304 
305     private static PhonemeExpr parsePhonemeExpr(final String ph) {
306         if (ph.startsWith("(")) { // we have a bracketed list of options
307             if (!ph.endsWith(")")) {
308                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
309             }
310 
311             final List<Phoneme> phs = new ArrayList<Phoneme>();
312             final String body = ph.substring(1, ph.length() - 1);
313             for (final String part : body.split("[|]")) {
314                 phs.add(parsePhoneme(part));
315             }
316             if (body.startsWith("|") || body.endsWith("|")) {
317                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
318             }
319 
320             return new PhonemeList(phs);
321         } else {
322             return parsePhoneme(ph);
323         }
324     }
325 
326     private static List<Rule> parseRules(final Scanner scanner, final String location) {
327         final List<Rule> lines = new ArrayList<Rule>();
328         int currentLine = 0;
329 
330         boolean inMultilineComment = false;
331         while (scanner.hasNextLine()) {
332             currentLine++;
333             final String rawLine = scanner.nextLine();
334             String line = rawLine;
335 
336             if (inMultilineComment) {
337                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
338                     inMultilineComment = false;
339                 }
340             } else {
341                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
342                     inMultilineComment = true;
343                 } else {
344                     // discard comments
345                     final int cmtI = line.indexOf(ResourceConstants.CMT);
346                     if (cmtI >= 0) {
347                         line = line.substring(0, cmtI);
348                     }
349 
350                     // trim leading-trailing whitespace
351                     line = line.trim();
352 
353                     if (line.length() == 0) {
354                         continue; // empty lines can be safely skipped
355                     }
356 
357                     if (line.startsWith(HASH_INCLUDE)) {
358                         // include statement
359                         final String incl = line.substring(HASH_INCLUDE.length()).trim();
360                         if (incl.contains(" ")) {
361                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
362                                                                location);
363                         } else {
364                             lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
365                         }
366                     } else {
367                         // rule
368                         final String[] parts = line.split("\\s+");
369                         if (parts.length != 4) {
370                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
371                                                                " parts: " + rawLine + " in " + location);
372                         } else {
373                             try {
374                                 final String pat = stripQuotes(parts[0]);
375                                 final String lCon = stripQuotes(parts[1]);
376                                 final String rCon = stripQuotes(parts[2]);
377                                 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
378                                 final int cLine = currentLine;
379                                 final Rule r = new Rule(pat, lCon, rCon, ph) {
380                                     private final int myLine = cLine;
381                                     private final String loc = location;
382 
383                                     @Override
384                                     public String toString() {
385                                         final StringBuilder sb = new StringBuilder();
386                                         sb.append("Rule");
387                                         sb.append("{line=").append(myLine);
388                                         sb.append(", loc='").append(loc).append('\'');
389                                         sb.append('}');
390                                         return sb.toString();
391                                     }
392                                 };
393                                 lines.add(r);
394                             } catch (final IllegalArgumentException e) {
395                                 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
396                                                                 location, e);
397                             }
398                         }
399                     }
400                 }
401             }
402         }
403 
404         return lines;
405     }
406 
407     /**
408      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
409      *
410      * @param regex
411      *            the regular expression to compile
412      * @return an RPattern that will match this regex
413      */
414     private static RPattern pattern(final String regex) {
415         final boolean startsWith = regex.startsWith("^");
416         final boolean endsWith = regex.endsWith("$");
417         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
418         final boolean boxes = content.contains("[");
419 
420         if (!boxes) {
421             if (startsWith && endsWith) {
422                 // exact match
423                 if (content.length() == 0) {
424                     // empty
425                     return new RPattern() {
426                         @Override
427                         public boolean isMatch(final CharSequence input) {
428                             return input.length() == 0;
429                         }
430                     };
431                 } else {
432                     return new RPattern() {
433                         @Override
434                         public boolean isMatch(final CharSequence input) {
435                             return input.equals(content);
436                         }
437                     };
438                 }
439             } else if ((startsWith || endsWith) && content.length() == 0) {
440                 // matches every string
441                 return ALL_STRINGS_RMATCHER;
442             } else if (startsWith) {
443                 // matches from start
444                 return new RPattern() {
445                     @Override
446                     public boolean isMatch(final CharSequence input) {
447                         return startsWith(input, content);
448                     }
449                 };
450             } else if (endsWith) {
451                 // matches from start
452                 return new RPattern() {
453                     @Override
454                     public boolean isMatch(final CharSequence input) {
455                         return endsWith(input, content);
456                     }
457                 };
458             }
459         } else {
460             final boolean startsWithBox = content.startsWith("[");
461             final boolean endsWithBox = content.endsWith("]");
462 
463             if (startsWithBox && endsWithBox) {
464                 String boxContent = content.substring(1, content.length() - 1);
465                 if (!boxContent.contains("[")) {
466                     // box containing alternatives
467                     final boolean negate = boxContent.startsWith("^");
468                     if (negate) {
469                         boxContent = boxContent.substring(1);
470                     }
471                     final String bContent = boxContent;
472                     final boolean shouldMatch = !negate;
473 
474                     if (startsWith && endsWith) {
475                         // exact match
476                         return new RPattern() {
477                             @Override
478                             public boolean isMatch(final CharSequence input) {
479                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
480                             }
481                         };
482                     } else if (startsWith) {
483                         // first char
484                         return new RPattern() {
485                             @Override
486                             public boolean isMatch(final CharSequence input) {
487                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
488                             }
489                         };
490                     } else if (endsWith) {
491                         // last char
492                         return new RPattern() {
493                             @Override
494                             public boolean isMatch(final CharSequence input) {
495                                 return input.length() > 0 &&
496                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
497                             }
498                         };
499                     }
500                 }
501             }
502         }
503 
504         return new RPattern() {
505             Pattern pattern = Pattern.compile(regex);
506 
507             @Override
508             public boolean isMatch(final CharSequence input) {
509                 final Matcher matcher = pattern.matcher(input);
510                 return matcher.find();
511             }
512         };
513     }
514 
515     private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
516         if (prefix.length() > input.length()) {
517             return false;
518         }
519         for (int i = 0; i < prefix.length(); i++) {
520             if (input.charAt(i) != prefix.charAt(i)) {
521                 return false;
522             }
523         }
524         return true;
525     }
526 
527     private static String stripQuotes(String str) {
528         if (str.startsWith(DOUBLE_QUOTE)) {
529             str = str.substring(1);
530         }
531 
532         if (str.endsWith(DOUBLE_QUOTE)) {
533             str = str.substring(0, str.length() - 1);
534         }
535 
536         return str;
537     }
538 
539     private final RPattern lContext;
540 
541     private final String pattern;
542 
543     private final PhonemeExpr phoneme;
544 
545     private final RPattern rContext;
546 
547     /**
548      * Creates a new rule.
549      *
550      * @param pattern
551      *            the pattern
552      * @param lContext
553      *            the left context
554      * @param rContext
555      *            the right context
556      * @param phoneme
557      *            the resulting phoneme
558      */
559     public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
560         this.pattern = pattern;
561         this.lContext = pattern(lContext + "$");
562         this.rContext = pattern("^" + rContext);
563         this.phoneme = phoneme;
564     }
565 
566     /**
567      * Gets the left context. This is a regular expression that must match to the left of the pattern.
568      *
569      * @return the left context Pattern
570      */
571     public RPattern getLContext() {
572         return this.lContext;
573     }
574 
575     /**
576      * Gets the pattern. This is a string-literal that must exactly match.
577      *
578      * @return the pattern
579      */
580     public String getPattern() {
581         return this.pattern;
582     }
583 
584     /**
585      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
586      *
587      * @return the phoneme
588      */
589     public PhonemeExpr getPhoneme() {
590         return this.phoneme;
591     }
592 
593     /**
594      * Gets the right context. This is a regular expression that must match to the right of the pattern.
595      *
596      * @return the right context Pattern
597      */
598     public RPattern getRContext() {
599         return this.rContext;
600     }
601 
602     /**
603      * Decides if the pattern and context match the input starting at a position. It is a match if the
604      * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
605      * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
606      *
607      * @param input
608      *            the input String
609      * @param i
610      *            the int position within the input
611      * @return true if the pattern and left/right context match, false otherwise
612      */
613     public boolean patternAndContextMatches(final CharSequence input, final int i) {
614         if (i < 0) {
615             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
616         }
617 
618         final int patternLength = this.pattern.length();
619         final int ipl = i + patternLength;
620 
621         if (ipl > input.length()) {
622             // not enough room for the pattern to match
623             return false;
624         }
625 
626         // evaluate the pattern, left context and right context
627         // fail early if any of the evaluations is not successful
628         if (!input.subSequence(i, ipl).equals(this.pattern)) {
629             return false;
630         } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
631             return false;
632         }
633         return this.lContext.isMatch(input.subSequence(0, i));
634     }
635 }