001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language.bm;
019
020 import java.io.InputStream;
021 import java.util.ArrayList;
022 import java.util.Arrays;
023 import java.util.Collections;
024 import java.util.Comparator;
025 import java.util.EnumMap;
026 import java.util.HashMap;
027 import java.util.HashSet;
028 import java.util.List;
029 import java.util.Map;
030 import java.util.Scanner;
031 import java.util.Set;
032 import java.util.regex.Matcher;
033 import java.util.regex.Pattern;
034
035 /**
036 * A phoneme rule.
037 * <p>
038 * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
039 * and a logical flag indicating if all languages must be in play. A rule matches if:
040 * <ul>
041 * <li>the pattern matches at the current position</li>
042 * <li>the string up until the beginning of the pattern matches the left context</li>
043 * <li>the string from the end of the pattern matches the right context</li>
044 * <li>logical is ALL and all languages are in scope; or</li>
045 * <li>logical is any other value and at least one language is in scope</li>
046 * </ul>
047 * <p>
048 * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
049 * to explicitly construct their own.
050 * <p>
051 * Rules are immutable and thread-safe.
052 * <p>
053 * <b>Rules resources</b>
054 * <p>
055 * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
056 * named following the pattern:
057 * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
058 * <p>
059 * The format of these resources is the following:
060 * <ul>
061 * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
062 * will be interpreted as:
063 * <ol>
064 * <li>pattern</li>
065 * <li>left context</li>
066 * <li>right context</li>
067 * <li>phoneme</li>
068 * </ol>
069 * </li>
070 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
071 * as a comment.</li>
072 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
073 * all content until a line ending in '*' and '/' is found.</li>
074 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
075 * </ul>
076 *
077 * @since 1.6
078 * @version $Id: Rule.html 889935 2013-12-11 05:05:13Z ggregory $
079 */
080 public class Rule {
081
082 public static final class Phoneme implements PhonemeExpr {
083 public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
084 @Override
085 public int compare(final Phoneme o1, final Phoneme o2) {
086 for (int i = 0; i < o1.phonemeText.length(); i++) {
087 if (i >= o2.phonemeText.length()) {
088 return +1;
089 }
090 final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
091 if (c != 0) {
092 return c;
093 }
094 }
095
096 if (o1.phonemeText.length() < o2.phonemeText.length()) {
097 return -1;
098 }
099
100 return 0;
101 }
102 };
103
104 private final CharSequence phonemeText;
105 private final Languages.LanguageSet languages;
106
107 public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
108 this.phonemeText = phonemeText;
109 this.languages = languages;
110 }
111
112 public Phoneme append(final CharSequence str) {
113 return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114 }
115
116 public Languages.LanguageSet getLanguages() {
117 return this.languages;
118 }
119
120 @Override
121 public Iterable<Phoneme> getPhonemes() {
122 return Collections.singleton(this);
123 }
124
125 public CharSequence getPhonemeText() {
126 return this.phonemeText;
127 }
128
129 public Phoneme join(final Phoneme right) {
130 return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131 this.languages.restrictTo(right.languages));
132 }
133 }
134
135 public interface PhonemeExpr {
136 Iterable<Phoneme> getPhonemes();
137 }
138
139 public static final class PhonemeList implements PhonemeExpr {
140 private final List<Phoneme> phonemes;
141
142 public PhonemeList(final List<Phoneme> phonemes) {
143 this.phonemes = phonemes;
144 }
145
146 @Override
147 public List<Phoneme> getPhonemes() {
148 return this.phonemes;
149 }
150 }
151
152 /**
153 * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154 */
155 public static interface RPattern {
156 boolean isMatch(CharSequence input);
157 }
158
159 public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160 @Override
161 public boolean isMatch(final CharSequence input) {
162 return true;
163 }
164 };
165
166 public static final String ALL = "ALL";
167
168 private static final String DOUBLE_QUOTE = "\"";
169
170 private static final String HASH_INCLUDE = "#include";
171
172 private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173 new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174
175 static {
176 for (final NameType s : NameType.values()) {
177 final Map<RuleType, Map<String, List<Rule>>> rts =
178 new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
179
180 for (final RuleType rt : RuleType.values()) {
181 final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
182
183 final Languages ls = Languages.getInstance(s);
184 for (final String l : ls.getLanguages()) {
185 try {
186 rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
187 } catch (final IllegalStateException e) {
188 throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
189 }
190 }
191 if (!rt.equals(RuleType.RULES)) {
192 rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
193 }
194
195 rts.put(rt, Collections.unmodifiableMap(rs));
196 }
197
198 RULES.put(s, Collections.unmodifiableMap(rts));
199 }
200 }
201
202 private static boolean contains(final CharSequence chars, final char input) {
203 for (int i = 0; i < chars.length(); i++) {
204 if (chars.charAt(i) == input) {
205 return true;
206 }
207 }
208 return false;
209 }
210
211 private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
212 return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
213 nameType.getName(), rt.getName(), lang);
214 }
215
216 private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
217 final String resName = createResourceName(nameType, rt, lang);
218 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
219
220 if (rulesIS == null) {
221 throw new IllegalArgumentException("Unable to load resource: " + resName);
222 }
223
224 return new Scanner(rulesIS, ResourceConstants.ENCODING);
225 }
226
227 private static Scanner createScanner(final String lang) {
228 final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
229 final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
230
231 if (rulesIS == null) {
232 throw new IllegalArgumentException("Unable to load resource: " + resName);
233 }
234
235 return new Scanner(rulesIS, ResourceConstants.ENCODING);
236 }
237
238 private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
239 if (suffix.length() > input.length()) {
240 return false;
241 }
242 for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
243 if (input.charAt(i) != suffix.charAt(j)) {
244 return false;
245 }
246 }
247 return true;
248 }
249
250 /**
251 * Gets rules for a combination of name type, rule type and languages.
252 *
253 * @param nameType
254 * the NameType to consider
255 * @param rt
256 * the RuleType to consider
257 * @param langs
258 * the set of languages to consider
259 * @return a list of Rules that apply
260 */
261 public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
262 final Languages.LanguageSet langs) {
263 return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
264 getInstance(nameType, rt, Languages.ANY);
265 }
266
267 /**
268 * Gets rules for a combination of name type, rule type and a single language.
269 *
270 * @param nameType
271 * the NameType to consider
272 * @param rt
273 * the RuleType to consider
274 * @param lang
275 * the language to consider
276 * @return a list rules for a combination of name type, rule type and a single language.
277 */
278 public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
279 final List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
280
281 if (rules == null) {
282 throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
283 nameType.getName(), rt.getName(), lang));
284 }
285
286 return rules;
287 }
288
289 private static Phoneme parsePhoneme(final String ph) {
290 final int open = ph.indexOf("[");
291 if (open >= 0) {
292 if (!ph.endsWith("]")) {
293 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
294 }
295 final String before = ph.substring(0, open);
296 final String in = ph.substring(open + 1, ph.length() - 1);
297 final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
298
299 return new Phoneme(before, Languages.LanguageSet.from(langs));
300 } else {
301 return new Phoneme(ph, Languages.ANY_LANGUAGE);
302 }
303 }
304
305 private static PhonemeExpr parsePhonemeExpr(final String ph) {
306 if (ph.startsWith("(")) { // we have a bracketed list of options
307 if (!ph.endsWith(")")) {
308 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
309 }
310
311 final List<Phoneme> phs = new ArrayList<Phoneme>();
312 final String body = ph.substring(1, ph.length() - 1);
313 for (final String part : body.split("[|]")) {
314 phs.add(parsePhoneme(part));
315 }
316 if (body.startsWith("|") || body.endsWith("|")) {
317 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
318 }
319
320 return new PhonemeList(phs);
321 } else {
322 return parsePhoneme(ph);
323 }
324 }
325
326 private static List<Rule> parseRules(final Scanner scanner, final String location) {
327 final List<Rule> lines = new ArrayList<Rule>();
328 int currentLine = 0;
329
330 boolean inMultilineComment = false;
331 while (scanner.hasNextLine()) {
332 currentLine++;
333 final String rawLine = scanner.nextLine();
334 String line = rawLine;
335
336 if (inMultilineComment) {
337 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
338 inMultilineComment = false;
339 }
340 } else {
341 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
342 inMultilineComment = true;
343 } else {
344 // discard comments
345 final int cmtI = line.indexOf(ResourceConstants.CMT);
346 if (cmtI >= 0) {
347 line = line.substring(0, cmtI);
348 }
349
350 // trim leading-trailing whitespace
351 line = line.trim();
352
353 if (line.length() == 0) {
354 continue; // empty lines can be safely skipped
355 }
356
357 if (line.startsWith(HASH_INCLUDE)) {
358 // include statement
359 final String incl = line.substring(HASH_INCLUDE.length()).trim();
360 if (incl.contains(" ")) {
361 throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
362 location);
363 } else {
364 lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
365 }
366 } else {
367 // rule
368 final String[] parts = line.split("\\s+");
369 if (parts.length != 4) {
370 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
371 " parts: " + rawLine + " in " + location);
372 } else {
373 try {
374 final String pat = stripQuotes(parts[0]);
375 final String lCon = stripQuotes(parts[1]);
376 final String rCon = stripQuotes(parts[2]);
377 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
378 final int cLine = currentLine;
379 final Rule r = new Rule(pat, lCon, rCon, ph) {
380 private final int myLine = cLine;
381 private final String loc = location;
382
383 @Override
384 public String toString() {
385 final StringBuilder sb = new StringBuilder();
386 sb.append("Rule");
387 sb.append("{line=").append(myLine);
388 sb.append(", loc='").append(loc).append('\'');
389 sb.append('}');
390 return sb.toString();
391 }
392 };
393 lines.add(r);
394 } catch (final IllegalArgumentException e) {
395 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
396 location, e);
397 }
398 }
399 }
400 }
401 }
402 }
403
404 return lines;
405 }
406
407 /**
408 * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
409 *
410 * @param regex
411 * the regular expression to compile
412 * @return an RPattern that will match this regex
413 */
414 private static RPattern pattern(final String regex) {
415 final boolean startsWith = regex.startsWith("^");
416 final boolean endsWith = regex.endsWith("$");
417 final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
418 final boolean boxes = content.contains("[");
419
420 if (!boxes) {
421 if (startsWith && endsWith) {
422 // exact match
423 if (content.length() == 0) {
424 // empty
425 return new RPattern() {
426 @Override
427 public boolean isMatch(final CharSequence input) {
428 return input.length() == 0;
429 }
430 };
431 } else {
432 return new RPattern() {
433 @Override
434 public boolean isMatch(final CharSequence input) {
435 return input.equals(content);
436 }
437 };
438 }
439 } else if ((startsWith || endsWith) && content.length() == 0) {
440 // matches every string
441 return ALL_STRINGS_RMATCHER;
442 } else if (startsWith) {
443 // matches from start
444 return new RPattern() {
445 @Override
446 public boolean isMatch(final CharSequence input) {
447 return startsWith(input, content);
448 }
449 };
450 } else if (endsWith) {
451 // matches from start
452 return new RPattern() {
453 @Override
454 public boolean isMatch(final CharSequence input) {
455 return endsWith(input, content);
456 }
457 };
458 }
459 } else {
460 final boolean startsWithBox = content.startsWith("[");
461 final boolean endsWithBox = content.endsWith("]");
462
463 if (startsWithBox && endsWithBox) {
464 String boxContent = content.substring(1, content.length() - 1);
465 if (!boxContent.contains("[")) {
466 // box containing alternatives
467 final boolean negate = boxContent.startsWith("^");
468 if (negate) {
469 boxContent = boxContent.substring(1);
470 }
471 final String bContent = boxContent;
472 final boolean shouldMatch = !negate;
473
474 if (startsWith && endsWith) {
475 // exact match
476 return new RPattern() {
477 @Override
478 public boolean isMatch(final CharSequence input) {
479 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
480 }
481 };
482 } else if (startsWith) {
483 // first char
484 return new RPattern() {
485 @Override
486 public boolean isMatch(final CharSequence input) {
487 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
488 }
489 };
490 } else if (endsWith) {
491 // last char
492 return new RPattern() {
493 @Override
494 public boolean isMatch(final CharSequence input) {
495 return input.length() > 0 &&
496 contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
497 }
498 };
499 }
500 }
501 }
502 }
503
504 return new RPattern() {
505 Pattern pattern = Pattern.compile(regex);
506
507 @Override
508 public boolean isMatch(final CharSequence input) {
509 final Matcher matcher = pattern.matcher(input);
510 return matcher.find();
511 }
512 };
513 }
514
515 private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
516 if (prefix.length() > input.length()) {
517 return false;
518 }
519 for (int i = 0; i < prefix.length(); i++) {
520 if (input.charAt(i) != prefix.charAt(i)) {
521 return false;
522 }
523 }
524 return true;
525 }
526
527 private static String stripQuotes(String str) {
528 if (str.startsWith(DOUBLE_QUOTE)) {
529 str = str.substring(1);
530 }
531
532 if (str.endsWith(DOUBLE_QUOTE)) {
533 str = str.substring(0, str.length() - 1);
534 }
535
536 return str;
537 }
538
539 private final RPattern lContext;
540
541 private final String pattern;
542
543 private final PhonemeExpr phoneme;
544
545 private final RPattern rContext;
546
547 /**
548 * Creates a new rule.
549 *
550 * @param pattern
551 * the pattern
552 * @param lContext
553 * the left context
554 * @param rContext
555 * the right context
556 * @param phoneme
557 * the resulting phoneme
558 */
559 public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
560 this.pattern = pattern;
561 this.lContext = pattern(lContext + "$");
562 this.rContext = pattern("^" + rContext);
563 this.phoneme = phoneme;
564 }
565
566 /**
567 * Gets the left context. This is a regular expression that must match to the left of the pattern.
568 *
569 * @return the left context Pattern
570 */
571 public RPattern getLContext() {
572 return this.lContext;
573 }
574
575 /**
576 * Gets the pattern. This is a string-literal that must exactly match.
577 *
578 * @return the pattern
579 */
580 public String getPattern() {
581 return this.pattern;
582 }
583
584 /**
585 * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
586 *
587 * @return the phoneme
588 */
589 public PhonemeExpr getPhoneme() {
590 return this.phoneme;
591 }
592
593 /**
594 * Gets the right context. This is a regular expression that must match to the right of the pattern.
595 *
596 * @return the right context Pattern
597 */
598 public RPattern getRContext() {
599 return this.rContext;
600 }
601
602 /**
603 * Decides if the pattern and context match the input starting at a position. It is a match if the
604 * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
605 * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
606 *
607 * @param input
608 * the input String
609 * @param i
610 * the int position within the input
611 * @return true if the pattern and left/right context match, false otherwise
612 */
613 public boolean patternAndContextMatches(final CharSequence input, final int i) {
614 if (i < 0) {
615 throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
616 }
617
618 final int patternLength = this.pattern.length();
619 final int ipl = i + patternLength;
620
621 if (ipl > input.length()) {
622 // not enough room for the pattern to match
623 return false;
624 }
625
626 // evaluate the pattern, left context and right context
627 // fail early if any of the evaluations is not successful
628 if (!input.subSequence(i, ipl).equals(this.pattern)) {
629 return false;
630 } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
631 return false;
632 }
633 return this.lContext.isMatch(input.subSequence(0, i));
634 }
635 }