Lang.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language.bm;

  18. import java.io.InputStream;
  19. import java.util.ArrayList;
  20. import java.util.Arrays;
  21. import java.util.Collections;
  22. import java.util.EnumMap;
  23. import java.util.HashSet;
  24. import java.util.List;
  25. import java.util.Locale;
  26. import java.util.Map;
  27. import java.util.Scanner;
  28. import java.util.Set;
  29. import java.util.regex.Pattern;

  30. /**
  31.  * Language guessing utility.
  32.  * <p>
  33.  * This class encapsulates rules used to guess the possible languages that a word originates from. This is
  34.  * done by reference to a whole series of rules distributed in resource files.
  35.  * <p>
  36.  * Instances of this class are typically managed through the static factory method instance().
  37.  * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
  38.  * <p>
  39.  * This class is intended to be immutable and thread-safe.
  40.  * <p>
  41.  * <b>Lang resources</b>
  42.  * <p>
  43.  * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
  44.  * They are systematically named, one per name type, following the pattern:
  45.  * <blockquote>org/apache/commons/codec/language/bm/${nameType}_lang.txt</blockquote>
  46.  * The format of these resources is the following:
  47.  * <ul>
  48.  * <li><b>Rules:</b> whitespace separated strings.
  49.  * There should be 3 columns to each row, and these will be interpreted as:
  50.  * <ol>
  51.  * <li>pattern: a regular expression.</li>
  52.  * <li>languages: a '+'-separated list of languages.</li>
  53.  * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
  54.  * </ol>
  55.  * </li>
  56.  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
  57.  * discarded as a comment.</li>
  58.  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
  59.  * This will skip all content until a line ending in '*' and '/' is found.</li>
  60.  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  61.  * </ul>
  62.  * <p>
  63.  * Port of lang.php
  64.  *
  65.  * @since 1.6
  66.  * @version $Id: Lang.java 1608115 2014-07-05 19:58:38Z tn $
  67.  */
  68. public class Lang {
  69.     // Implementation note: This class is divided into two sections. The first part is a static factory interface that
  70.     // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
  71.     // encapsulate a particular language-guessing rule table and the language guessing itself.
  72.     //
  73.     // It may make sense in the future to expose the private constructor to allow power users to build custom language-
  74.     // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
  75.     // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.

  76.     private static final class LangRule {
  77.         private final boolean acceptOnMatch;
  78.         private final Set<String> languages;
  79.         private final Pattern pattern;

  80.         private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
  81.             this.pattern = pattern;
  82.             this.languages = languages;
  83.             this.acceptOnMatch = acceptOnMatch;
  84.         }

  85.         public boolean matches(final String txt) {
  86.             return this.pattern.matcher(txt).find();
  87.         }
  88.     }

  89.     private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);

  90.     private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";

  91.     static {
  92.         for (final NameType s : NameType.values()) {
  93.             Langs.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
  94.         }
  95.     }

  96.     /**
  97.      * Gets a Lang instance for one of the supported NameTypes.
  98.      *
  99.      * @param nameType
  100.      *            the NameType to look up
  101.      * @return a Lang encapsulating the language guessing rules for that name type
  102.      */
  103.     public static Lang instance(final NameType nameType) {
  104.         return Langs.get(nameType);
  105.     }

  106.     /**
  107.      * Loads language rules from a resource.
  108.      * <p>
  109.      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
  110.      * You will only need to call this yourself if you are developing custom language mapping rules.
  111.      *
  112.      * @param languageRulesResourceName
  113.      *            the fully-qualified resource name to load
  114.      * @param languages
  115.      *            the languages that these rules will support
  116.      * @return a Lang encapsulating the loaded language-guessing rules.
  117.      */
  118.     public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
  119.         final List<LangRule> rules = new ArrayList<LangRule>();
  120.         final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);

  121.         if (lRulesIS == null) {
  122.             throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
  123.         }

  124.         final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
  125.         try {
  126.             boolean inExtendedComment = false;
  127.             while (scanner.hasNextLine()) {
  128.                 final String rawLine = scanner.nextLine();
  129.                 String line = rawLine;
  130.                 if (inExtendedComment) {
  131.                     // check for closing comment marker, otherwise discard doc comment line
  132.                     if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
  133.                         inExtendedComment = false;
  134.                     }
  135.                 } else {
  136.                     if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
  137.                         inExtendedComment = true;
  138.                     } else {
  139.                         // discard comments
  140.                         final int cmtI = line.indexOf(ResourceConstants.CMT);
  141.                         if (cmtI >= 0) {
  142.                             line = line.substring(0, cmtI);
  143.                         }

  144.                         // trim leading-trailing whitespace
  145.                         line = line.trim();

  146.                         if (line.length() == 0) {
  147.                             continue; // empty lines can be safely skipped
  148.                         }

  149.                         // split it up
  150.                         final String[] parts = line.split("\\s+");

  151.                         if (parts.length != 3) {
  152.                             throw new IllegalArgumentException("Malformed line '" + rawLine +
  153.                                     "' in language resource '" + languageRulesResourceName + "'");
  154.                         }

  155.                         final Pattern pattern = Pattern.compile(parts[0]);
  156.                         final String[] langs = parts[1].split("\\+");
  157.                         final boolean accept = parts[2].equals("true");

  158.                         rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
  159.                     }
  160.                 }
  161.             }
  162.         } finally {
  163.             scanner.close();
  164.         }
  165.         return new Lang(rules, languages);
  166.     }

  167.     private final Languages languages;
  168.     private final List<LangRule> rules;

  169.     private Lang(final List<LangRule> rules, final Languages languages) {
  170.         this.rules = Collections.unmodifiableList(rules);
  171.         this.languages = languages;
  172.     }

  173.     /**
  174.      * Guesses the language of a word.
  175.      *
  176.      * @param text
  177.      *            the word
  178.      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
  179.      */
  180.     public String guessLanguage(final String text) {
  181.         final Languages.LanguageSet ls = guessLanguages(text);
  182.         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
  183.     }

  184.     /**
  185.      * Guesses the languages of a word.
  186.      *
  187.      * @param input
  188.      *            the word
  189.      * @return a Set of Strings of language names that are potential matches for the input word
  190.      */
  191.     public Languages.LanguageSet guessLanguages(final String input) {
  192.         final String text = input.toLowerCase(Locale.ENGLISH);

  193.         final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
  194.         for (final LangRule rule : this.rules) {
  195.             if (rule.matches(text)) {
  196.                 if (rule.acceptOnMatch) {
  197.                     langs.retainAll(rule.languages);
  198.                 } else {
  199.                     langs.removeAll(rule.languages);
  200.                 }
  201.             }
  202.         }

  203.         final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
  204.         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
  205.     }
  206. }