Lang.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language.bm;

  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.EnumMap;
  22. import java.util.HashSet;
  23. import java.util.List;
  24. import java.util.Locale;
  25. import java.util.Map;
  26. import java.util.Scanner;
  27. import java.util.Set;
  28. import java.util.regex.Pattern;

  29. import org.apache.commons.codec.Resources;

  30. /**
  31.  * Language guessing utility.
  32.  * <p>
  33.  * This class encapsulates rules used to guess the possible languages that a word originates from. This is
  34.  * done by reference to a whole series of rules distributed in resource files.
  35.  * </p>
  36.  * <p>
  37.  * Instances of this class are typically managed through the static factory method instance().
  38.  * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
  39.  * </p>
  40.  * <p>
  41.  * This class is intended to be immutable and thread-safe.
  42.  * </p>
  43.  * <h2>Lang resources</h2>
  44.  * <p>
  45.  * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
  46.  * They are systematically named following the pattern:
  47.  * </p>
  48.  * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
  49.  * <p>
  50.  * The format of these resources is the following:
  51.  * </p>
  52.  * <ul>
  53.  * <li><strong>Rules:</strong> whitespace separated strings.
  54.  * There should be 3 columns to each row, and these will be interpreted as:
  55.  * <ol>
  56.  * <li>pattern: a regular expression.</li>
  57.  * <li>languages: a '+'-separated list of languages.</li>
  58.  * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
  59.  * </ol>
  60.  * </li>
  61.  * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
  62.  * discarded as a comment.</li>
  63.  * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
  64.  * This will skip all content until a line ending in '*' and '/' is found.</li>
  65.  * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
  66.  * </ul>
  67.  * <p>
  68.  * Port of lang.php
  69.  * </p>
  70.  *
  71.  * @since 1.6
  72.  */
  73. public class Lang {
  74.     // Implementation note: This class is divided into two sections. The first part is a static factory interface that
  75.     // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
  76.     // encapsulate a particular language-guessing rule table and the language guessing itself.
  77.     //
  78.     // It may make sense in the future to expose the private constructor to allow power users to build custom language-
  79.     // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
  80.     // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.

  81.     private static final class LangRule {
  82.         private final boolean acceptOnMatch;
  83.         private final Set<String> languages;
  84.         private final Pattern pattern;

  85.         private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
  86.             this.pattern = pattern;
  87.             this.languages = languages;
  88.             this.acceptOnMatch = acceptOnMatch;
  89.         }

  90.         public boolean matches(final String txt) {
  91.             return this.pattern.matcher(txt).find();
  92.         }
  93.     }

  94.     private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);

  95.     private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";

  96.     static {
  97.         for (final NameType s : NameType.values()) {
  98.             LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
  99.         }
  100.     }

  101.     /**
  102.      * Gets a Lang instance for one of the supported NameTypes.
  103.      *
  104.      * @param nameType
  105.      *            the NameType to look up
  106.      * @return a Lang encapsulating the language guessing rules for that name type
  107.      */
  108.     public static Lang instance(final NameType nameType) {
  109.         return LANGS.get(nameType);
  110.     }

  111.     /**
  112.      * Loads language rules from a resource.
  113.      * <p>
  114.      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
  115.      * You will only need to call this yourself if you are developing custom language mapping rules.
  116.      * </p>
  117.      *
  118.      * @param languageRulesResourceName
  119.      *            the fully-qualified resource name to load
  120.      * @param languages
  121.      *            the languages that these rules will support
  122.      * @return a Lang encapsulating the loaded language-guessing rules.
  123.      */
  124.     public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
  125.         final List<LangRule> rules = new ArrayList<>();
  126.         try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
  127.                 ResourceConstants.ENCODING)) {
  128.             boolean inExtendedComment = false;
  129.             while (scanner.hasNextLine()) {
  130.                 final String rawLine = scanner.nextLine();
  131.                 String line = rawLine;
  132.                 if (inExtendedComment) {
  133.                     // check for closing comment marker, otherwise discard doc comment line
  134.                     if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
  135.                         inExtendedComment = false;
  136.                     }
  137.                 } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
  138.                     inExtendedComment = true;
  139.                 } else {
  140.                     // discard comments
  141.                     final int cmtI = line.indexOf(ResourceConstants.CMT);
  142.                     if (cmtI >= 0) {
  143.                         line = line.substring(0, cmtI);
  144.                     }

  145.                     // trim leading-trailing whitespace
  146.                     line = line.trim();

  147.                     if (line.isEmpty()) {
  148.                         continue; // empty lines can be safely skipped
  149.                     }

  150.                     // split it up
  151.                     final String[] parts = line.split("\\s+");

  152.                     if (parts.length != 3) {
  153.                         throw new IllegalArgumentException("Malformed line '" + rawLine +
  154.                                 "' in language resource '" + languageRulesResourceName + "'");
  155.                     }

  156.                     final Pattern pattern = Pattern.compile(parts[0]);
  157.                     final String[] langs = parts[1].split("\\+");
  158.                     final boolean accept = parts[2].equals("true");

  159.                     rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
  160.                 }
  161.             }
  162.         }
  163.         return new Lang(rules, languages);
  164.     }

  165.     private final Languages languages;

  166.     private final List<LangRule> rules;

  167.     private Lang(final List<LangRule> rules, final Languages languages) {
  168.         this.rules = Collections.unmodifiableList(rules);
  169.         this.languages = languages;
  170.     }

  171.     /**
  172.      * Guesses the language of a word.
  173.      *
  174.      * @param text
  175.      *            the word
  176.      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
  177.      */
  178.     public String guessLanguage(final String text) {
  179.         final Languages.LanguageSet ls = guessLanguages(text);
  180.         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
  181.     }

  182.     /**
  183.      * Guesses the languages of a word.
  184.      *
  185.      * @param input
  186.      *            the word
  187.      * @return a Set of Strings of language names that are potential matches for the input word
  188.      */
  189.     public Languages.LanguageSet guessLanguages(final String input) {
  190.         final String text = input.toLowerCase(Locale.ENGLISH);
  191.         final Set<String> langs = new HashSet<>(this.languages.getLanguages());
  192.         rules.forEach(rule -> {
  193.             if (rule.matches(text)) {
  194.                 if (rule.acceptOnMatch) {
  195.                     langs.retainAll(rule.languages);
  196.                 } else {
  197.                     langs.removeAll(rule.languages);
  198.                 }
  199.             }
  200.         });
  201.         final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
  202.         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
  203.     }
  204. }