View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import java.io.InputStream;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collections;
24  import java.util.EnumMap;
25  import java.util.HashSet;
26  import java.util.List;
27  import java.util.Locale;
28  import java.util.Map;
29  import java.util.Scanner;
30  import java.util.Set;
31  import java.util.regex.Pattern;
32  
33  /**
34   * Language guessing utility.
35   * <p>
36   * This class encapsulates rules used to guess the possible languages that a word originates from. This is
37   * done by reference to a whole series of rules distributed in resource files.
38   * <p>
39   * Instances of this class are typically managed through the static factory method instance().
40   * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
41   * <p>
42   * This class is intended to be immutable and thread-safe.
43   * <p>
44   * <b>Lang resources</b>
45   * <p>
46   * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
47   * They are systematically named following the pattern:
48   * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
49   * The format of these resources is the following:
50   * <ul>
51   * <li><b>Rules:</b> whitespace separated strings.
52   * There should be 3 columns to each row, and these will be interpreted as:
53   * <ol>
54   * <li>pattern: a regular expression.</li>
55   * <li>languages: a '+'-separated list of languages.</li>
56   * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
57   * </ol>
58   * </li>
59   * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
60   * discarded as a comment.</li>
61   * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
62   * This will skip all content until a line ending in '*' and '/' is found.</li>
63   * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
64   * </ul>
65   * <p>
66   * Port of lang.php
67   *
68   * @since 1.6
69   * @version $Id: Lang.html 891688 2013-12-24 20:49:46Z ggregory $
70   */
71  public class Lang {
72      // Implementation note: This class is divided into two sections. The first part is a static factory interface that
73      // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
74      // encapsulate a particular language-guessing rule table and the language guessing itself.
75      //
76      // It may make sense in the future to expose the private constructor to allow power users to build custom language-
77      // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
78      // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
79  
80      private static final class LangRule {
81          private final boolean acceptOnMatch;
82          private final Set<String> languages;
83          private final Pattern pattern;
84  
85          private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
86              this.pattern = pattern;
87              this.languages = languages;
88              this.acceptOnMatch = acceptOnMatch;
89          }
90  
91          public boolean matches(final String txt) {
92              return this.pattern.matcher(txt).find();
93          }
94      }
95  
96      private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
97  
98      private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
99  
100     static {
101         for (final NameType s : NameType.values()) {
102             Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
103         }
104     }
105 
106     /**
107      * Gets a Lang instance for one of the supported NameTypes.
108      *
109      * @param nameType
110      *            the NameType to look up
111      * @return a Lang encapsulating the language guessing rules for that name type
112      */
113     public static Lang instance(final NameType nameType) {
114         return Langs.get(nameType);
115     }
116 
117     /**
118      * Loads language rules from a resource.
119      * <p>
120      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121      * You will only need to call this yourself if you are developing custom language mapping rules.
122      *
123      * @param languageRulesResourceName
124      *            the fully-qualified resource name to load
125      * @param languages
126      *            the languages that these rules will support
127      * @return a Lang encapsulating the loaded language-guessing rules.
128      */
129     public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
130         final List<LangRule> rules = new ArrayList<LangRule>();
131         final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132 
133         if (lRulesIS == null) {
134             throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135         }
136 
137         final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
138         try {
139             boolean inExtendedComment = false;
140             while (scanner.hasNextLine()) {
141                 final String rawLine = scanner.nextLine();
142                 String line = rawLine;
143                 if (inExtendedComment) {
144                     // check for closing comment marker, otherwise discard doc comment line
145                     if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
146                         inExtendedComment = false;
147                     }
148                 } else {
149                     if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150                         inExtendedComment = true;
151                     } else {
152                         // discard comments
153                         final int cmtI = line.indexOf(ResourceConstants.CMT);
154                         if (cmtI >= 0) {
155                             line = line.substring(0, cmtI);
156                         }
157 
158                         // trim leading-trailing whitespace
159                         line = line.trim();
160 
161                         if (line.length() == 0) {
162                             continue; // empty lines can be safely skipped
163                         }
164 
165                         // split it up
166                         final String[] parts = line.split("\\s+");
167 
168                         if (parts.length != 3) {
169                             throw new IllegalArgumentException("Malformed line '" + rawLine +
170                                     "' in language resource '" + languageRulesResourceName + "'");
171                         }
172 
173                         final Pattern pattern = Pattern.compile(parts[0]);
174                         final String[] langs = parts[1].split("\\+");
175                         final boolean accept = parts[2].equals("true");
176 
177                         rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
178                     }
179                 }
180             }
181         } finally {
182             scanner.close();
183         }
184         return new Lang(rules, languages);
185     }
186 
187     private final Languages languages;
188     private final List<LangRule> rules;
189 
190     private Lang(final List<LangRule> rules, final Languages languages) {
191         this.rules = Collections.unmodifiableList(rules);
192         this.languages = languages;
193     }
194 
195     /**
196      * Guesses the language of a word.
197      *
198      * @param text
199      *            the word
200      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
201      */
202     public String guessLanguage(final String text) {
203         final Languages.LanguageSet ls = guessLanguages(text);
204         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
205     }
206 
207     /**
208      * Guesses the languages of a word.
209      *
210      * @param input
211      *            the word
212      * @return a Set of Strings of language names that are potential matches for the input word
213      */
214     public Languages.LanguageSet guessLanguages(final String input) {
215         final String text = input.toLowerCase(Locale.ENGLISH);
216 
217         final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
218         for (final LangRule rule : this.rules) {
219             if (rule.matches(text)) {
220                 if (rule.acceptOnMatch) {
221                     langs.retainAll(rule.languages);
222                 } else {
223                     langs.removeAll(rule.languages);
224                 }
225             }
226         }
227 
228         final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
229         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
230     }
231 }