/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Language guessing utility.
 * <p>
 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
 * done by reference to a whole series of rules distributed in resource files.
 * <p>
 * Instances of this class are typically managed through the static factory method instance().
 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
 * <p>
 * This class is intended to be immutable and thread-safe.
 * <p>
 * <b>Lang resources</b>
 * <p>
 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
 * They are systematically named following the pattern:
 * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
 * The format of these resources is the following:
 * <ul>
 * <li><b>Rules:</b> whitespace separated strings.
 * There should be 3 columns to each row, and these will be interpreted as:
 * <ol>
 * <li>pattern: a regular expression.</li>
 * <li>languages: a '+'-separated list of languages.</li>
 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
 * </ol>
 * </li>
 * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
 * discarded as a comment.</li>
 * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
 * This will skip all content until a line ending in '*' and '/' is found.</li>
 * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
 * </ul>
 * <p>
 * Port of lang.php
 *
 * @since 1.6
 * @version $Id: Lang.html 889935 2013-12-11 05:05:13Z ggregory $
 */
public class Lang {
    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
    // encapsulate a particular language-guessing rule table and the language guessing itself.
    //
    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
    // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.

    /**
     * A single language-guessing rule: a regular expression, the languages it refers to, and whether a match
     * keeps only those languages ({@code acceptOnMatch == true}) or eliminates them ({@code false}).
     */
    private static final class LangRule {
        private final boolean acceptOnMatch;
        private final Set<String> languages;
        private final Pattern pattern;

        private LangRule(Pattern pattern, Set<String> languages, boolean acceptOnMatch) {
            this.pattern = pattern;
            this.languages = languages;
            this.acceptOnMatch = acceptOnMatch;
        }

        /**
         * Tests whether this rule's pattern occurs anywhere in the given text.
         *
         * @param txt
         *            the text to test
         * @return {@code true} if the pattern is found in {@code txt}
         */
        public boolean matches(String txt) {
            // find(), not matches(): the pattern only has to occur somewhere in the text
            return this.pattern.matcher(txt).find();
        }
    }

    /** One Lang per NameType, populated once by the static initializer below. */
    private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);

    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";

    static {
        // Every name type shares the same rule resource but is paired with its own Languages instance.
        for (NameType s : NameType.values()) {
            Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
        }
    }

    /**
     * Gets a Lang instance for one of the supported NameTypes.
     *
     * @param nameType
     *            the NameType to look up
     * @return a Lang encapsulating the language guessing rules for that name type
     */
    public static Lang instance(NameType nameType) {
        return Langs.get(nameType);
    }

    /**
     * Loads language rules from a resource.
     * <p>
     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
     * You will only need to call this yourself if you are developing custom language mapping rules.
     * <p>
     * The resource is read line by line; the underlying stream is closed before this method returns,
     * whether it completes normally or throws.
     *
     * @param languageRulesResourceName
     *            the fully-qualified resource name to load
     * @param languages
     *            the languages that these rules will support
     * @return a Lang encapsulating the loaded language-guessing rules.
     * @throws IllegalStateException
     *             if the named resource cannot be found by this class's class loader
     * @throws IllegalArgumentException
     *             if a rule line does not consist of exactly three whitespace-separated columns, or if its
     *             pattern column is not a valid regular expression
     */
    public static Lang loadFromResource(String languageRulesResourceName, Languages languages) {
        List<LangRule> rules = new ArrayList<LangRule>();
        InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);

        if (lRulesIS == null) {
            // Report the resource that was actually requested, not the built-in default.
            throw new IllegalStateException("Unable to resolve required resource:" + languageRulesResourceName);
        }

        Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
        try {
            boolean inExtendedComment = false;
            while (scanner.hasNextLine()) {
                String rawLine = scanner.nextLine();
                String line = rawLine;

                if (inExtendedComment) {
                    // check for closing comment marker, otherwise discard doc comment line
                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                        inExtendedComment = false;
                    }
                    continue;
                }

                if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                    // The opening line itself is never checked for the end marker; the comment
                    // can only be closed by a subsequent line.
                    inExtendedComment = true;
                    continue;
                }

                // discard comments
                int cmtI = line.indexOf(ResourceConstants.CMT);
                if (cmtI >= 0) {
                    line = line.substring(0, cmtI);
                }

                // trim leading-trailing whitespace
                line = line.trim();

                if (line.length() == 0) {
                    continue; // empty lines can be safely skipped
                }

                // split it up
                String[] parts = line.split("\\s+");

                if (parts.length != 3) {
                    throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
                                                       languageRulesResourceName + "'");
                }

                Pattern pattern = Pattern.compile(parts[0]);
                String[] langs = parts[1].split("\\+");
                // Anything other than the exact string "true" is treated as false.
                boolean accept = parts[2].equals("true");

                rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
            }
        } finally {
            // Closing the Scanner also closes the underlying resource stream.
            scanner.close();
        }

        return new Lang(rules, languages);
    }

    private final Languages languages;
    private final List<LangRule> rules;

    private Lang(List<LangRule> rules, Languages languages) {
        this.rules = Collections.unmodifiableList(rules);
        this.languages = languages;
    }

    /**
     * Guesses the language of a word.
     *
     * @param text
     *            the word
     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
     */
    public String guessLanguage(String text) {
        Languages.LanguageSet ls = guessLanguages(text);
        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
    }

    /**
     * Guesses the languages of a word.
     * <p>
     * The input is lower-cased using {@link Locale#ENGLISH}. Starting from every language known to this
     * instance, each rule whose pattern is found in the text either restricts the candidates to the rule's
     * languages (accept rules) or removes the rule's languages from the candidates (reject rules). If no
     * candidate survives, {@link Languages#ANY_LANGUAGE} is returned instead of an empty set.
     *
     * @param input
     *            the word
     * @return a Set of Strings of language names that are potential matches for the input word
     */
    public Languages.LanguageSet guessLanguages(String input) {
        String text = input.toLowerCase(Locale.ENGLISH);

        Set<String> langs = new HashSet<String>(this.languages.getLanguages());
        for (LangRule rule : this.rules) {
            if (rule.matches(text)) {
                if (rule.acceptOnMatch) {
                    langs.retainAll(rule.languages);
                } else {
                    langs.removeAll(rule.languages);
                }
            }
        }

        Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
    }
}