/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.codec.Resources;

/**
 * Language guessing utility.
 * <p>
 * This class encapsulates rules used to guess the possible languages that a word originates from. This is
 * done by reference to a whole series of rules distributed in resource files.
 * </p>
 * <p>
 * Instances of this class are typically managed through the static factory method instance().
 * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
 * </p>
 * <p>
 * This class is intended to be immutable and thread-safe.
 * </p>
 * <h2>Lang resources</h2>
 * <p>
 * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
 * They are systematically named following the pattern:
 * </p>
 * <blockquote>org/apache/commons/codec/language/bm/${NameType}_lang.txt</blockquote>
 * <p>
 * where {@code ${NameType}} is the value of {@link NameType#getName()} for the name type the rules apply to.
 * </p>
 * <p>
 * The format of these resources is the following:
 * </p>
 * <ul>
 * <li><strong>Rules:</strong> whitespace separated strings.
 * There should be 3 columns to each row, and these will be interpreted as:
 * <ol>
 * <li>pattern: a regular expression.</li>
 * <li>languages: a '+'-separated list of languages.</li>
 * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
 * </ol>
 * </li>
 * <li><strong>End-of-line comments:</strong> Any occurrence of '//' will cause all text following on that line to be
 * discarded as a comment.</li>
 * <li><strong>Multi-line comments:</strong> Any line starting with '/*' will start multi-line commenting mode.
 * This will skip all content until a line ending in '*' and '/' is found.</li>
 * <li><strong>Blank lines:</strong> All blank lines will be skipped.</li>
 * </ul>
 * <p>
 * Port of lang.php
 * </p>
 *
 * @since 1.6
 */
public class Lang {
    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
    // encapsulate a particular language-guessing rule table and the language guessing itself.
    //
    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
    // should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.

    /**
     * A single row of a language-guessing rule table: a pattern, the languages it refers to, and whether a match
     * keeps only those languages or removes them.
     */
    private static final class LangRule {
        private final boolean acceptOnMatch;
        private final Set<String> languages;
        private final Pattern pattern;

        private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
            this.pattern = pattern;
            this.languages = languages;
            this.acceptOnMatch = acceptOnMatch;
        }

        /**
         * Tests whether this rule's pattern is found anywhere in the given text.
         *
         * @param txt
         *            the text to test
         * @return {@code true} if the pattern occurs in {@code txt}
         */
        public boolean matches(final String txt) {
            return pattern.matcher(txt).find();
        }
    }

    /** One Lang per NameType, populated once by the static initializer below. */
    private static final Map<NameType, Lang> LANGS = new EnumMap<>(NameType.class);

    /** Resource name template; the placeholder is filled with {@link NameType#getName()}. */
    private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";

    /** Splits the languages column of a rule on the literal '+' character. */
    private static final Pattern PLUS = Pattern.compile("\\+");

    static {
        for (final NameType s : NameType.values()) {
            LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
        }
    }

    /**
     * Gets a Lang instance for one of the supported NameTypes.
     *
     * @param nameType
     *            the NameType to look up
     * @return a Lang encapsulating the language guessing rules for that name type
     */
    public static Lang instance(final NameType nameType) {
        return LANGS.get(nameType);
    }

    /**
     * Loads language rules from a resource.
     * <p>
     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
     * You will only need to call this yourself if you are developing custom language mapping rules.
     * </p>
     *
     * @param languageRulesResourceName
     *            the fully-qualified resource name to load
     * @param languages
     *            the languages that these rules will support
     * @return a Lang encapsulating the loaded language-guessing rules.
     * @throws IllegalArgumentException
     *             if a rule line does not split into exactly three columns, or if its pattern column is not a
     *             valid regular expression
     */
    public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
        final List<LangRule> rules = new ArrayList<>();
        try (Scanner scanner = new Scanner(Resources.getInputStream(languageRulesResourceName),
                ResourceConstants.ENCODING)) {
            boolean inExtendedComment = false;
            while (scanner.hasNextLine()) {
                final String rawLine = scanner.nextLine();
                String line = rawLine;
                if (inExtendedComment) {
                    // check for closing comment marker, otherwise discard doc comment line
                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                        inExtendedComment = false;
                    }
                } else if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                    inExtendedComment = true;
                } else {
                    // discard comments
                    final int cmtI = line.indexOf(ResourceConstants.CMT);
                    if (cmtI >= 0) {
                        line = line.substring(0, cmtI);
                    }

                    // trim leading-trailing whitespace
                    line = line.trim();

                    if (line.isEmpty()) {
                        continue; // empty lines can be safely skipped
                    }

                    // split it up
                    final String[] parts = ResourceConstants.SPACES.split(line);

                    if (parts.length != 3) {
                        // report the untouched line so the offending resource row is easy to locate
                        throw new IllegalArgumentException("Malformed line '" + rawLine +
                                "' in language resource '" + languageRulesResourceName + "'");
                    }

                    final Pattern pattern = Pattern.compile(parts[0]);
                    final String[] langs = PLUS.split(parts[1]);
                    // only the exact string "true" accepts; any other value is treated as false
                    final boolean accept = parts[2].equals("true");

                    rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));
                }
            }
        }
        return new Lang(rules, languages);
    }

    private final Languages languages;

    private final List<LangRule> rules;

    private Lang(final List<LangRule> rules, final Languages languages) {
        this.rules = Collections.unmodifiableList(rules);
        this.languages = languages;
    }

    /**
     * Guesses the language of a word.
     *
     * @param text
     *            the word
     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
     */
    public String guessLanguage(final String text) {
        final Languages.LanguageSet ls = guessLanguages(text);
        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
    }

    /**
     * Guesses the languages of a word.
     * <p>
     * The word is lower-cased and every rule is applied in order to a candidate set that starts out as all
     * supported languages: a matching accept rule keeps only that rule's languages, a matching reject rule
     * removes them.
     * </p>
     *
     * @param input
     *            the word
     * @return a LanguageSet of the language names that are potential matches for the input word, or
     *         {@link Languages#ANY_LANGUAGE} if the rules eliminate every candidate
     */
    public Languages.LanguageSet guessLanguages(final String input) {
        final String text = input.toLowerCase(Locale.ENGLISH);
        final Set<String> langs = new HashSet<>(languages.getLanguages());
        rules.forEach(rule -> {
            if (rule.matches(text)) {
                if (rule.acceptOnMatch) {
                    langs.retainAll(rule.languages);
                } else {
                    langs.removeAll(rule.languages);
                }
            }
        });
        final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
    }
}