| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Lang |
|
| 3.142857142857143;3.143 | ||||
| Lang$1 |
|
| 3.142857142857143;3.143 | ||||
| Lang$LangRule |
|
| 3.142857142857143;3.143 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.commons.codec.language.bm; | |
| 19 | ||
| 20 | import java.io.InputStream; | |
| 21 | import java.util.ArrayList; | |
| 22 | import java.util.Arrays; | |
| 23 | import java.util.Collections; | |
| 24 | import java.util.EnumMap; | |
| 25 | import java.util.HashSet; | |
| 26 | import java.util.List; | |
| 27 | import java.util.Locale; | |
| 28 | import java.util.Map; | |
| 29 | import java.util.Scanner; | |
| 30 | import java.util.Set; | |
| 31 | import java.util.regex.Pattern; | |
| 32 | ||
| 33 | /** | |
| 34 | * Language guessing utility. | |
| 35 | * <p> | |
| 36 | * This class encapsulates rules used to guess the possible languages that a word originates from. This is | |
| 37 | * done by reference to a whole series of rules distributed in resource files. | |
| 38 | * <p> | |
| 39 | * Instances of this class are typically managed through the static factory method instance(). | |
| 40 | * Unless you are developing your own language guessing rules, you will not need to interact with this class directly. | |
| 41 | * <p> | |
| 42 | * This class is intended to be immutable and thread-safe. | |
| 43 | * <p> | |
| 44 | * <b>Lang resources</b> | |
| 45 | * <p> | |
| 46 | * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files. | |
| 47 | * They are systematically named following the pattern: | |
| 48 | * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote> | |
| 49 | * The format of these resources is the following: | |
| 50 | * <ul> | |
| 51 | * <li><b>Rules:</b> whitespace separated strings. | |
| 52 | * There should be 3 columns to each row, and these will be interpreted as: | |
| 53 | * <ol> | |
| 54 | * <li>pattern: a regular expression.</li> | |
| 55 | * <li>languages: a '+'-separated list of languages.</li> | |
| 56 | * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li> | |
| 57 | * </ol> | |
| 58 | * </li> | |
| 59 | * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be | |
| 60 | * discarded as a comment.</li> | |
| 61 | * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. | |
| 62 | * This will skip all content until a line ending in '*' and '/' is found.</li> | |
| 63 | * <li><b>Blank lines:</b> All blank lines will be skipped.</li> | |
| 64 | * </ul> | |
| 65 | * <p> | |
| 66 | * Port of lang.php | |
| 67 | * | |
| 68 | * @since 1.6 | |
| 69 | * @version $Id: Lang.java 1429868 2013-01-07 16:08:05Z ggregory $ | |
| 70 | */ | |
| 71 | public class Lang { | |
| 72 | // Implementation note: This class is divided into two sections. The first part is a static factory interface that | |
| 73 | // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that | |
| 74 | // encapsulate a particular language-guessing rule table and the language guessing itself. | |
| 75 | // | |
| 76 | // It may make sense in the future to expose the private constructor to allow power users to build custom language- | |
| 77 | // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users | |
| 78 | // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances. | |
| 79 | ||
| 80 | 7768 | private static final class LangRule { |
| 81 | private final boolean acceptOnMatch; | |
| 82 | private final Set<String> languages; | |
| 83 | private final Pattern pattern; | |
| 84 | ||
| 85 | 756 | private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) { |
| 86 | 756 | this.pattern = pattern; |
| 87 | 756 | this.languages = languages; |
| 88 | 756 | this.acceptOnMatch = acceptOnMatch; |
| 89 | 756 | } |
| 90 | ||
| 91 | public boolean matches(final String txt) { | |
| 92 | 16924824 | return this.pattern.matcher(txt).find(); |
| 93 | } | |
| 94 | } | |
| 95 | ||
| 96 | 1 | private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class); |
| 97 | ||
| 98 | private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt"; | |
| 99 | ||
| 100 | static { | |
| 101 | 4 | for (final NameType s : NameType.values()) { |
| 102 | 3 | Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s))); |
| 103 | } | |
| 104 | 1 | } |
| 105 | ||
| 106 | /** | |
| 107 | * Gets a Lang instance for one of the supported NameTypes. | |
| 108 | * | |
| 109 | * @param nameType | |
| 110 | * the NameType to look up | |
| 111 | * @return a Lang encapsulating the language guessing rules for that name type | |
| 112 | */ | |
| 113 | public static Lang instance(final NameType nameType) { | |
| 114 | 145 | return Langs.get(nameType); |
| 115 | } | |
| 116 | ||
| 117 | /** | |
| 118 | * Loads language rules from a resource. | |
| 119 | * <p> | |
| 120 | * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method. | |
| 121 | * You will only need to call this yourself if you are developing custom language mapping rules. | |
| 122 | * | |
| 123 | * @param languageRulesResourceName | |
| 124 | * the fully-qualified resource name to load | |
| 125 | * @param languages | |
| 126 | * the languages that these rules will support | |
| 127 | * @return a Lang encapsulating the loaded language-guessing rules. | |
| 128 | */ | |
| 129 | public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) { | |
| 130 | 4 | final List<LangRule> rules = new ArrayList<LangRule>(); |
| 131 | 4 | final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName); |
| 132 | ||
| 133 | 4 | if (lRulesIS == null) { |
| 134 | 1 | throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN); |
| 135 | } | |
| 136 | ||
| 137 | 3 | final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING); |
| 138 | 3 | boolean inExtendedComment = false; |
| 139 | 882 | while (scanner.hasNextLine()) { |
| 140 | 879 | final String rawLine = scanner.nextLine(); |
| 141 | 879 | String line = rawLine; |
| 142 | ||
| 143 | 879 | if (inExtendedComment) { |
| 144 | // check for closing comment marker, otherwise discard doc comment line | |
| 145 | 45 | if (line.endsWith(ResourceConstants.EXT_CMT_END)) { |
| 146 | 3 | inExtendedComment = false; |
| 147 | } | |
| 148 | } else { | |
| 149 | 834 | if (line.startsWith(ResourceConstants.EXT_CMT_START)) { |
| 150 | 3 | inExtendedComment = true; |
| 151 | } else { | |
| 152 | // discard comments | |
| 153 | 831 | final int cmtI = line.indexOf(ResourceConstants.CMT); |
| 154 | 831 | if (cmtI >= 0) { |
| 155 | 144 | line = line.substring(0, cmtI); |
| 156 | } | |
| 157 | ||
| 158 | // trim leading-trailing whitespace | |
| 159 | 831 | line = line.trim(); |
| 160 | ||
| 161 | 831 | if (line.length() == 0) { |
| 162 | 75 | continue; // empty lines can be safely skipped |
| 163 | } | |
| 164 | ||
| 165 | // split it up | |
| 166 | 756 | final String[] parts = line.split("\\s+"); |
| 167 | ||
| 168 | 756 | if (parts.length != 3) { |
| 169 | 0 | throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" + |
| 170 | languageRulesResourceName + "'"); | |
| 171 | } | |
| 172 | ||
| 173 | 756 | final Pattern pattern = Pattern.compile(parts[0]); |
| 174 | 756 | final String[] langs = parts[1].split("\\+"); |
| 175 | 756 | final boolean accept = parts[2].equals("true"); |
| 176 | ||
| 177 | 756 | rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept)); |
| 178 | } | |
| 179 | } | |
| 180 | 804 | } |
| 181 | ||
| 182 | 3 | return new Lang(rules, languages); |
| 183 | } | |
| 184 | ||
| 185 | private final Languages languages; | |
| 186 | private final List<LangRule> rules; | |
| 187 | ||
| 188 | 3 | private Lang(final List<LangRule> rules, final Languages languages) { |
| 189 | 3 | this.rules = Collections.unmodifiableList(rules); |
| 190 | 3 | this.languages = languages; |
| 191 | 3 | } |
| 192 | ||
| 193 | /** | |
| 194 | * Guesses the language of a word. | |
| 195 | * | |
| 196 | * @param text | |
| 197 | * the word | |
| 198 | * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match | |
| 199 | */ | |
| 200 | public String guessLanguage(final String text) { | |
| 201 | 0 | final Languages.LanguageSet ls = guessLanguages(text); |
| 202 | 0 | return ls.isSingleton() ? ls.getAny() : Languages.ANY; |
| 203 | } | |
| 204 | ||
| 205 | /** | |
| 206 | * Guesses the languages of a word. | |
| 207 | * | |
| 208 | * @param input | |
| 209 | * the word | |
| 210 | * @return a Set of Strings of language names that are potential matches for the input word | |
| 211 | */ | |
| 212 | public Languages.LanguageSet guessLanguages(final String input) { | |
| 213 | 67162 | final String text = input.toLowerCase(Locale.ENGLISH); |
| 214 | ||
| 215 | 67162 | final Set<String> langs = new HashSet<String>(this.languages.getLanguages()); |
| 216 | 67162 | for (final LangRule rule : this.rules) { |
| 217 | 16924824 | if (rule.matches(text)) { |
| 218 | 3506 | if (rule.acceptOnMatch) { |
| 219 | 408 | langs.retainAll(rule.languages); |
| 220 | } else { | |
| 221 | 3098 | langs.removeAll(rule.languages); |
| 222 | } | |
| 223 | } | |
| 224 | 16924824 | } |
| 225 | ||
| 226 | 67162 | final Languages.LanguageSet ls = Languages.LanguageSet.from(langs); |
| 227 | 67162 | return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls; |
| 228 | } | |
| 229 | } |