| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Nysiis |
|
| 7.25;7.25 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.commons.codec.language; | |
| 19 | ||
| 20 | import java.util.regex.Pattern; | |
| 21 | ||
| 22 | import org.apache.commons.codec.EncoderException; | |
| 23 | import org.apache.commons.codec.StringEncoder; | |
| 24 | ||
| 25 | /** | |
| 26 | * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a | |
| 27 | * general purpose scheme to find word with similar phonemes. | |
| 28 | * <p> | |
| 29 | * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm. | |
| 30 | * <p> | |
| 31 | * Algorithm description: | |
| 32 | * <pre> | |
| 33 | * 1. Transcode first characters of name | |
| 34 | * 1a. MAC -> MCC | |
| 35 | * 1b. KN -> NN | |
| 36 | * 1c. K -> C | |
| 37 | * 1d. PH -> FF | |
| 38 | * 1e. PF -> FF | |
| 39 | * 1f. SCH -> SSS | |
| 40 | * 2. Transcode last characters of name | |
| 41 | * 2a. EE, IE -> Y | |
| 42 | * 2b. DT,RT,RD,NT,ND -> D | |
| 43 | * 3. First character of key = first character of name | |
| 44 | * 4. Transcode remaining characters by following these rules, incrementing by one character each time | |
| 45 | * 4a. EV -> AF else A,E,I,O,U -> A | |
| 46 | * 4b. Q -> G | |
| 47 | * 4c. Z -> S | |
| 48 | * 4d. M -> N | |
| 49 | * 4e. KN -> N else K -> C | |
| 50 | * 4f. SCH -> SSS | |
| 51 | * 4g. PH -> FF | |
| 52 | * 4h. H -> If previous or next is nonvowel, previous | |
| 53 | * 4i. W -> If previous is vowel, previous | |
| 54 | * 4j. Add current to key if current != last key character | |
| 55 | * 5. If last character is S, remove it | |
| 56 | * 6. If last characters are AY, replace with Y | |
| 57 | * 7. If last character is A, remove it | |
| 58 | * 8. Collapse all strings of repeated characters | |
| 59 | * 9. Add original first character of name as first character of key | |
| 60 | * </pre> | |
| 61 | * <p> | |
| 62 | * This class is immutable and thread-safe. | |
| 63 | * | |
| 64 | * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a> | |
| 65 | * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a> | |
| 66 | * @see Soundex | |
| 67 | * @since 1.7 | |
| 68 | * @version $Id: Nysiis.java 1429868 2013-01-07 16:08:05Z ggregory $ | |
| 69 | */ | |
| 70 | public class Nysiis implements StringEncoder { | |
| 71 | ||
| 72 | 1 | private static final char[] CHARS_A = new char[] { 'A' }; |
| 73 | 1 | private static final char[] CHARS_AF = new char[] { 'A', 'F' }; |
| 74 | 1 | private static final char[] CHARS_C = new char[] { 'C' }; |
| 75 | 1 | private static final char[] CHARS_FF = new char[] { 'F', 'F' }; |
| 76 | 1 | private static final char[] CHARS_G = new char[] { 'G' }; |
| 77 | 1 | private static final char[] CHARS_N = new char[] { 'N' }; |
| 78 | 1 | private static final char[] CHARS_NN = new char[] { 'N', 'N' }; |
| 79 | 1 | private static final char[] CHARS_S = new char[] { 'S' }; |
| 80 | 1 | private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' }; |
| 81 | ||
| 82 | 1 | private static final Pattern PAT_MAC = Pattern.compile("^MAC"); |
| 83 | 1 | private static final Pattern PAT_KN = Pattern.compile("^KN"); |
| 84 | 1 | private static final Pattern PAT_K = Pattern.compile("^K"); |
| 85 | 1 | private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)"); |
| 86 | 1 | private static final Pattern PAT_SCH = Pattern.compile("^SCH"); |
| 87 | 1 | private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$"); |
| 88 | 1 | private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$"); |
| 89 | ||
| 90 | private static final char SPACE = ' '; | |
| 91 | private static final int TRUE_LENGTH = 6; | |
| 92 | ||
| 93 | /** | |
| 94 | * Tests if the given character is a vowel. | |
| 95 | * | |
| 96 | * @param c | |
| 97 | * the character to test | |
| 98 | * @return {@code true} if the character is a vowel, {@code false} otherwise | |
| 99 | */ | |
| 100 | private static boolean isVowel(final char c) { | |
| 101 | 340 | return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'; |
| 102 | } | |
| 103 | ||
| 104 | /** | |
| 105 | * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at | |
| 106 | * a time: [i-1, i, i+1, i+2]. | |
| 107 | * | |
| 108 | * @param prev | |
| 109 | * the previous character | |
| 110 | * @param curr | |
| 111 | * the current character | |
| 112 | * @param next | |
| 113 | * the next character | |
| 114 | * @param aNext | |
| 115 | * the after next character | |
| 116 | * @return a transcoded array of characters, starting from the current position | |
| 117 | */ | |
| 118 | private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) { | |
| 119 | // 1. EV -> AF | |
| 120 | 321 | if (curr == 'E' && next == 'V') { |
| 121 | 2 | return CHARS_AF; |
| 122 | } | |
| 123 | ||
| 124 | // A, E, I, O, U -> A | |
| 125 | 319 | if (isVowel(curr)) { |
| 126 | 111 | return CHARS_A; |
| 127 | } | |
| 128 | ||
| 129 | // 2. Q -> G, Z -> S, M -> N | |
| 130 | 208 | if (curr == 'Q') { |
| 131 | 2 | return CHARS_G; |
| 132 | 206 | } else if (curr == 'Z') { |
| 133 | 5 | return CHARS_S; |
| 134 | 201 | } else if (curr == 'M') { |
| 135 | 11 | return CHARS_N; |
| 136 | } | |
| 137 | ||
| 138 | // 3. KN -> NN else K -> C | |
| 139 | 190 | if (curr == 'K') { |
| 140 | 5 | if (next == 'N') { |
| 141 | 1 | return CHARS_NN; |
| 142 | } else { | |
| 143 | 4 | return CHARS_C; |
| 144 | } | |
| 145 | } | |
| 146 | ||
| 147 | // 4. SCH -> SSS | |
| 148 | 185 | if (curr == 'S' && next == 'C' && aNext == 'H') { |
| 149 | 2 | return CHARS_SSS; |
| 150 | } | |
| 151 | ||
| 152 | // PH -> FF | |
| 153 | 183 | if (curr == 'P' && next == 'H') { |
| 154 | 1 | return CHARS_FF; |
| 155 | } | |
| 156 | ||
| 157 | // 5. H -> If previous or next is a non vowel, previous. | |
| 158 | 182 | if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) { |
| 159 | 11 | return new char[] { prev }; |
| 160 | } | |
| 161 | ||
| 162 | // 6. W -> If previous is vowel, previous. | |
| 163 | 171 | if (curr == 'W' && isVowel(prev)) { |
| 164 | 4 | return new char[] { prev }; |
| 165 | } | |
| 166 | ||
| 167 | 167 | return new char[] { curr }; |
| 168 | } | |
| 169 | ||
| 170 | /** Indicates the strict mode. */ | |
| 171 | private final boolean strict; | |
| 172 | ||
| 173 | /** | |
| 174 | * Creates an instance of the {@link Nysiis} encoder with strict mode (original form), | |
| 175 | * i.e. encoded strings have a maximum length of 6. | |
| 176 | */ | |
| 177 | public Nysiis() { | |
| 178 | 23 | this(true); |
| 179 | 23 | } |
| 180 | ||
| 181 | /** | |
| 182 | * Create an instance of the {@link Nysiis} encoder with the specified strict mode: | |
| 183 | * | |
| 184 | * <ul> | |
| 185 | * <li>{@code true}: encoded strings have a maximum length of 6</li> | |
| 186 | * <li>{@code false}: encoded strings may have arbitrary length</li> | |
| 187 | * </ul> | |
| 188 | * | |
| 189 | * @param strict | |
| 190 | * the strict mode | |
| 191 | */ | |
| 192 | 47 | public Nysiis(final boolean strict) { |
| 193 | 47 | this.strict = strict; |
| 194 | 47 | } |
| 195 | ||
| 196 | /** | |
| 197 | * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the | |
| 198 | * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type | |
| 199 | * {@link String}. | |
| 200 | * | |
| 201 | * @param obj | |
| 202 | * Object to encode | |
| 203 | * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String. | |
| 204 | * @throws EncoderException | |
| 205 | * if the parameter supplied is not of a {@link String} | |
| 206 | * @throws IllegalArgumentException | |
| 207 | * if a character is not mapped | |
| 208 | */ | |
| 209 | @Override | |
| 210 | public Object encode(final Object obj) throws EncoderException { | |
| 211 | 4 | if (!(obj instanceof String)) { |
| 212 | 1 | throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String"); |
| 213 | } | |
| 214 | 3 | return this.nysiis((String) obj); |
| 215 | } | |
| 216 | ||
| 217 | /** | |
| 218 | * Encodes a String using the NYSIIS algorithm. | |
| 219 | * | |
| 220 | * @param str | |
| 221 | * A String object to encode | |
| 222 | * @return A Nysiis code corresponding to the String supplied | |
| 223 | * @throws IllegalArgumentException | |
| 224 | * if a character is not mapped | |
| 225 | */ | |
| 226 | @Override | |
| 227 | public String encode(final String str) { | |
| 228 | 93 | return this.nysiis(str); |
| 229 | } | |
| 230 | ||
| 231 | /** | |
| 232 | * Indicates the strict mode for this {@link Nysiis} encoder. | |
| 233 | * | |
| 234 | * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise | |
| 235 | */ | |
| 236 | public boolean isStrict() { | |
| 237 | 92 | return this.strict; |
| 238 | } | |
| 239 | ||
| 240 | /** | |
| 241 | * Retrieves the NYSIIS code for a given String object. | |
| 242 | * | |
| 243 | * @param str | |
| 244 | * String to encode using the NYSIIS algorithm | |
| 245 | * @return A NYSIIS code for the String supplied | |
| 246 | */ | |
| 247 | public String nysiis(String str) { | |
| 248 | 96 | if (str == null) { |
| 249 | 1 | return null; |
| 250 | } | |
| 251 | ||
| 252 | // Use the same clean rules as Soundex | |
| 253 | 95 | str = SoundexUtils.clean(str); |
| 254 | ||
| 255 | 95 | if (str.length() == 0) { |
| 256 | 3 | return str; |
| 257 | } | |
| 258 | ||
| 259 | // Translate first characters of name: | |
| 260 | // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS | |
| 261 | 92 | str = PAT_MAC.matcher(str).replaceFirst("MCC"); |
| 262 | 92 | str = PAT_KN.matcher(str).replaceFirst("NN"); |
| 263 | 92 | str = PAT_K.matcher(str).replaceFirst("C"); |
| 264 | 92 | str = PAT_PH_PF.matcher(str).replaceFirst("FF"); |
| 265 | 92 | str = PAT_SCH.matcher(str).replaceFirst("SSS"); |
| 266 | ||
| 267 | // Translate last characters of name: | |
| 268 | // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D | |
| 269 | 92 | str = PAT_EE_IE.matcher(str).replaceFirst("Y"); |
| 270 | 92 | str = PAT_DT_ETC.matcher(str).replaceFirst("D"); |
| 271 | ||
| 272 | // First character of key = first character of name. | |
| 273 | 92 | final StringBuilder key = new StringBuilder(str.length()); |
| 274 | 92 | key.append(str.charAt(0)); |
| 275 | ||
| 276 | // Transcode remaining characters, incrementing by one character each time | |
| 277 | 92 | final char[] chars = str.toCharArray(); |
| 278 | 92 | final int len = chars.length; |
| 279 | ||
| 280 | 413 | for (int i = 1; i < len; i++) { |
| 281 | 321 | final char next = i < len - 1 ? chars[i + 1] : SPACE; |
| 282 | 321 | final char aNext = i < len - 2 ? chars[i + 2] : SPACE; |
| 283 | 321 | final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext); |
| 284 | 321 | System.arraycopy(transcoded, 0, chars, i, transcoded.length); |
| 285 | ||
| 286 | // only append the current char to the key if it is different from the last one | |
| 287 | 321 | if (chars[i] != chars[i - 1]) { |
| 288 | 254 | key.append(chars[i]); |
| 289 | } | |
| 290 | } | |
| 291 | ||
| 292 | 92 | if (key.length() > 1) { |
| 293 | 86 | char lastChar = key.charAt(key.length() - 1); |
| 294 | ||
| 295 | // If last character is S, remove it. | |
| 296 | 86 | if (lastChar == 'S') { |
| 297 | 10 | key.deleteCharAt(key.length() - 1); |
| 298 | 10 | lastChar = key.charAt(key.length() - 1); |
| 299 | } | |
| 300 | ||
| 301 | 86 | if (key.length() > 2) { |
| 302 | 66 | final char last2Char = key.charAt(key.length() - 2); |
| 303 | // If last characters are AY, replace with Y. | |
| 304 | 66 | if (last2Char == 'A' && lastChar == 'Y') { |
| 305 | 4 | key.deleteCharAt(key.length() - 2); |
| 306 | } | |
| 307 | } | |
| 308 | ||
| 309 | // If last character is A, remove it. | |
| 310 | 86 | if (lastChar == 'A') { |
| 311 | 12 | key.deleteCharAt(key.length() - 1); |
| 312 | } | |
| 313 | } | |
| 314 | ||
| 315 | 92 | final String string = key.toString(); |
| 316 | 92 | return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string; |
| 317 | } | |
| 318 | ||
| 319 | } |