001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import java.util.regex.Pattern; 021 022import org.apache.commons.codec.EncoderException; 023import org.apache.commons.codec.StringEncoder; 024 025/** 026 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a 027 * general purpose scheme to find word with similar phonemes. 028 * <p> 029 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm. 030 * <p> 031 * Algorithm description: 032 * <pre> 033 * 1. Transcode first characters of name 034 * 1a. MAC -> MCC 035 * 1b. KN -> NN 036 * 1c. K -> C 037 * 1d. PH -> FF 038 * 1e. PF -> FF 039 * 1f. SCH -> SSS 040 * 2. Transcode last characters of name 041 * 2a. EE, IE -> Y 042 * 2b. DT,RT,RD,NT,ND -> D 043 * 3. First character of key = first character of name 044 * 4. Transcode remaining characters by following these rules, incrementing by one character each time 045 * 4a. EV -> AF else A,E,I,O,U -> A 046 * 4b. Q -> G 047 * 4c. Z -> S 048 * 4d. M -> N 049 * 4e. KN -> N else K -> C 050 * 4f. SCH -> SSS 051 * 4g. PH -> FF 052 * 4h. H -> If previous or next is nonvowel, previous 053 * 4i. W -> If previous is vowel, previous 054 * 4j. Add current to key if current != last key character 055 * 5. If last character is S, remove it 056 * 6. If last characters are AY, replace with Y 057 * 7. If last character is A, remove it 058 * 8. Collapse all strings of repeated characters 059 * 9. Add original first character of name as first character of key 060 * </pre> 061 * <p> 062 * This class is immutable and thread-safe. 063 * 064 * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a> 065 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a> 066 * @see Soundex 067 * @since 1.7 068 * @version $Id: Nysiis.html 928559 2014-11-10 02:53:54Z ggregory $ 069 */ 070public class Nysiis implements StringEncoder { 071 072 private static final char[] CHARS_A = new char[] { 'A' }; 073 private static final char[] CHARS_AF = new char[] { 'A', 'F' }; 074 private static final char[] CHARS_C = new char[] { 'C' }; 075 private static final char[] CHARS_FF = new char[] { 'F', 'F' }; 076 private static final char[] CHARS_G = new char[] { 'G' }; 077 private static final char[] CHARS_N = new char[] { 'N' }; 078 private static final char[] CHARS_NN = new char[] { 'N', 'N' }; 079 private static final char[] CHARS_S = new char[] { 'S' }; 080 private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' }; 081 082 private static final Pattern PAT_MAC = Pattern.compile("^MAC"); 083 private static final Pattern PAT_KN = Pattern.compile("^KN"); 084 private static final Pattern PAT_K = Pattern.compile("^K"); 085 private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)"); 086 private static final Pattern PAT_SCH = Pattern.compile("^SCH"); 087 private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$"); 088 private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$"); 089 090 private static final char SPACE = ' '; 091 private static final int TRUE_LENGTH = 6; 092 093 /** 094 * Tests if the given character is a vowel. 095 * 096 * @param c 097 * the character to test 098 * @return <code>true</code> if the character is a vowel, <code>false</code> otherwise 099 */ 100 private static boolean isVowel(final char c) { 101 return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'; 102 } 103 104 /** 105 * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at 106 * a time: [i-1, i, i+1, i+2]. 107 * 108 * @param prev 109 * the previous character 110 * @param curr 111 * the current character 112 * @param next 113 * the next character 114 * @param aNext 115 * the after next character 116 * @return a transcoded array of characters, starting from the current position 117 */ 118 private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) { 119 // 1. EV -> AF 120 if (curr == 'E' && next == 'V') { 121 return CHARS_AF; 122 } 123 124 // A, E, I, O, U -> A 125 if (isVowel(curr)) { 126 return CHARS_A; 127 } 128 129 // 2. Q -> G, Z -> S, M -> N 130 if (curr == 'Q') { 131 return CHARS_G; 132 } else if (curr == 'Z') { 133 return CHARS_S; 134 } else if (curr == 'M') { 135 return CHARS_N; 136 } 137 138 // 3. KN -> NN else K -> C 139 if (curr == 'K') { 140 if (next == 'N') { 141 return CHARS_NN; 142 } else { 143 return CHARS_C; 144 } 145 } 146 147 // 4. SCH -> SSS 148 if (curr == 'S' && next == 'C' && aNext == 'H') { 149 return CHARS_SSS; 150 } 151 152 // PH -> FF 153 if (curr == 'P' && next == 'H') { 154 return CHARS_FF; 155 } 156 157 // 5. H -> If previous or next is a non vowel, previous. 158 if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) { 159 return new char[] { prev }; 160 } 161 162 // 6. W -> If previous is vowel, previous. 163 if (curr == 'W' && isVowel(prev)) { 164 return new char[] { prev }; 165 } 166 167 return new char[] { curr }; 168 } 169 170 /** Indicates the strict mode. */ 171 private final boolean strict; 172 173 /** 174 * Creates an instance of the {@link Nysiis} encoder with strict mode (original form), 175 * i.e. encoded strings have a maximum length of 6. 176 */ 177 public Nysiis() { 178 this(true); 179 } 180 181 /** 182 * Create an instance of the {@link Nysiis} encoder with the specified strict mode: 183 * 184 * <ul> 185 * <li><code>true</code>: encoded strings have a maximum length of 6</li> 186 * <li><code>false</code>: encoded strings may have arbitrary length</li> 187 * </ul> 188 * 189 * @param strict 190 * the strict mode 191 */ 192 public Nysiis(final boolean strict) { 193 this.strict = strict; 194 } 195 196 /** 197 * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the 198 * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type 199 * {@link String}. 200 * 201 * @param obj 202 * Object to encode 203 * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String. 204 * @throws EncoderException 205 * if the parameter supplied is not of a {@link String} 206 * @throws IllegalArgumentException 207 * if a character is not mapped 208 */ 209 @Override 210 public Object encode(final Object obj) throws EncoderException { 211 if (!(obj instanceof String)) { 212 throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String"); 213 } 214 return this.nysiis((String) obj); 215 } 216 217 /** 218 * Encodes a String using the NYSIIS algorithm. 219 * 220 * @param str 221 * A String object to encode 222 * @return A Nysiis code corresponding to the String supplied 223 * @throws IllegalArgumentException 224 * if a character is not mapped 225 */ 226 @Override 227 public String encode(final String str) { 228 return this.nysiis(str); 229 } 230 231 /** 232 * Indicates the strict mode for this {@link Nysiis} encoder. 233 * 234 * @return <code>true</code> if the encoder is configured for strict mode, <code>false</code> otherwise 235 */ 236 public boolean isStrict() { 237 return this.strict; 238 } 239 240 /** 241 * Retrieves the NYSIIS code for a given String object. 242 * 243 * @param str 244 * String to encode using the NYSIIS algorithm 245 * @return A NYSIIS code for the String supplied 246 */ 247 public String nysiis(String str) { 248 if (str == null) { 249 return null; 250 } 251 252 // Use the same clean rules as Soundex 253 str = SoundexUtils.clean(str); 254 255 if (str.length() == 0) { 256 return str; 257 } 258 259 // Translate first characters of name: 260 // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS 261 str = PAT_MAC.matcher(str).replaceFirst("MCC"); 262 str = PAT_KN.matcher(str).replaceFirst("NN"); 263 str = PAT_K.matcher(str).replaceFirst("C"); 264 str = PAT_PH_PF.matcher(str).replaceFirst("FF"); 265 str = PAT_SCH.matcher(str).replaceFirst("SSS"); 266 267 // Translate last characters of name: 268 // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D 269 str = PAT_EE_IE.matcher(str).replaceFirst("Y"); 270 str = PAT_DT_ETC.matcher(str).replaceFirst("D"); 271 272 // First character of key = first character of name. 273 final StringBuilder key = new StringBuilder(str.length()); 274 key.append(str.charAt(0)); 275 276 // Transcode remaining characters, incrementing by one character each time 277 final char[] chars = str.toCharArray(); 278 final int len = chars.length; 279 280 for (int i = 1; i < len; i++) { 281 final char next = i < len - 1 ? chars[i + 1] : SPACE; 282 final char aNext = i < len - 2 ? chars[i + 2] : SPACE; 283 final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext); 284 System.arraycopy(transcoded, 0, chars, i, transcoded.length); 285 286 // only append the current char to the key if it is different from the last one 287 if (chars[i] != chars[i - 1]) { 288 key.append(chars[i]); 289 } 290 } 291 292 if (key.length() > 1) { 293 char lastChar = key.charAt(key.length() - 1); 294 295 // If last character is S, remove it. 296 if (lastChar == 'S') { 297 key.deleteCharAt(key.length() - 1); 298 lastChar = key.charAt(key.length() - 1); 299 } 300 301 if (key.length() > 2) { 302 final char last2Char = key.charAt(key.length() - 2); 303 // If last characters are AY, replace with Y. 304 if (last2Char == 'A' && lastChar == 'Y') { 305 key.deleteCharAt(key.length() - 2); 306 } 307 } 308 309 // If last character is A, remove it. 310 if (lastChar == 'A') { 311 key.deleteCharAt(key.length() - 1); 312 } 313 } 314 315 final String string = key.toString(); 316 return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string; 317 } 318 319}