1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.commons.codec.language; 19 20 import org.apache.commons.codec.EncoderException; 21 import org.apache.commons.codec.StringEncoder; 22 23 /** 24 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a 25 * general purpose scheme to find word with similar phonemes. 26 * 27 * This class is thread-safe. 28 * Although not strictly immutable, the {@link #maxLength} field is not actually used. 29 * 30 * @version $Id: Soundex.java 1811347 2017-10-06 15:21:18Z ggregory $ 31 */ 32 public class Soundex implements StringEncoder { 33 34 /** 35 * The marker character used to indicate a silent (ignored) character. 36 * These are ignored except when they appear as the first character. 37 * <p> 38 * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism 39 * because changing it might break existing code. Mappings that don't contain 40 * a silent marker code are treated as though H and W are silent. 41 * <p> 42 * To override this, use the {@link #Soundex(String, boolean)} constructor. 43 * @since 1.11 44 */ 45 public static final char SILENT_MARKER = '-'; 46 47 /** 48 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 49 * means do not encode, but treat as a separator when it occurs between consonants with the same code. 50 * <p> 51 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick 52 * up the value for the constant values page.) 53 * <p> 54 * <b>Note that letters H and W are treated specially.</b> 55 * They are ignored (after the first letter) and don't act as separators 56 * between consonants with the same code. 57 * @see #US_ENGLISH_MAPPING 58 */ 59 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 60 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 61 62 /** 63 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 64 * means do not encode. 65 * 66 * @see Soundex#Soundex(char[]) 67 */ 68 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 69 70 /** 71 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. 72 * This treats H and W as silent letters. 73 * Apart from when they appear as the first letter, they are ignored. 74 * They don't act as separators between duplicate codes. 75 * 76 * @see #US_ENGLISH_MAPPING 77 * @see #US_ENGLISH_MAPPING_STRING 78 */ 79 public static final Soundex US_ENGLISH = new Soundex(); 80 81 /** 82 * An instance of Soundex using the Simplified Soundex mapping, as described here: 83 * http://west-penwith.org.uk/misc/soundex.htm 84 * <p> 85 * This treats H and W the same as vowels (AEIOUY). 86 * Such letters aren't encoded (after the first), but they do 87 * act as separators when dropping duplicate codes. 88 * The mapping is otherwise the same as for {@link #US_ENGLISH} 89 * <p> 90 * @since 1.11 91 */ 92 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false); 93 94 /** 95 * An instance of Soundex using the mapping as per the Genealogy site: 96 * http://www.genealogy.com/articles/research/00000060.html 97 * <p> 98 * This treats vowels (AEIOUY), H and W as silent letters. 99 * Such letters are ignored (after the first) and do not 100 * act as separators when dropping duplicate codes. 101 * <p> 102 * The codes for consonants are otherwise the same as for 103 * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED} 104 * 105 * @since 1.11 106 */ 107 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2"); 108 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 109 110 /** 111 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 112 * 113 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 114 */ 115 @Deprecated 116 private int maxLength = 4; 117 118 /** 119 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 120 * letter is mapped. This implementation contains a default map for US_ENGLISH 121 */ 122 private final char[] soundexMapping; 123 124 /** 125 * Should H and W be treated specially? 126 * <p> 127 * In versions of the code prior to 1.11, 128 * the code always treated H and W as silent (ignored) letters. 129 * If this field is false, H and W are no longer special-cased. 130 */ 131 private final boolean specialCaseHW; 132 133 /** 134 * Creates an instance using US_ENGLISH_MAPPING 135 * 136 * @see Soundex#Soundex(char[]) 137 * @see Soundex#US_ENGLISH_MAPPING 138 */ 139 public Soundex() { 140 this.soundexMapping = US_ENGLISH_MAPPING; 141 this.specialCaseHW = true; 142 } 143 144 /** 145 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized 146 * mapping for a non-Western character set. 147 * 148 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 149 * letter is mapped. This implementation contains a default map for US_ENGLISH 150 * <p> 151 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 152 * 153 * @param mapping 154 * Mapping array to use when finding the corresponding code for a given character 155 */ 156 public Soundex(final char[] mapping) { 157 this.soundexMapping = new char[mapping.length]; 158 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); 159 this.specialCaseHW = !hasMarker(this.soundexMapping); 160 } 161 162 private boolean hasMarker(final char[] mapping) { 163 for(final char ch : mapping) { 164 if (ch == SILENT_MARKER) { 165 return true; 166 } 167 } 168 return false; 169 } 170 171 /** 172 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 173 * and/or possibly provide an internationalized mapping for a non-Western character set. 174 * <p> 175 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 176 * 177 * @param mapping 178 * Mapping string to use when finding the corresponding code for a given character 179 * @since 1.4 180 */ 181 public Soundex(final String mapping) { 182 this.soundexMapping = mapping.toCharArray(); 183 this.specialCaseHW = !hasMarker(this.soundexMapping); 184 } 185 186 /** 187 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 188 * and/or possibly provide an internationalized mapping for a non-Western character set. 189 * 190 * @param mapping 191 * Mapping string to use when finding the corresponding code for a given character 192 * @param specialCaseHW if true, then 193 * @since 1.11 194 */ 195 public Soundex(final String mapping, final boolean specialCaseHW) { 196 this.soundexMapping = mapping.toCharArray(); 197 this.specialCaseHW = specialCaseHW; 198 } 199 200 /** 201 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This 202 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or 203 * identical values. 204 * 205 * @param s1 206 * A String that will be encoded and compared. 207 * @param s2 208 * A String that will be encoded and compared. 209 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 210 * 211 * @see SoundexUtils#difference(StringEncoder,String,String) 212 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS 213 * T-SQL DIFFERENCE </a> 214 * 215 * @throws EncoderException 216 * if an error occurs encoding one of the strings 217 * @since 1.3 218 */ 219 public int difference(final String s1, final String s2) throws EncoderException { 220 return SoundexUtils.difference(this, s1, s2); 221 } 222 223 /** 224 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of 225 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. 226 * 227 * @param obj 228 * Object to encode 229 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String 230 * supplied. 231 * @throws EncoderException 232 * if the parameter supplied is not of type java.lang.String 233 * @throws IllegalArgumentException 234 * if a character is not mapped 235 */ 236 @Override 237 public Object encode(final Object obj) throws EncoderException { 238 if (!(obj instanceof String)) { 239 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 240 } 241 return soundex((String) obj); 242 } 243 244 /** 245 * Encodes a String using the soundex algorithm. 246 * 247 * @param str 248 * A String object to encode 249 * @return A Soundex code corresponding to the String supplied 250 * @throws IllegalArgumentException 251 * if a character is not mapped 252 */ 253 @Override 254 public String encode(final String str) { 255 return soundex(str); 256 } 257 258 /** 259 * Returns the maxLength. Standard Soundex 260 * 261 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 262 * @return int 263 */ 264 @Deprecated 265 public int getMaxLength() { 266 return this.maxLength; 267 } 268 269 /** 270 * Maps the given upper-case character to its Soundex code. 271 * 272 * @param ch 273 * An upper-case character. 274 * @return A Soundex code. 275 * @throws IllegalArgumentException 276 * Thrown if <code>ch</code> is not mapped. 277 */ 278 private char map(final char ch) { 279 final int index = ch - 'A'; 280 if (index < 0 || index >= this.soundexMapping.length) { 281 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")"); 282 } 283 return this.soundexMapping[index]; 284 } 285 286 /** 287 * Sets the maxLength. 288 * 289 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 290 * @param maxLength 291 * The maxLength to set 292 */ 293 @Deprecated 294 public void setMaxLength(final int maxLength) { 295 this.maxLength = maxLength; 296 } 297 298 /** 299 * Retrieves the Soundex code for a given String object. 300 * 301 * @param str 302 * String to encode using the Soundex algorithm 303 * @return A soundex code for the String supplied 304 * @throws IllegalArgumentException 305 * if a character is not mapped 306 */ 307 public String soundex(String str) { 308 if (str == null) { 309 return null; 310 } 311 str = SoundexUtils.clean(str); 312 if (str.length() == 0) { 313 return str; 314 } 315 final char out[] = {'0', '0', '0', '0'}; 316 int count = 0; 317 final char first = str.charAt(0); 318 out[count++] = first; 319 char lastDigit = map(first); // previous digit 320 for(int i = 1; i < str.length() && count < out.length ; i++) { 321 final char ch = str.charAt(i); 322 if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely 323 continue; 324 } 325 final char digit = map(ch); 326 if (digit == SILENT_MARKER) { 327 continue; 328 } 329 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats 330 out[count++] = digit; 331 } 332 lastDigit = digit; 333 } 334 return new String(out); 335 } 336 337 }