001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a 025 * general purpose scheme to find word with similar phonemes. 026 * 027 * This class is thread-safe. 028 * Although not strictly immutable, the {@link #maxLength} field is not actually used. 029 * 030 */ 031public class Soundex implements StringEncoder { 032 033 /** 034 * The marker character used to indicate a silent (ignored) character. 035 * These are ignored except when they appear as the first character. 036 * <p> 037 * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism 038 * because changing it might break existing code. Mappings that don't contain 039 * a silent marker code are treated as though H and W are silent. 040 * <p> 041 * To override this, use the {@link #Soundex(String, boolean)} constructor. 042 * @since 1.11 043 */ 044 public static final char SILENT_MARKER = '-'; 045 046 /** 047 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 048 * means do not encode, but treat as a separator when it occurs between consonants with the same code. 049 * <p> 050 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick 051 * up the value for the constant values page.) 052 * <p> 053 * <b>Note that letters H and W are treated specially.</b> 054 * They are ignored (after the first letter) and don't act as separators 055 * between consonants with the same code. 056 * @see #US_ENGLISH_MAPPING 057 */ 058 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 059 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 060 061 /** 062 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 063 * means do not encode. 064 * 065 * @see Soundex#Soundex(char[]) 066 */ 067 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 068 069 /** 070 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. 071 * This treats H and W as silent letters. 072 * Apart from when they appear as the first letter, they are ignored. 073 * They don't act as separators between duplicate codes. 074 * 075 * @see #US_ENGLISH_MAPPING 076 * @see #US_ENGLISH_MAPPING_STRING 077 */ 078 public static final Soundex US_ENGLISH = new Soundex(); 079 080 /** 081 * An instance of Soundex using the Simplified Soundex mapping, as described here: 082 * http://west-penwith.org.uk/misc/soundex.htm 083 * <p> 084 * This treats H and W the same as vowels (AEIOUY). 085 * Such letters aren't encoded (after the first), but they do 086 * act as separators when dropping duplicate codes. 087 * The mapping is otherwise the same as for {@link #US_ENGLISH} 088 * <p> 089 * @since 1.11 090 */ 091 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false); 092 093 /** 094 * An instance of Soundex using the mapping as per the Genealogy site: 095 * http://www.genealogy.com/articles/research/00000060.html 096 * <p> 097 * This treats vowels (AEIOUY), H and W as silent letters. 098 * Such letters are ignored (after the first) and do not 099 * act as separators when dropping duplicate codes. 100 * <p> 101 * The codes for consonants are otherwise the same as for 102 * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED} 103 * 104 * @since 1.11 105 */ 106 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2"); 107 // ABCDEFGHIJKLMNOPQRSTUVWXYZ 108 109 /** 110 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 111 * 112 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 113 */ 114 @Deprecated 115 private int maxLength = 4; 116 117 /** 118 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 119 * letter is mapped. This implementation contains a default map for US_ENGLISH 120 */ 121 private final char[] soundexMapping; 122 123 /** 124 * Should H and W be treated specially? 125 * <p> 126 * In versions of the code prior to 1.11, 127 * the code always treated H and W as silent (ignored) letters. 128 * If this field is false, H and W are no longer special-cased. 129 */ 130 private final boolean specialCaseHW; 131 132 /** 133 * Creates an instance using US_ENGLISH_MAPPING 134 * 135 * @see Soundex#Soundex(char[]) 136 * @see Soundex#US_ENGLISH_MAPPING 137 */ 138 public Soundex() { 139 this.soundexMapping = US_ENGLISH_MAPPING; 140 this.specialCaseHW = true; 141 } 142 143 /** 144 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized 145 * mapping for a non-Western character set. 146 * 147 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 148 * letter is mapped. This implementation contains a default map for US_ENGLISH 149 * <p> 150 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 151 * 152 * @param mapping 153 * Mapping array to use when finding the corresponding code for a given character 154 */ 155 public Soundex(final char[] mapping) { 156 this.soundexMapping = new char[mapping.length]; 157 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); 158 this.specialCaseHW = !hasMarker(this.soundexMapping); 159 } 160 161 private boolean hasMarker(final char[] mapping) { 162 for(final char ch : mapping) { 163 if (ch == SILENT_MARKER) { 164 return true; 165 } 166 } 167 return false; 168 } 169 170 /** 171 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 172 * and/or possibly provide an internationalized mapping for a non-Western character set. 173 * <p> 174 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment 175 * 176 * @param mapping 177 * Mapping string to use when finding the corresponding code for a given character 178 * @since 1.4 179 */ 180 public Soundex(final String mapping) { 181 this.soundexMapping = mapping.toCharArray(); 182 this.specialCaseHW = !hasMarker(this.soundexMapping); 183 } 184 185 /** 186 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, 187 * and/or possibly provide an internationalized mapping for a non-Western character set. 188 * 189 * @param mapping 190 * Mapping string to use when finding the corresponding code for a given character 191 * @param specialCaseHW if true, then 192 * @since 1.11 193 */ 194 public Soundex(final String mapping, final boolean specialCaseHW) { 195 this.soundexMapping = mapping.toCharArray(); 196 this.specialCaseHW = specialCaseHW; 197 } 198 199 /** 200 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This 201 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or 202 * identical values. 203 * 204 * @param s1 205 * A String that will be encoded and compared. 206 * @param s2 207 * A String that will be encoded and compared. 208 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 209 * 210 * @see SoundexUtils#difference(StringEncoder,String,String) 211 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS 212 * T-SQL DIFFERENCE </a> 213 * 214 * @throws EncoderException 215 * if an error occurs encoding one of the strings 216 * @since 1.3 217 */ 218 public int difference(final String s1, final String s2) throws EncoderException { 219 return SoundexUtils.difference(this, s1, s2); 220 } 221 222 /** 223 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of 224 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. 225 * 226 * @param obj 227 * Object to encode 228 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String 229 * supplied. 230 * @throws EncoderException 231 * if the parameter supplied is not of type java.lang.String 232 * @throws IllegalArgumentException 233 * if a character is not mapped 234 */ 235 @Override 236 public Object encode(final Object obj) throws EncoderException { 237 if (!(obj instanceof String)) { 238 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 239 } 240 return soundex((String) obj); 241 } 242 243 /** 244 * Encodes a String using the soundex algorithm. 245 * 246 * @param str 247 * A String object to encode 248 * @return A Soundex code corresponding to the String supplied 249 * @throws IllegalArgumentException 250 * if a character is not mapped 251 */ 252 @Override 253 public String encode(final String str) { 254 return soundex(str); 255 } 256 257 /** 258 * Returns the maxLength. Standard Soundex 259 * 260 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 261 * @return int 262 */ 263 @Deprecated 264 public int getMaxLength() { 265 return this.maxLength; 266 } 267 268 /** 269 * Maps the given upper-case character to its Soundex code. 270 * 271 * @param ch 272 * An upper-case character. 273 * @return A Soundex code. 274 * @throws IllegalArgumentException 275 * Thrown if <code>ch</code> is not mapped. 276 */ 277 private char map(final char ch) { 278 final int index = ch - 'A'; 279 if (index < 0 || index >= this.soundexMapping.length) { 280 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")"); 281 } 282 return this.soundexMapping[index]; 283 } 284 285 /** 286 * Sets the maxLength. 287 * 288 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 289 * @param maxLength 290 * The maxLength to set 291 */ 292 @Deprecated 293 public void setMaxLength(final int maxLength) { 294 this.maxLength = maxLength; 295 } 296 297 /** 298 * Retrieves the Soundex code for a given String object. 299 * 300 * @param str 301 * String to encode using the Soundex algorithm 302 * @return A soundex code for the String supplied 303 * @throws IllegalArgumentException 304 * if a character is not mapped 305 */ 306 public String soundex(String str) { 307 if (str == null) { 308 return null; 309 } 310 str = SoundexUtils.clean(str); 311 if (str.length() == 0) { 312 return str; 313 } 314 final char out[] = {'0', '0', '0', '0'}; 315 int count = 0; 316 final char first = str.charAt(0); 317 out[count++] = first; 318 char lastDigit = map(first); // previous digit 319 for(int i = 1; i < str.length() && count < out.length ; i++) { 320 final char ch = str.charAt(i); 321 if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely 322 continue; 323 } 324 final char digit = map(ch); 325 if (digit == SILENT_MARKER) { 326 continue; 327 } 328 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats 329 out[count++] = digit; 330 } 331 lastDigit = digit; 332 } 333 return new String(out); 334 } 335 336}