/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
25 * general purpose scheme to find word with similar phonemes.
26 *
27 * This class is thread-safe.
28 * Although not strictly immutable, the {@link #maxLength} field is not actually used.
29 *
30 * @version $Id: Soundex.java 1811347 2017-10-06 15:21:18Z ggregory $
31 */
32 public class Soundex implements StringEncoder {
33
34 /**
35 * The marker character used to indicate a silent (ignored) character.
36 * These are ignored except when they appear as the first character.
37 * <p>
38 * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
39 * because changing it might break existing code. Mappings that don't contain
40 * a silent marker code are treated as though H and W are silent.
41 * <p>
42 * To override this, use the {@link #Soundex(String, boolean)} constructor.
43 * @since 1.11
44 */
45 public static final char SILENT_MARKER = '-';
46
47 /**
48 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
49 * means do not encode, but treat as a separator when it occurs between consonants with the same code.
50 * <p>
51 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
52 * up the value for the constant values page.)
53 * <p>
54 * <b>Note that letters H and W are treated specially.</b>
55 * They are ignored (after the first letter) and don't act as separators
56 * between consonants with the same code.
57 * @see #US_ENGLISH_MAPPING
58 */
59 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
60 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
61
62 /**
63 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
64 * means do not encode.
65 *
66 * @see Soundex#Soundex(char[])
67 */
68 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
69
70 /**
71 * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
72 * This treats H and W as silent letters.
73 * Apart from when they appear as the first letter, they are ignored.
74 * They don't act as separators between duplicate codes.
75 *
76 * @see #US_ENGLISH_MAPPING
77 * @see #US_ENGLISH_MAPPING_STRING
78 */
79 public static final Soundex US_ENGLISH = new Soundex();
80
81 /**
82 * An instance of Soundex using the Simplified Soundex mapping, as described here:
83 * http://west-penwith.org.uk/misc/soundex.htm
84 * <p>
85 * This treats H and W the same as vowels (AEIOUY).
86 * Such letters aren't encoded (after the first), but they do
87 * act as separators when dropping duplicate codes.
88 * The mapping is otherwise the same as for {@link #US_ENGLISH}
89 * <p>
90 * @since 1.11
91 */
92 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
93
94 /**
95 * An instance of Soundex using the mapping as per the Genealogy site:
96 * http://www.genealogy.com/articles/research/00000060.html
97 * <p>
98 * This treats vowels (AEIOUY), H and W as silent letters.
99 * Such letters are ignored (after the first) and do not
100 * act as separators when dropping duplicate codes.
101 * <p>
102 * The codes for consonants are otherwise the same as for
103 * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
104 *
105 * @since 1.11
106 */
107 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
108 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
109
110 /**
111 * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
112 *
113 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
114 */
115 @Deprecated
116 private int maxLength = 4;
117
118 /**
119 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
120 * letter is mapped. This implementation contains a default map for US_ENGLISH
121 */
122 private final char[] soundexMapping;
123
124 /**
125 * Should H and W be treated specially?
126 * <p>
127 * In versions of the code prior to 1.11,
128 * the code always treated H and W as silent (ignored) letters.
129 * If this field is false, H and W are no longer special-cased.
130 */
131 private final boolean specialCaseHW;
132
133 /**
134 * Creates an instance using US_ENGLISH_MAPPING
135 *
136 * @see Soundex#Soundex(char[])
137 * @see Soundex#US_ENGLISH_MAPPING
138 */
139 public Soundex() {
140 this.soundexMapping = US_ENGLISH_MAPPING;
141 this.specialCaseHW = true;
142 }
143
144 /**
145 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
146 * mapping for a non-Western character set.
147 *
148 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
149 * letter is mapped. This implementation contains a default map for US_ENGLISH
150 * <p>
151 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
152 *
153 * @param mapping
154 * Mapping array to use when finding the corresponding code for a given character
155 */
156 public Soundex(final char[] mapping) {
157 this.soundexMapping = new char[mapping.length];
158 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
159 this.specialCaseHW = !hasMarker(this.soundexMapping);
160 }
161
162 private boolean hasMarker(final char[] mapping) {
163 for(final char ch : mapping) {
164 if (ch == SILENT_MARKER) {
165 return true;
166 }
167 }
168 return false;
169 }
170
171 /**
172 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
173 * and/or possibly provide an internationalized mapping for a non-Western character set.
174 * <p>
175 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
176 *
177 * @param mapping
178 * Mapping string to use when finding the corresponding code for a given character
179 * @since 1.4
180 */
181 public Soundex(final String mapping) {
182 this.soundexMapping = mapping.toCharArray();
183 this.specialCaseHW = !hasMarker(this.soundexMapping);
184 }
185
186 /**
187 * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
188 * and/or possibly provide an internationalized mapping for a non-Western character set.
189 *
190 * @param mapping
191 * Mapping string to use when finding the corresponding code for a given character
192 * @param specialCaseHW if true, then
193 * @since 1.11
194 */
195 public Soundex(final String mapping, final boolean specialCaseHW) {
196 this.soundexMapping = mapping.toCharArray();
197 this.specialCaseHW = specialCaseHW;
198 }
199
200 /**
201 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
202 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
203 * identical values.
204 *
205 * @param s1
206 * A String that will be encoded and compared.
207 * @param s2
208 * A String that will be encoded and compared.
209 * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
210 *
211 * @see SoundexUtils#difference(StringEncoder,String,String)
212 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
213 * T-SQL DIFFERENCE </a>
214 *
215 * @throws EncoderException
216 * if an error occurs encoding one of the strings
217 * @since 1.3
218 */
219 public int difference(final String s1, final String s2) throws EncoderException {
220 return SoundexUtils.difference(this, s1, s2);
221 }
222
223 /**
224 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
225 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
226 *
227 * @param obj
228 * Object to encode
229 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
230 * supplied.
231 * @throws EncoderException
232 * if the parameter supplied is not of type java.lang.String
233 * @throws IllegalArgumentException
234 * if a character is not mapped
235 */
236 @Override
237 public Object encode(final Object obj) throws EncoderException {
238 if (!(obj instanceof String)) {
239 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
240 }
241 return soundex((String) obj);
242 }
243
244 /**
245 * Encodes a String using the soundex algorithm.
246 *
247 * @param str
248 * A String object to encode
249 * @return A Soundex code corresponding to the String supplied
250 * @throws IllegalArgumentException
251 * if a character is not mapped
252 */
253 @Override
254 public String encode(final String str) {
255 return soundex(str);
256 }
257
258 /**
259 * Returns the maxLength. Standard Soundex
260 *
261 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
262 * @return int
263 */
264 @Deprecated
265 public int getMaxLength() {
266 return this.maxLength;
267 }
268
269 /**
270 * Maps the given upper-case character to its Soundex code.
271 *
272 * @param ch
273 * An upper-case character.
274 * @return A Soundex code.
275 * @throws IllegalArgumentException
276 * Thrown if <code>ch</code> is not mapped.
277 */
278 private char map(final char ch) {
279 final int index = ch - 'A';
280 if (index < 0 || index >= this.soundexMapping.length) {
281 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
282 }
283 return this.soundexMapping[index];
284 }
285
286 /**
287 * Sets the maxLength.
288 *
289 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
290 * @param maxLength
291 * The maxLength to set
292 */
293 @Deprecated
294 public void setMaxLength(final int maxLength) {
295 this.maxLength = maxLength;
296 }
297
298 /**
299 * Retrieves the Soundex code for a given String object.
300 *
301 * @param str
302 * String to encode using the Soundex algorithm
303 * @return A soundex code for the String supplied
304 * @throws IllegalArgumentException
305 * if a character is not mapped
306 */
307 public String soundex(String str) {
308 if (str == null) {
309 return null;
310 }
311 str = SoundexUtils.clean(str);
312 if (str.length() == 0) {
313 return str;
314 }
315 final char out[] = {'0', '0', '0', '0'};
316 int count = 0;
317 final char first = str.charAt(0);
318 out[count++] = first;
319 char lastDigit = map(first); // previous digit
320 for(int i = 1; i < str.length() && count < out.length ; i++) {
321 final char ch = str.charAt(i);
322 if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
323 continue;
324 }
325 final char digit = map(ch);
326 if (digit == SILENT_MARKER) {
327 continue;
328 }
329 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
330 out[count++] = digit;
331 }
332 lastDigit = digit;
333 }
334 return new String(out);
335 }
336
337 }