1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
25 * general purpose scheme to find word with similar phonemes.
26 *
27 * <p>This class is thread-safe.
28 * Although not strictly immutable, the mutable fields are not actually used.</p>
29 */
30 public class Soundex implements StringEncoder {
31
32 /**
33 * The marker character used to indicate a silent (ignored) character.
34 * These are ignored except when they appear as the first character.
35 * <p>
36 * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
37 * because changing it might break existing code. Mappings that don't contain
38 * a silent marker code are treated as though H and W are silent.
39 * </p>
40 * <p>
41 * To override this, use the {@link #Soundex(String, boolean)} constructor.
42 * </p>
43 *
44 * @since 1.11
45 */
46 public static final char SILENT_MARKER = '-';
47
48 /**
49 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
50 * means do not encode, but treat as a separator when it occurs between consonants with the same code.
51 * <p>
52 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
53 * up the value for the constant values page.)
54 * </p>
55 * <p>
56 * <strong>Note that letters H and W are treated specially.</strong>
57 * They are ignored (after the first letter) and don't act as separators
58 * between consonants with the same code.
59 * </p>
60 */
61 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
62
63 /**
64 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
65 * means do not encode.
66 *
67 * @see Soundex#Soundex(char[])
68 */
69 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
70
71 /**
72 * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
73 * This treats H and W as silent letters.
74 * Apart from when they appear as the first letter, they are ignored.
75 * They don't act as separators between duplicate codes.
76 *
77 * @see #US_ENGLISH_MAPPING_STRING
78 */
79 public static final Soundex US_ENGLISH = new Soundex();
80
81 /**
82 * An instance of Soundex using the Simplified Soundex mapping, as described here:
83 * http://west-penwith.org.uk/misc/soundex.htm
84 * <p>
85 * This treats H and W the same as vowels (AEIOUY).
86 * Such letters aren't encoded (after the first), but they do
87 * act as separators when dropping duplicate codes.
88 * The mapping is otherwise the same as for {@link #US_ENGLISH}
89 * </p>
90 *
91 * @since 1.11
92 */
93 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
94
95 /**
96 * An instance of Soundex using the mapping as per the Genealogy site:
97 * http://www.genealogy.com/articles/research/00000060.html
98 * <p>
99 * This treats vowels (AEIOUY), H and W as silent letters.
100 * Such letters are ignored (after the first) and do not
101 * act as separators when dropping duplicate codes.
102 * </p>
103 * <p>
104 * The codes for consonants are otherwise the same as for
105 * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
106 * </p>
107 *
108 * @since 1.11
109 */
110 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
111 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
112
113 /**
114 * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
115 *
116 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
117 */
118 @Deprecated
119 private int maxLength = 4;
120
121 /**
122 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
123 * letter is mapped. This implementation contains a default map for US_ENGLISH
124 */
125 private final char[] soundexMapping;
126
127 /**
128 * Should H and W be treated specially?
129 * <p>
130 * In versions of the code prior to 1.11,
131 * the code always treated H and W as silent (ignored) letters.
132 * If this field is false, H and W are no longer special-cased.
133 * </p>
134 */
135 private final boolean specialCaseHW;
136
137 /**
138 * Creates an instance using US_ENGLISH_MAPPING.
139 *
140 * @see Soundex#Soundex(char[])
141 * @see Soundex#US_ENGLISH_MAPPING_STRING
142 */
143 public Soundex() {
144 this.soundexMapping = US_ENGLISH_MAPPING;
145 this.specialCaseHW = true;
146 }
147
148 /**
149 * Creates a Soundex instance using the given mapping. This constructor can be used to provide an internationalized
150 * mapping for a non-Western character set.
151 * <p>
152 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
153 * letter is mapped. This implementation contains a default map for US_ENGLISH
154 * </p>
155 * <p>
156 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
157 * </p>
158 *
159 * @param mapping
160 * Mapping array to use when finding the corresponding code for a given character.
161 */
162 public Soundex(final char[] mapping) {
163 this.soundexMapping = mapping.clone();
164 this.specialCaseHW = !hasMarker(this.soundexMapping);
165 }
166
167 /**
168 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
169 * and/or possibly provide an internationalized mapping for a non-Western character set.
170 * <p>
171 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
172 * </p>
173 *
174 * @param mapping
175 * Mapping string to use when finding the corresponding code for a given character.
176 * @since 1.4
177 */
178 public Soundex(final String mapping) {
179 this.soundexMapping = mapping.toCharArray();
180 this.specialCaseHW = !hasMarker(this.soundexMapping);
181 }
182
183 /**
184 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
185 * and/or possibly provide an internationalized mapping for a non-Western character set.
186 *
187 * @param mapping
188 * Mapping string to use when finding the corresponding code for a given character.
189 * @param specialCaseHW if true, then
190 * @since 1.11
191 */
192 public Soundex(final String mapping, final boolean specialCaseHW) {
193 this.soundexMapping = mapping.toCharArray();
194 this.specialCaseHW = specialCaseHW;
195 }
196
197 /**
198 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This return value ranges from 0 through 4: 0
199 * indicates little or no similarity, and 4 indicates strong similarity or identical values.
200 *
201 * @param s1 A String that will be encoded and compared.
202 * @param s2 A String that will be encoded and compared.
203 * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
204 * @see SoundexUtils#difference(StringEncoder,String,String)
205 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS T-SQL DIFFERENCE</a>
206 *
207 * @throws EncoderException if an error occurs encoding one of the strings.
208 * @since 1.3
209 */
210 public int difference(final String s1, final String s2) throws EncoderException {
211 return SoundexUtils.difference(this, s1, s2);
212 }
213
214 /**
215 * Encodes an Object using the Soundex algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
216 * EncoderException if the supplied object is not of type {@link String}.
217 *
218 * @param obj Object to encode.
219 * @return An object (or type {@link String}) containing the Soundex code which corresponds to the String supplied.
220 * @throws EncoderException if the parameter supplied is not of type {@link String}.
221 * @throws IllegalArgumentException if a character is not mapped.
222 */
223 @Override
224 public Object encode(final Object obj) throws EncoderException {
225 if (!(obj instanceof String)) {
226 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
227 }
228 return soundex((String) obj);
229 }
230
231 /**
232 * Encodes a String using the Soundex algorithm.
233 *
234 * @param str A String object to encode.
235 * @return A Soundex code corresponding to the String supplied.
236 * @throws IllegalArgumentException if a character is not mapped.
237 */
238 @Override
239 public String encode(final String str) {
240 return soundex(str);
241 }
242
243 /**
244 * Returns the maxLength. Standard Soundex
245 *
246 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
247 * @return the maxLength.
248 */
249 @Deprecated
250 public int getMaxLength() {
251 return this.maxLength;
252 }
253
254 private boolean hasMarker(final char[] mapping) {
255 for (final char ch : mapping) {
256 if (ch == SILENT_MARKER) {
257 return true;
258 }
259 }
260 return false;
261 }
262
263 /**
264 * Maps the given upper-case character to its Soundex code.
265 *
266 * @param ch
267 * An upper-case character.
268 * @return A Soundex code.
269 * @throws IllegalArgumentException
270 * Thrown if {@code ch} is not mapped.
271 */
272 private char map(final char ch) {
273 final int index = ch - 'A';
274 if (index < 0 || index >= this.soundexMapping.length) {
275 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
276 }
277 return this.soundexMapping[index];
278 }
279
280 /**
281 * Sets the maxLength.
282 *
283 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
284 * @param maxLength
285 * The maxLength to set.
286 */
287 @Deprecated
288 public void setMaxLength(final int maxLength) {
289 this.maxLength = maxLength;
290 }
291
292 /**
293 * Retrieves the Soundex code for a given String object.
294 *
295 * @param str String to encode using the Soundex algorithm.
296 * @return A Soundex code for the String supplied.
297 * @throws IllegalArgumentException if a character is not mapped.
298 */
299 public String soundex(String str) {
300 if (str == null) {
301 return null;
302 }
303 str = SoundexUtils.clean(str);
304 if (str.isEmpty()) {
305 return str;
306 }
307 final char[] out = { '0', '0', '0', '0' };
308 int count = 0;
309 final char first = str.charAt(0);
310 out[count++] = first;
311 char lastDigit = map(first); // previous digit
312 for (int i = 1; i < str.length() && count < out.length; i++) {
313 final char ch = str.charAt(i);
314 if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
315 continue;
316 }
317 final char digit = map(ch);
318 if (digit == SILENT_MARKER) {
319 continue;
320 }
321 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
322 out[count++] = digit;
323 }
324 lastDigit = digit;
325 }
326 return new String(out);
327 }
328
329 }