/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
25   * general purpose scheme to find word with similar phonemes.
26   *
27   * <p>This class is thread-safe.
28   * Although not strictly immutable, the mutable fields are not actually used.</p>
29   */
30  public class Soundex implements StringEncoder {
31  
32      /**
33       * The marker character used to indicate a silent (ignored) character.
34       * These are ignored except when they appear as the first character.
35       * <p>
36       * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
37       * because changing it might break existing code. Mappings that don't contain
38       * a silent marker code are treated as though H and W are silent.
39       * </p>
40       * <p>
41       * To override this, use the {@link #Soundex(String, boolean)} constructor.
42       * </p>
43       *
44       * @since 1.11
45       */
46      public static final char SILENT_MARKER = '-';
47  
48      /**
49       * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
50       * means do not encode, but treat as a separator when it occurs between consonants with the same code.
51       * <p>
52       * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
53       * up the value for the constant values page.)
54       * </p>
55       * <p>
56       * <b>Note that letters H and W are treated specially.</b>
57       * They are ignored (after the first letter) and don't act as separators
58       * between consonants with the same code.
59       * </p>
60       */
61      public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
62  
63      /**
64       * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
65       * means do not encode.
66       *
67       * @see Soundex#Soundex(char[])
68       */
69      private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
70  
71      /**
72       * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
73       * This treats H and W as silent letters.
74       * Apart from when they appear as the first letter, they are ignored.
75       * They don't act as separators between duplicate codes.
76       *
77       * @see #US_ENGLISH_MAPPING_STRING
78       */
79      public static final Soundex US_ENGLISH = new Soundex();
80  
81      /**
82       * An instance of Soundex using the Simplified Soundex mapping, as described here:
83       * http://west-penwith.org.uk/misc/soundex.htm
84       * <p>
85       * This treats H and W the same as vowels (AEIOUY).
86       * Such letters aren't encoded (after the first), but they do
87       * act as separators when dropping duplicate codes.
88       * The mapping is otherwise the same as for {@link #US_ENGLISH}
89       * </p>
90       *
91       * @since 1.11
92       */
93      public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
94  
95      /**
96       * An instance of Soundex using the mapping as per the Genealogy site:
97       * http://www.genealogy.com/articles/research/00000060.html
98       * <p>
99       * This treats vowels (AEIOUY), H and W as silent letters.
100      * Such letters are ignored (after the first) and do not
101      * act as separators when dropping duplicate codes.
102      * </p>
103      * <p>
104      * The codes for consonants are otherwise the same as for
105      * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
106      * </p>
107      *
108      * @since 1.11
109      */
110     public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
111     //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
112 
113     /**
114      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
115      *
116      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
117      */
118     @Deprecated
119     private int maxLength = 4;
120 
121     /**
122      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
123      * letter is mapped. This implementation contains a default map for US_ENGLISH
124      */
125     private final char[] soundexMapping;
126 
127     /**
128      * Should H and W be treated specially?
129      * <p>
130      * In versions of the code prior to 1.11,
131      * the code always treated H and W as silent (ignored) letters.
132      * If this field is false, H and W are no longer special-cased.
133      * </p>
134      */
135     private final boolean specialCaseHW;
136 
137     /**
138      * Creates an instance using US_ENGLISH_MAPPING
139      *
140      * @see Soundex#Soundex(char[])
141      * @see Soundex#US_ENGLISH_MAPPING_STRING
142      */
143     public Soundex() {
144         this.soundexMapping = US_ENGLISH_MAPPING;
145         this.specialCaseHW = true;
146     }
147 
148     /**
149      * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
150      * mapping for a non-Western character set.
151      * <p>
152      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
153      * letter is mapped. This implementation contains a default map for US_ENGLISH
154      * </p>
155      * <p>
156      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
157      * </p>
158      *
159      * @param mapping
160      *                  Mapping array to use when finding the corresponding code for a given character
161      */
162     public Soundex(final char[] mapping) {
163         this.soundexMapping = mapping.clone();
164         this.specialCaseHW = !hasMarker(this.soundexMapping);
165     }
166 
167     /**
168      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
169      * and/or possibly provide an internationalized mapping for a non-Western character set.
170      * <p>
171      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
172      * </p>
173      *
174      * @param mapping
175      *            Mapping string to use when finding the corresponding code for a given character
176      * @since 1.4
177      */
178     public Soundex(final String mapping) {
179         this.soundexMapping = mapping.toCharArray();
180         this.specialCaseHW = !hasMarker(this.soundexMapping);
181     }
182 
183     /**
184      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
185      * and/or possibly provide an internationalized mapping for a non-Western character set.
186      *
187      * @param mapping
188      *            Mapping string to use when finding the corresponding code for a given character
189      * @param specialCaseHW if true, then
190      * @since 1.11
191      */
192     public Soundex(final String mapping, final boolean specialCaseHW) {
193         this.soundexMapping = mapping.toCharArray();
194         this.specialCaseHW = specialCaseHW;
195     }
196 
197     /**
198      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
199      * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
200      * identical values.
201      *
202      * @param s1
203      *                  A String that will be encoded and compared.
204      * @param s2
205      *                  A String that will be encoded and compared.
206      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
207      *
208      * @see SoundexUtils#difference(StringEncoder,String,String)
209      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
210      *          T-SQL DIFFERENCE</a>
211      *
212      * @throws EncoderException
213      *                  if an error occurs encoding one of the strings
214      * @since 1.3
215      */
216     public int difference(final String s1, final String s2) throws EncoderException {
217         return SoundexUtils.difference(this, s1, s2);
218     }
219 
220     /**
221      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
222      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type {@link String}.
223      *
224      * @param obj
225      *                  Object to encode
226      * @return An object (or type {@link String}) containing the soundex code which corresponds to the String
227      *             supplied.
228      * @throws EncoderException
229      *                  if the parameter supplied is not of type {@link String}
230      * @throws IllegalArgumentException
231      *                  if a character is not mapped
232      */
233     @Override
234     public Object encode(final Object obj) throws EncoderException {
235         if (!(obj instanceof String)) {
236             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
237         }
238         return soundex((String) obj);
239     }
240 
241     /**
242      * Encodes a String using the soundex algorithm.
243      *
244      * @param str
245      *                  A String object to encode
246      * @return A Soundex code corresponding to the String supplied
247      * @throws IllegalArgumentException
248      *                  if a character is not mapped
249      */
250     @Override
251     public String encode(final String str) {
252         return soundex(str);
253     }
254 
255     /**
256      * Returns the maxLength. Standard Soundex
257      *
258      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
259      * @return int
260      */
261     @Deprecated
262     public int getMaxLength() {
263         return this.maxLength;
264     }
265 
266     private boolean hasMarker(final char[] mapping) {
267         for (final char ch : mapping) {
268             if (ch == SILENT_MARKER) {
269                 return true;
270             }
271         }
272         return false;
273     }
274 
275     /**
276      * Maps the given upper-case character to its Soundex code.
277      *
278      * @param ch
279      *                  An upper-case character.
280      * @return A Soundex code.
281      * @throws IllegalArgumentException
282      *                  Thrown if {@code ch} is not mapped.
283      */
284     private char map(final char ch) {
285         final int index = ch - 'A';
286         if (index < 0 || index >= this.soundexMapping.length) {
287             throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
288         }
289         return this.soundexMapping[index];
290     }
291 
292     /**
293      * Sets the maxLength.
294      *
295      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
296      * @param maxLength
297      *                  The maxLength to set
298      */
299     @Deprecated
300     public void setMaxLength(final int maxLength) {
301         this.maxLength = maxLength;
302     }
303 
304     /**
305      * Retrieves the Soundex code for a given String object.
306      *
307      * @param str
308      *                  String to encode using the Soundex algorithm
309      * @return A soundex code for the String supplied
310      * @throws IllegalArgumentException
311      *                  if a character is not mapped
312      */
313     public String soundex(String str) {
314         if (str == null) {
315             return null;
316         }
317         str = SoundexUtils.clean(str);
318         if (str.isEmpty()) {
319             return str;
320         }
321         final char[] out = { '0', '0', '0', '0' };
322         int count = 0;
323         final char first = str.charAt(0);
324         out[count++] = first;
325         char lastDigit = map(first); // previous digit
326         for (int i = 1; i < str.length() && count < out.length; i++) {
327             final char ch = str.charAt(i);
328             if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
329                 continue;
330             }
331             final char digit = map(ch);
332             if (digit == SILENT_MARKER) {
333                 continue;
334             }
335             if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
336                 out[count++] = digit;
337             }
338             lastDigit = digit;
339         }
340         return new String(out);
341     }
342 
343 }