View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
25   * general purpose scheme to find word with similar phonemes.
26   *
27   * This class is thread-safe.
28   * Although not strictly immutable, the {@link #maxLength} field is not actually used.
29   *
30   * @version $Id: Soundex.java 1811347 2017-10-06 15:21:18Z ggregory $
31   */
32  public class Soundex implements StringEncoder {
33  
34      /**
35       * The marker character used to indicate a silent (ignored) character.
36       * These are ignored except when they appear as the first character.
37       * <p>
38       * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
39       * because changing it might break existing code. Mappings that don't contain
40       * a silent marker code are treated as though H and W are silent.
41       * <p>
42       * To override this, use the {@link #Soundex(String, boolean)} constructor.
43       * @since 1.11
44       */
45      public static final char SILENT_MARKER = '-';
46  
47      /**
48       * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
49       * means do not encode, but treat as a separator when it occurs between consonants with the same code.
50       * <p>
51       * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
52       * up the value for the constant values page.)
53       * <p>
54       * <b>Note that letters H and W are treated specially.</b>
55       * They are ignored (after the first letter) and don't act as separators
56       * between consonants with the same code.
57       * @see #US_ENGLISH_MAPPING
58       */
59      //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
60      public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
61  
62      /**
63       * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
64       * means do not encode.
65       *
66       * @see Soundex#Soundex(char[])
67       */
68      private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
69  
70      /**
71       * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
72       * This treats H and W as silent letters.
73       * Apart from when they appear as the first letter, they are ignored.
74       * They don't act as separators between duplicate codes.
75       *
76       * @see #US_ENGLISH_MAPPING
77       * @see #US_ENGLISH_MAPPING_STRING
78       */
79      public static final Soundex US_ENGLISH = new Soundex();
80  
81      /**
82       * An instance of Soundex using the Simplified Soundex mapping, as described here:
83       * http://west-penwith.org.uk/misc/soundex.htm
84       * <p>
85       * This treats H and W the same as vowels (AEIOUY).
86       * Such letters aren't encoded (after the first), but they do
87       * act as separators when dropping duplicate codes.
88       * The mapping is otherwise the same as for {@link #US_ENGLISH}
89       * <p>
90       * @since 1.11
91       */
92      public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
93  
94      /**
95       * An instance of Soundex using the mapping as per the Genealogy site:
96       * http://www.genealogy.com/articles/research/00000060.html
97       * <p>
98       * This treats vowels (AEIOUY), H and W as silent letters.
99       * Such letters are ignored (after the first) and do not
100      * act as separators when dropping duplicate codes.
101      * <p>
102      * The codes for consonants are otherwise the same as for
103      * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
104      *
105      * @since 1.11
106      */
107     public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
108     //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
109 
110     /**
111      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
112      *
113      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
114      */
115     @Deprecated
116     private int maxLength = 4;
117 
118     /**
119      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
120      * letter is mapped. This implementation contains a default map for US_ENGLISH
121      */
122     private final char[] soundexMapping;
123 
124     /**
125      * Should H and W be treated specially?
126      * <p>
127      * In versions of the code prior to 1.11,
128      * the code always treated H and W as silent (ignored) letters.
129      * If this field is false, H and W are no longer special-cased.
130      */
131     private final boolean specialCaseHW;
132 
133     /**
134      * Creates an instance using US_ENGLISH_MAPPING
135      *
136      * @see Soundex#Soundex(char[])
137      * @see Soundex#US_ENGLISH_MAPPING
138      */
139     public Soundex() {
140         this.soundexMapping = US_ENGLISH_MAPPING;
141         this.specialCaseHW = true;
142     }
143 
144     /**
145      * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
146      * mapping for a non-Western character set.
147      *
148      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
149      * letter is mapped. This implementation contains a default map for US_ENGLISH
150      * <p>
151      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
152      *
153      * @param mapping
154      *                  Mapping array to use when finding the corresponding code for a given character
155      */
156     public Soundex(final char[] mapping) {
157         this.soundexMapping = new char[mapping.length];
158         System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
159         this.specialCaseHW = !hasMarker(this.soundexMapping);
160     }
161 
162     private boolean hasMarker(final char[] mapping) {
163         for(final char ch : mapping) {
164             if (ch == SILENT_MARKER) {
165                 return true;
166             }
167         }
168         return false;
169     }
170 
171     /**
172      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
173      * and/or possibly provide an internationalized mapping for a non-Western character set.
174      * <p>
175      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
176      *
177      * @param mapping
178      *            Mapping string to use when finding the corresponding code for a given character
179      * @since 1.4
180      */
181     public Soundex(final String mapping) {
182         this.soundexMapping = mapping.toCharArray();
183         this.specialCaseHW = !hasMarker(this.soundexMapping);
184     }
185 
186     /**
187      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
188      * and/or possibly provide an internationalized mapping for a non-Western character set.
189      *
190      * @param mapping
191      *            Mapping string to use when finding the corresponding code for a given character
192      * @param specialCaseHW if true, then
193      * @since 1.11
194      */
195     public Soundex(final String mapping, final boolean specialCaseHW) {
196         this.soundexMapping = mapping.toCharArray();
197         this.specialCaseHW = specialCaseHW;
198     }
199 
200     /**
201      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
202      * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
203      * identical values.
204      *
205      * @param s1
206      *                  A String that will be encoded and compared.
207      * @param s2
208      *                  A String that will be encoded and compared.
209      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
210      *
211      * @see SoundexUtils#difference(StringEncoder,String,String)
212      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
213      *          T-SQL DIFFERENCE </a>
214      *
215      * @throws EncoderException
216      *                  if an error occurs encoding one of the strings
217      * @since 1.3
218      */
219     public int difference(final String s1, final String s2) throws EncoderException {
220         return SoundexUtils.difference(this, s1, s2);
221     }
222 
223     /**
224      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
225      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
226      *
227      * @param obj
228      *                  Object to encode
229      * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
230      *             supplied.
231      * @throws EncoderException
232      *                  if the parameter supplied is not of type java.lang.String
233      * @throws IllegalArgumentException
234      *                  if a character is not mapped
235      */
236     @Override
237     public Object encode(final Object obj) throws EncoderException {
238         if (!(obj instanceof String)) {
239             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
240         }
241         return soundex((String) obj);
242     }
243 
244     /**
245      * Encodes a String using the soundex algorithm.
246      *
247      * @param str
248      *                  A String object to encode
249      * @return A Soundex code corresponding to the String supplied
250      * @throws IllegalArgumentException
251      *                  if a character is not mapped
252      */
253     @Override
254     public String encode(final String str) {
255         return soundex(str);
256     }
257 
258     /**
259      * Returns the maxLength. Standard Soundex
260      *
261      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
262      * @return int
263      */
264     @Deprecated
265     public int getMaxLength() {
266         return this.maxLength;
267     }
268 
269     /**
270      * Maps the given upper-case character to its Soundex code.
271      *
272      * @param ch
273      *                  An upper-case character.
274      * @return A Soundex code.
275      * @throws IllegalArgumentException
276      *                  Thrown if <code>ch</code> is not mapped.
277      */
278     private char map(final char ch) {
279         final int index = ch - 'A';
280         if (index < 0 || index >= this.soundexMapping.length) {
281             throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
282         }
283         return this.soundexMapping[index];
284     }
285 
286     /**
287      * Sets the maxLength.
288      *
289      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
290      * @param maxLength
291      *                  The maxLength to set
292      */
293     @Deprecated
294     public void setMaxLength(final int maxLength) {
295         this.maxLength = maxLength;
296     }
297 
298     /**
299      * Retrieves the Soundex code for a given String object.
300      *
301      * @param str
302      *                  String to encode using the Soundex algorithm
303      * @return A soundex code for the String supplied
304      * @throws IllegalArgumentException
305      *                  if a character is not mapped
306      */
307     public String soundex(String str) {
308         if (str == null) {
309             return null;
310         }
311         str = SoundexUtils.clean(str);
312         if (str.length() == 0) {
313             return str;
314         }
315         final char out[] = {'0', '0', '0', '0'};
316         int count = 0;
317         final char first = str.charAt(0);
318         out[count++] = first;
319         char lastDigit = map(first); // previous digit
320         for(int i = 1; i < str.length() && count < out.length ; i++) {
321             final char ch = str.charAt(i);
322             if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
323                 continue;
324             }
325             final char digit = map(ch);
326             if (digit == SILENT_MARKER) {
327                 continue;
328             }
329             if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
330                 out[count++] = digit;
331             }
332             lastDigit = digit;
333         }
334         return new String(out);
335     }
336 
337 }