001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025 * general purpose scheme to find word with similar phonemes.
026 *
027 * This class is thread-safe.
028 * Although not strictly immutable, the {@link #maxLength} field is not actually used.
029 *
030 * @version $Id: Soundex.java 1811347 2017-10-06 15:21:18Z ggregory $
031 */
032public class Soundex implements StringEncoder {
033
034    /**
035     * The marker character used to indicate a silent (ignored) character.
036     * These are ignored except when they appear as the first character.
037     * <p>
038     * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
039     * because changing it might break existing code. Mappings that don't contain
040     * a silent marker code are treated as though H and W are silent.
041     * <p>
042     * To override this, use the {@link #Soundex(String, boolean)} constructor.
043     * @since 1.11
044     */
045    public static final char SILENT_MARKER = '-';
046
047    /**
048     * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
049     * means do not encode, but treat as a separator when it occurs between consonants with the same code.
050     * <p>
051     * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
052     * up the value for the constant values page.)
053     * <p>
054     * <b>Note that letters H and W are treated specially.</b>
055     * They are ignored (after the first letter) and don't act as separators
056     * between consonants with the same code.
057     * @see #US_ENGLISH_MAPPING
058     */
059    //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
060    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
061
062    /**
063     * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
064     * means do not encode.
065     *
066     * @see Soundex#Soundex(char[])
067     */
068    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
069
070    /**
071     * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
072     * This treats H and W as silent letters.
073     * Apart from when they appear as the first letter, they are ignored.
074     * They don't act as separators between duplicate codes.
075     *
076     * @see #US_ENGLISH_MAPPING
077     * @see #US_ENGLISH_MAPPING_STRING
078     */
079    public static final Soundex US_ENGLISH = new Soundex();
080
081    /**
082     * An instance of Soundex using the Simplified Soundex mapping, as described here:
083     * http://west-penwith.org.uk/misc/soundex.htm
084     * <p>
085     * This treats H and W the same as vowels (AEIOUY).
086     * Such letters aren't encoded (after the first), but they do
087     * act as separators when dropping duplicate codes.
088     * The mapping is otherwise the same as for {@link #US_ENGLISH}
089     * <p>
090     * @since 1.11
091     */
092    public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
093
094    /**
095     * An instance of Soundex using the mapping as per the Genealogy site:
096     * http://www.genealogy.com/articles/research/00000060.html
097     * <p>
098     * This treats vowels (AEIOUY), H and W as silent letters.
099     * Such letters are ignored (after the first) and do not
100     * act as separators when dropping duplicate codes.
101     * <p>
102     * The codes for consonants are otherwise the same as for
103     * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
104     *
105     * @since 1.11
106     */
107    public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
108    //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
109
110    /**
111     * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
112     *
113     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
114     */
115    @Deprecated
116    private int maxLength = 4;
117
118    /**
119     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
120     * letter is mapped. This implementation contains a default map for US_ENGLISH
121     */
122    private final char[] soundexMapping;
123
124    /**
125     * Should H and W be treated specially?
126     * <p>
127     * In versions of the code prior to 1.11,
128     * the code always treated H and W as silent (ignored) letters.
129     * If this field is false, H and W are no longer special-cased.
130     */
131    private final boolean specialCaseHW;
132
133    /**
134     * Creates an instance using US_ENGLISH_MAPPING
135     *
136     * @see Soundex#Soundex(char[])
137     * @see Soundex#US_ENGLISH_MAPPING
138     */
139    public Soundex() {
140        this.soundexMapping = US_ENGLISH_MAPPING;
141        this.specialCaseHW = true;
142    }
143
144    /**
145     * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
146     * mapping for a non-Western character set.
147     *
148     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
149     * letter is mapped. This implementation contains a default map for US_ENGLISH
150     * <p>
151     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
152     *
153     * @param mapping
154     *                  Mapping array to use when finding the corresponding code for a given character
155     */
156    public Soundex(final char[] mapping) {
157        this.soundexMapping = new char[mapping.length];
158        System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
159        this.specialCaseHW = !hasMarker(this.soundexMapping);
160    }
161
162    private boolean hasMarker(final char[] mapping) {
163        for(final char ch : mapping) {
164            if (ch == SILENT_MARKER) {
165                return true;
166            }
167        }
168        return false;
169    }
170
171    /**
172     * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
173     * and/or possibly provide an internationalized mapping for a non-Western character set.
174     * <p>
175     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
176     *
177     * @param mapping
178     *            Mapping string to use when finding the corresponding code for a given character
179     * @since 1.4
180     */
181    public Soundex(final String mapping) {
182        this.soundexMapping = mapping.toCharArray();
183        this.specialCaseHW = !hasMarker(this.soundexMapping);
184    }
185
186    /**
187     * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
188     * and/or possibly provide an internationalized mapping for a non-Western character set.
189     *
190     * @param mapping
191     *            Mapping string to use when finding the corresponding code for a given character
192     * @param specialCaseHW if true, then
193     * @since 1.11
194     */
195    public Soundex(final String mapping, final boolean specialCaseHW) {
196        this.soundexMapping = mapping.toCharArray();
197        this.specialCaseHW = specialCaseHW;
198    }
199
200    /**
201     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
202     * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
203     * identical values.
204     *
205     * @param s1
206     *                  A String that will be encoded and compared.
207     * @param s2
208     *                  A String that will be encoded and compared.
209     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
210     *
211     * @see SoundexUtils#difference(StringEncoder,String,String)
212     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
213     *          T-SQL DIFFERENCE </a>
214     *
215     * @throws EncoderException
216     *                  if an error occurs encoding one of the strings
217     * @since 1.3
218     */
219    public int difference(final String s1, final String s2) throws EncoderException {
220        return SoundexUtils.difference(this, s1, s2);
221    }
222
223    /**
224     * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
225     * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
226     *
227     * @param obj
228     *                  Object to encode
229     * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
230     *             supplied.
231     * @throws EncoderException
232     *                  if the parameter supplied is not of type java.lang.String
233     * @throws IllegalArgumentException
234     *                  if a character is not mapped
235     */
236    @Override
237    public Object encode(final Object obj) throws EncoderException {
238        if (!(obj instanceof String)) {
239            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
240        }
241        return soundex((String) obj);
242    }
243
244    /**
245     * Encodes a String using the soundex algorithm.
246     *
247     * @param str
248     *                  A String object to encode
249     * @return A Soundex code corresponding to the String supplied
250     * @throws IllegalArgumentException
251     *                  if a character is not mapped
252     */
253    @Override
254    public String encode(final String str) {
255        return soundex(str);
256    }
257
258    /**
259     * Returns the maxLength. Standard Soundex
260     *
261     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
262     * @return int
263     */
264    @Deprecated
265    public int getMaxLength() {
266        return this.maxLength;
267    }
268
269    /**
270     * Maps the given upper-case character to its Soundex code.
271     *
272     * @param ch
273     *                  An upper-case character.
274     * @return A Soundex code.
275     * @throws IllegalArgumentException
276     *                  Thrown if <code>ch</code> is not mapped.
277     */
278    private char map(final char ch) {
279        final int index = ch - 'A';
280        if (index < 0 || index >= this.soundexMapping.length) {
281            throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
282        }
283        return this.soundexMapping[index];
284    }
285
286    /**
287     * Sets the maxLength.
288     *
289     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
290     * @param maxLength
291     *                  The maxLength to set
292     */
293    @Deprecated
294    public void setMaxLength(final int maxLength) {
295        this.maxLength = maxLength;
296    }
297
298    /**
299     * Retrieves the Soundex code for a given String object.
300     *
301     * @param str
302     *                  String to encode using the Soundex algorithm
303     * @return A soundex code for the String supplied
304     * @throws IllegalArgumentException
305     *                  if a character is not mapped
306     */
307    public String soundex(String str) {
308        if (str == null) {
309            return null;
310        }
311        str = SoundexUtils.clean(str);
312        if (str.length() == 0) {
313            return str;
314        }
315        final char out[] = {'0', '0', '0', '0'};
316        int count = 0;
317        final char first = str.charAt(0);
318        out[count++] = first;
319        char lastDigit = map(first); // previous digit
320        for(int i = 1; i < str.length() && count < out.length ; i++) {
321            final char ch = str.charAt(i);
322            if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
323                continue;
324            }
325            final char digit = map(ch);
326            if (digit == SILENT_MARKER) {
327                continue;
328            }
329            if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
330                out[count++] = digit;
331            }
332            lastDigit = digit;
333        }
334        return new String(out);
335    }
336
337}