001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025 * general purpose scheme to find word with similar phonemes.
026 *
027 * This class is thread-safe.
028 * Although not strictly immutable, the {@link #maxLength} field is not actually used.
029 *
030 */
031public class Soundex implements StringEncoder {
032
033    /**
034     * The marker character used to indicate a silent (ignored) character.
035     * These are ignored except when they appear as the first character.
036     * <p>
037     * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
038     * because changing it might break existing code. Mappings that don't contain
039     * a silent marker code are treated as though H and W are silent.
040     * <p>
041     * To override this, use the {@link #Soundex(String, boolean)} constructor.
042     * @since 1.11
043     */
044    public static final char SILENT_MARKER = '-';
045
046    /**
047     * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
048     * means do not encode, but treat as a separator when it occurs between consonants with the same code.
049     * <p>
050     * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
051     * up the value for the constant values page.)
052     * <p>
053     * <b>Note that letters H and W are treated specially.</b>
054     * They are ignored (after the first letter) and don't act as separators
055     * between consonants with the same code.
056     * @see #US_ENGLISH_MAPPING
057     */
058    //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
059    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
060
061    /**
062     * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
063     * means do not encode.
064     *
065     * @see Soundex#Soundex(char[])
066     */
067    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
068
069    /**
070     * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
071     * This treats H and W as silent letters.
072     * Apart from when they appear as the first letter, they are ignored.
073     * They don't act as separators between duplicate codes.
074     *
075     * @see #US_ENGLISH_MAPPING
076     * @see #US_ENGLISH_MAPPING_STRING
077     */
078    public static final Soundex US_ENGLISH = new Soundex();
079
080    /**
081     * An instance of Soundex using the Simplified Soundex mapping, as described here:
082     * http://west-penwith.org.uk/misc/soundex.htm
083     * <p>
084     * This treats H and W the same as vowels (AEIOUY).
085     * Such letters aren't encoded (after the first), but they do
086     * act as separators when dropping duplicate codes.
087     * The mapping is otherwise the same as for {@link #US_ENGLISH}
088     * <p>
089     * @since 1.11
090     */
091    public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
092
093    /**
094     * An instance of Soundex using the mapping as per the Genealogy site:
095     * http://www.genealogy.com/articles/research/00000060.html
096     * <p>
097     * This treats vowels (AEIOUY), H and W as silent letters.
098     * Such letters are ignored (after the first) and do not
099     * act as separators when dropping duplicate codes.
100     * <p>
101     * The codes for consonants are otherwise the same as for
102     * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
103     *
104     * @since 1.11
105     */
106    public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
107    //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
108
109    /**
110     * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
111     *
112     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
113     */
114    @Deprecated
115    private int maxLength = 4;
116
117    /**
118     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
119     * letter is mapped. This implementation contains a default map for US_ENGLISH
120     */
121    private final char[] soundexMapping;
122
123    /**
124     * Should H and W be treated specially?
125     * <p>
126     * In versions of the code prior to 1.11,
127     * the code always treated H and W as silent (ignored) letters.
128     * If this field is false, H and W are no longer special-cased.
129     */
130    private final boolean specialCaseHW;
131
132    /**
133     * Creates an instance using US_ENGLISH_MAPPING
134     *
135     * @see Soundex#Soundex(char[])
136     * @see Soundex#US_ENGLISH_MAPPING
137     */
138    public Soundex() {
139        this.soundexMapping = US_ENGLISH_MAPPING;
140        this.specialCaseHW = true;
141    }
142
143    /**
144     * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
145     * mapping for a non-Western character set.
146     *
147     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
148     * letter is mapped. This implementation contains a default map for US_ENGLISH
149     * <p>
150     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
151     *
152     * @param mapping
153     *                  Mapping array to use when finding the corresponding code for a given character
154     */
155    public Soundex(final char[] mapping) {
156        this.soundexMapping = new char[mapping.length];
157        System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
158        this.specialCaseHW = !hasMarker(this.soundexMapping);
159    }
160
161    private boolean hasMarker(final char[] mapping) {
162        for(final char ch : mapping) {
163            if (ch == SILENT_MARKER) {
164                return true;
165            }
166        }
167        return false;
168    }
169
170    /**
171     * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
172     * and/or possibly provide an internationalized mapping for a non-Western character set.
173     * <p>
174     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
175     *
176     * @param mapping
177     *            Mapping string to use when finding the corresponding code for a given character
178     * @since 1.4
179     */
180    public Soundex(final String mapping) {
181        this.soundexMapping = mapping.toCharArray();
182        this.specialCaseHW = !hasMarker(this.soundexMapping);
183    }
184
185    /**
186     * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
187     * and/or possibly provide an internationalized mapping for a non-Western character set.
188     *
189     * @param mapping
190     *            Mapping string to use when finding the corresponding code for a given character
191     * @param specialCaseHW if true, then
192     * @since 1.11
193     */
194    public Soundex(final String mapping, final boolean specialCaseHW) {
195        this.soundexMapping = mapping.toCharArray();
196        this.specialCaseHW = specialCaseHW;
197    }
198
199    /**
200     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
201     * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
202     * identical values.
203     *
204     * @param s1
205     *                  A String that will be encoded and compared.
206     * @param s2
207     *                  A String that will be encoded and compared.
208     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
209     *
210     * @see SoundexUtils#difference(StringEncoder,String,String)
211     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
212     *          T-SQL DIFFERENCE </a>
213     *
214     * @throws EncoderException
215     *                  if an error occurs encoding one of the strings
216     * @since 1.3
217     */
218    public int difference(final String s1, final String s2) throws EncoderException {
219        return SoundexUtils.difference(this, s1, s2);
220    }
221
222    /**
223     * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
224     * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
225     *
226     * @param obj
227     *                  Object to encode
228     * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
229     *             supplied.
230     * @throws EncoderException
231     *                  if the parameter supplied is not of type java.lang.String
232     * @throws IllegalArgumentException
233     *                  if a character is not mapped
234     */
235    @Override
236    public Object encode(final Object obj) throws EncoderException {
237        if (!(obj instanceof String)) {
238            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
239        }
240        return soundex((String) obj);
241    }
242
243    /**
244     * Encodes a String using the soundex algorithm.
245     *
246     * @param str
247     *                  A String object to encode
248     * @return A Soundex code corresponding to the String supplied
249     * @throws IllegalArgumentException
250     *                  if a character is not mapped
251     */
252    @Override
253    public String encode(final String str) {
254        return soundex(str);
255    }
256
257    /**
258     * Returns the maxLength. Standard Soundex
259     *
260     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
261     * @return int
262     */
263    @Deprecated
264    public int getMaxLength() {
265        return this.maxLength;
266    }
267
268    /**
269     * Maps the given upper-case character to its Soundex code.
270     *
271     * @param ch
272     *                  An upper-case character.
273     * @return A Soundex code.
274     * @throws IllegalArgumentException
275     *                  Thrown if <code>ch</code> is not mapped.
276     */
277    private char map(final char ch) {
278        final int index = ch - 'A';
279        if (index < 0 || index >= this.soundexMapping.length) {
280            throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
281        }
282        return this.soundexMapping[index];
283    }
284
285    /**
286     * Sets the maxLength.
287     *
288     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
289     * @param maxLength
290     *                  The maxLength to set
291     */
292    @Deprecated
293    public void setMaxLength(final int maxLength) {
294        this.maxLength = maxLength;
295    }
296
297    /**
298     * Retrieves the Soundex code for a given String object.
299     *
300     * @param str
301     *                  String to encode using the Soundex algorithm
302     * @return A soundex code for the String supplied
303     * @throws IllegalArgumentException
304     *                  if a character is not mapped
305     */
306    public String soundex(String str) {
307        if (str == null) {
308            return null;
309        }
310        str = SoundexUtils.clean(str);
311        if (str.length() == 0) {
312            return str;
313        }
314        final char out[] = {'0', '0', '0', '0'};
315        int count = 0;
316        final char first = str.charAt(0);
317        out[count++] = first;
318        char lastDigit = map(first); // previous digit
319        for(int i = 1; i < str.length() && count < out.length ; i++) {
320            final char ch = str.charAt(i);
321            if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
322                continue;
323            }
324            final char digit = map(ch);
325            if (digit == SILENT_MARKER) {
326                continue;
327            }
328            if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
329                out[count++] = digit;
330            }
331            lastDigit = digit;
332        }
333        return new String(out);
334    }
335
336}