001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
024     * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025     * general purpose scheme to find word with similar phonemes.
026     * 
027     * @author Apache Software Foundation
028     * @version $Id: Soundex.java 1201529 2011-11-13 21:57:16Z ggregory $
029     */
030    public class Soundex implements StringEncoder {
031    
032        /**
033         * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
034         * means do not encode.
035         * <p>
036         * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
037         * up the value for the constant values page.)
038         * </p>
039         * 
040         * @see #US_ENGLISH_MAPPING
041         */
042        public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
043    
044        /**
045         * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
046         * means do not encode.
047         * 
048         * @see Soundex#Soundex(char[])
049         */
050        private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
051    
052        /**
053         * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
054         * 
055         * @see #US_ENGLISH_MAPPING
056         */
057        public static final Soundex US_ENGLISH = new Soundex();
058    
059        /**
060         * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
061         * 
062         * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
063         */
064        private int maxLength = 4;
065    
066        /**
067         * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
068         * letter is mapped. This implementation contains a default map for US_ENGLISH
069         */
070        private final char[] soundexMapping;
071    
072        /**
073         * Creates an instance using US_ENGLISH_MAPPING
074         * 
075         * @see Soundex#Soundex(char[])
076         * @see Soundex#US_ENGLISH_MAPPING
077         */
078        public Soundex() {
079            this.soundexMapping = US_ENGLISH_MAPPING;
080        }
081    
082        /**
083         * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
084         * mapping for a non-Western character set.
085         * 
086         * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
087         * letter is mapped. This implementation contains a default map for US_ENGLISH
088         * 
089         * @param mapping
090         *                  Mapping array to use when finding the corresponding code for a given character
091         */
092        public Soundex(char[] mapping) {
093            this.soundexMapping = new char[mapping.length];
094            System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
095        }
096    
097        /**
098         * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
099         * and/or possibly provide an internationalized mapping for a non-Western character set.
100         * 
101         * @param mapping
102         *            Mapping string to use when finding the corresponding code for a given character
103         * @since 1.4
104         */
105        public Soundex(String mapping) {
106            this.soundexMapping = mapping.toCharArray();
107        }
108    
109        /**
110         * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
111         * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
112         * identical values.
113         * 
114         * @param s1
115         *                  A String that will be encoded and compared.
116         * @param s2
117         *                  A String that will be encoded and compared.
118         * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
119         * 
120         * @see SoundexUtils#difference(StringEncoder,String,String)
121         * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
122         *          T-SQL DIFFERENCE </a>
123         * 
124         * @throws EncoderException
125         *                  if an error occurs encoding one of the strings
126         * @since 1.3
127         */
128        public int difference(String s1, String s2) throws EncoderException {
129            return SoundexUtils.difference(this, s1, s2);
130        }
131    
132        /**
133         * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
134         * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
135         * 
136         * @param pObject
137         *                  Object to encode
138         * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
139         *             supplied.
140         * @throws EncoderException
141         *                  if the parameter supplied is not of type java.lang.String
142         * @throws IllegalArgumentException
143         *                  if a character is not mapped
144         */
145        public Object encode(Object pObject) throws EncoderException {
146            if (!(pObject instanceof String)) {
147                throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
148            }
149            return soundex((String) pObject);
150        }
151    
152        /**
153         * Encodes a String using the soundex algorithm.
154         * 
155         * @param pString
156         *                  A String object to encode
157         * @return A Soundex code corresponding to the String supplied
158         * @throws IllegalArgumentException
159         *                  if a character is not mapped
160         */
161        public String encode(String pString) {
162            return soundex(pString);
163        }
164    
165        /**
166         * Used internally by the SoundEx algorithm.
167         * 
168         * Consonants from the same code group separated by W or H are treated as one.
169         * 
170         * @param str
171         *                  the cleaned working string to encode (in upper case).
172         * @param index
173         *                  the character position to encode
174         * @return Mapping code for a particular character
175         * @throws IllegalArgumentException
176         *                  if the character is not mapped
177         */
178        private char getMappingCode(String str, int index) {
179            // map() throws IllegalArgumentException
180            char mappedChar = this.map(str.charAt(index));
181            // HW rule check
182            if (index > 1 && mappedChar != '0') {
183                char hwChar = str.charAt(index - 1);
184                if ('H' == hwChar || 'W' == hwChar) {
185                    char preHWChar = str.charAt(index - 2);
186                    char firstCode = this.map(preHWChar);
187                    if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
188                        return 0;
189                    }
190                }
191            }
192            return mappedChar;
193        }
194    
195        /**
196         * Returns the maxLength. Standard Soundex
197         * 
198         * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
199         * @return int
200         */
201        public int getMaxLength() {
202            return this.maxLength;
203        }
204    
205        /**
206         * Returns the soundex mapping.
207         * 
208         * @return soundexMapping.
209         */
210        private char[] getSoundexMapping() {
211            return this.soundexMapping;
212        }
213    
214        /**
215         * Maps the given upper-case character to its Soundex code.
216         * 
217         * @param ch
218         *                  An upper-case character.
219         * @return A Soundex code.
220         * @throws IllegalArgumentException
221         *                  Thrown if <code>ch</code> is not mapped.
222         */
223        private char map(char ch) {
224            int index = ch - 'A';
225            if (index < 0 || index >= this.getSoundexMapping().length) {
226                throw new IllegalArgumentException("The character is not mapped: " + ch);
227            }
228            return this.getSoundexMapping()[index];
229        }
230    
231        /**
232         * Sets the maxLength.
233         * 
234         * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
235         * @param maxLength
236         *                  The maxLength to set
237         */
238        public void setMaxLength(int maxLength) {
239            this.maxLength = maxLength;
240        }
241        
242        /**
243         * Retrieves the Soundex code for a given String object.
244         * 
245         * @param str
246         *                  String to encode using the Soundex algorithm
247         * @return A soundex code for the String supplied
248         * @throws IllegalArgumentException
249         *                  if a character is not mapped
250         */
251        public String soundex(String str) {
252            if (str == null) {
253                return null;
254            }
255            str = SoundexUtils.clean(str);
256            if (str.length() == 0) {
257                return str;
258            }
259            char out[] = {'0', '0', '0', '0'};
260            char last, mapped;
261            int incount = 1, count = 1;
262            out[0] = str.charAt(0);
263            // getMappingCode() throws IllegalArgumentException
264            last = getMappingCode(str, 0);
265            while ((incount < str.length()) && (count < out.length)) {
266                mapped = getMappingCode(str, incount++);
267                if (mapped != 0) {
268                    if ((mapped != '0') && (mapped != last)) {
269                        out[count++] = mapped;
270                    }
271                    last = mapped;
272                }
273            }
274            return new String(out);
275        }
276    
277    }