001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Refined Soundex value. A refined soundex code is
025 * optimized for spell checking words. Soundex method originally developed by
026 * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
027 *
028 * <p>This class is immutable and thread-safe.</p>
029 */
030public class RefinedSoundex implements StringEncoder {
031
032    /**
033     * Mapping:
034     * <pre>
035     * 0: A E I O U Y H W
036     * 1: B P
037     * 2: F V
038     * 3: C K S
039     * 4: G J
040     * 5: Q X Z
041     * 6: D T
042     * 7: L
043     * 8: M N
044     * 9: R
045     * </pre>
046     * @since 1.4
047     */
048    //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
049    public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
050
051   /**
052     * RefinedSoundex is *refined* for a number of reasons one being that the
053     * mappings have been altered. This implementation contains default
054     * mappings for US English.
055     */
056    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
057
058    /**
059     * This static variable contains an instance of the RefinedSoundex using
060     * the US_ENGLISH mapping.
061     */
062    public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
063
064    /**
065     * Every letter of the alphabet is "mapped" to a numerical value. This char
066     * array holds the values to which each letter is mapped. This
067     * implementation contains a default map for US_ENGLISH
068     */
069    private final char[] soundexMapping;
070
071     /**
072     * Creates an instance of the RefinedSoundex object using the default US
073     * English mapping.
074     */
075    public RefinedSoundex() {
076        this.soundexMapping = US_ENGLISH_MAPPING;
077    }
078
079    /**
080     * Creates a refined soundex instance using a custom mapping. This
081     * constructor can be used to customize the mapping, and/or possibly
082     * provide an internationalized mapping for a non-Western character set.
083     *
084     * @param mapping
085     *                  Mapping array to use when finding the corresponding code for
086     *                  a given character
087     */
088    public RefinedSoundex(final char[] mapping) {
089        this.soundexMapping = mapping.clone();
090    }
091
092    /**
093     * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
094     * and/or possibly provide an internationalized mapping for a non-Western character set.
095     *
096     * @param mapping
097     *            Mapping string to use when finding the corresponding code for a given character
098     * @since 1.4
099     */
100    public RefinedSoundex(final String mapping) {
101        this.soundexMapping = mapping.toCharArray();
102    }
103
104    /**
105     * Returns the number of characters in the two encoded Strings that are the
106     * same. This return value ranges from 0 to the length of the shortest
107     * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
108     * example) indicates strong similarity or identical values. For refined
109     * Soundex, the return value can be greater than 4.
110     *
111     * @param s1
112     *                  A String that will be encoded and compared.
113     * @param s2
114     *                  A String that will be encoded and compared.
115     * @return The number of characters in the two encoded Strings that are the
116     *             same from 0 to the length of the shortest encoded String.
117     *
118     * @see SoundexUtils#difference(StringEncoder,String,String)
119     * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
120     *          MS T-SQL DIFFERENCE</a>
121     *
122     * @throws EncoderException
123     *                  if an error occurs encoding one of the strings
124     * @since 1.3
125     */
126    public int difference(final String s1, final String s2) throws EncoderException {
127        return SoundexUtils.difference(this, s1, s2);
128    }
129
130    /**
131     * Encodes an Object using the refined soundex algorithm. This method is
132     * provided in order to satisfy the requirements of the Encoder interface,
133     * and will throw an EncoderException if the supplied object is not of type
134     * {@link String}.
135     *
136     * @param obj
137     *                  Object to encode
138     * @return An object (or type {@link String}) containing the refined
139     *             soundex code which corresponds to the String supplied.
140     * @throws EncoderException
141     *                  if the parameter supplied is not of type {@link String}
142     */
143    @Override
144    public Object encode(final Object obj) throws EncoderException {
145        if (!(obj instanceof String)) {
146            throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
147        }
148        return soundex((String) obj);
149    }
150
151    /**
152     * Encodes a String using the refined soundex algorithm.
153     *
154     * @param str
155     *                  A String object to encode
156     * @return A Soundex code corresponding to the String supplied
157     */
158    @Override
159    public String encode(final String str) {
160        return soundex(str);
161    }
162
163    /**
164     * Returns the mapping code for a given character. The mapping codes are
165     * maintained in an internal char array named soundexMapping, and the
166     * default values of these mappings are US English.
167     *
168     * @param c
169     *                  char to get mapping for
170     * @return A character (really a numeral) to return for the given char
171     */
172    char getMappingCode(final char c) {
173        if (!Character.isLetter(c)) {
174            return 0;
175        }
176        final int index = Character.toUpperCase(c) - 'A';
177        if (index < 0 || index >= this.soundexMapping.length) {
178            return 0;
179        }
180        return this.soundexMapping[index];
181    }
182
183    /**
184     * Retrieves the Refined Soundex code for a given String object.
185     *
186     * @param str
187     *                  String to encode using the Refined Soundex algorithm
188     * @return A soundex code for the String supplied
189     */
190    public String soundex(String str) {
191        if (str == null) {
192            return null;
193        }
194        str = SoundexUtils.clean(str);
195        if (str.isEmpty()) {
196            return str;
197        }
198
199        final StringBuilder sBuf = new StringBuilder();
200        sBuf.append(str.charAt(0));
201
202        char last, current;
203        last = '*';
204
205        for (int i = 0; i < str.length(); i++) {
206
207            current = getMappingCode(str.charAt(i));
208            if (current == last) {
209                continue;
210            }
211            if (current != 0) {
212                sBuf.append(current);
213            }
214
215            last = current;
216
217        }
218
219        return sBuf.toString();
220    }
221}