View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Refined Soundex value. A refined soundex code is
25   * optimized for spell checking words. Soundex method originally developed by
26   * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
27   *
28   * <p>This class is immutable and thread-safe.</p>
29   */
30  public class RefinedSoundex implements StringEncoder {
31  
32      /**
33       * Mapping:
34       * <pre>
35       * 0: A E I O U Y H W
36       * 1: B P
37       * 2: F V
38       * 3: C K S
39       * 4: G J
40       * 5: Q X Z
41       * 6: D T
42       * 7: L
43       * 8: M N
44       * 9: R
45       * </pre>
46       * @since 1.4
47       */
48      //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
49      public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
50  
51     /**
52       * RefinedSoundex is *refined* for a number of reasons one being that the
53       * mappings have been altered. This implementation contains default
54       * mappings for US English.
55       */
56      private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
57  
58      /**
59       * This static variable contains an instance of the RefinedSoundex using
60       * the US_ENGLISH mapping.
61       */
62      public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
63  
64      /**
65       * Every letter of the alphabet is "mapped" to a numerical value. This char
66       * array holds the values to which each letter is mapped. This
67       * implementation contains a default map for US_ENGLISH
68       */
69      private final char[] soundexMapping;
70  
71       /**
72       * Creates an instance of the RefinedSoundex object using the default US
73       * English mapping.
74       */
75      public RefinedSoundex() {
76          this.soundexMapping = US_ENGLISH_MAPPING;
77      }
78  
79      /**
80       * Creates a refined soundex instance using a custom mapping. This
81       * constructor can be used to customize the mapping, and/or possibly
82       * provide an internationalized mapping for a non-Western character set.
83       *
84       * @param mapping
85       *                  Mapping array to use when finding the corresponding code for
86       *                  a given character
87       */
88      public RefinedSoundex(final char[] mapping) {
89          this.soundexMapping = mapping.clone();
90      }
91  
92      /**
93       * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
94       * and/or possibly provide an internationalized mapping for a non-Western character set.
95       *
96       * @param mapping
97       *            Mapping string to use when finding the corresponding code for a given character
98       * @since 1.4
99       */
100     public RefinedSoundex(final String mapping) {
101         this.soundexMapping = mapping.toCharArray();
102     }
103 
104     /**
105      * Returns the number of characters in the two encoded Strings that are the
106      * same. This return value ranges from 0 to the length of the shortest
107      * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
108      * example) indicates strong similarity or identical values. For refined
109      * Soundex, the return value can be greater than 4.
110      *
111      * @param s1
112      *                  A String that will be encoded and compared.
113      * @param s2
114      *                  A String that will be encoded and compared.
115      * @return The number of characters in the two encoded Strings that are the
116      *             same from 0 to the length of the shortest encoded String.
117      *
118      * @see SoundexUtils#difference(StringEncoder,String,String)
119      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
120      *          MS T-SQL DIFFERENCE</a>
121      *
122      * @throws EncoderException
123      *                  if an error occurs encoding one of the strings
124      * @since 1.3
125      */
126     public int difference(final String s1, final String s2) throws EncoderException {
127         return SoundexUtils.difference(this, s1, s2);
128     }
129 
130     /**
131      * Encodes an Object using the refined soundex algorithm. This method is
132      * provided in order to satisfy the requirements of the Encoder interface,
133      * and will throw an EncoderException if the supplied object is not of type
134      * {@link String}.
135      *
136      * @param obj
137      *                  Object to encode
138      * @return An object (or type {@link String}) containing the refined
139      *             soundex code which corresponds to the String supplied.
140      * @throws EncoderException
141      *                  if the parameter supplied is not of type {@link String}
142      */
143     @Override
144     public Object encode(final Object obj) throws EncoderException {
145         if (!(obj instanceof String)) {
146             throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
147         }
148         return soundex((String) obj);
149     }
150 
151     /**
152      * Encodes a String using the refined soundex algorithm.
153      *
154      * @param str
155      *                  A String object to encode
156      * @return A Soundex code corresponding to the String supplied
157      */
158     @Override
159     public String encode(final String str) {
160         return soundex(str);
161     }
162 
163     /**
164      * Returns the mapping code for a given character. The mapping codes are
165      * maintained in an internal char array named soundexMapping, and the
166      * default values of these mappings are US English.
167      *
168      * @param c
169      *                  char to get mapping for
170      * @return A character (really a numeral) to return for the given char
171      */
172     char getMappingCode(final char c) {
173         if (!Character.isLetter(c)) {
174             return 0;
175         }
176         final int index = Character.toUpperCase(c) - 'A';
177         if (index < 0 || index >= this.soundexMapping.length) {
178             return 0;
179         }
180         return this.soundexMapping[index];
181     }
182 
183     /**
184      * Retrieves the Refined Soundex code for a given String object.
185      *
186      * @param str
187      *                  String to encode using the Refined Soundex algorithm
188      * @return A soundex code for the String supplied
189      */
190     public String soundex(String str) {
191         if (str == null) {
192             return null;
193         }
194         str = SoundexUtils.clean(str);
195         if (str.isEmpty()) {
196             return str;
197         }
198 
199         final StringBuilder sBuf = new StringBuilder();
200         sBuf.append(str.charAt(0));
201 
202         char last, current;
203         last = '*';
204 
205         for (int i = 0; i < str.length(); i++) {
206 
207             current = getMappingCode(str.charAt(i));
208             if (current == last) {
209                 continue;
210             }
211             if (current != 0) {
212                 sBuf.append(current);
213             }
214 
215             last = current;
216 
217         }
218 
219         return sBuf.toString();
220     }
221 }