View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
25   * general purpose scheme to find word with similar phonemes.
26   * 
27   * @author Apache Software Foundation
28   * @version $Id: Soundex.java 1201529 2011-11-13 21:57:16Z ggregory $
29   */
30  public class Soundex implements StringEncoder {
31  
32      /**
33       * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
34       * means do not encode.
35       * <p>
36       * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
37       * up the value for the constant values page.)
38       * </p>
39       * 
40       * @see #US_ENGLISH_MAPPING
41       */
42      public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
43  
44      /**
45       * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
46       * means do not encode.
47       * 
48       * @see Soundex#Soundex(char[])
49       */
50      private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
51  
52      /**
53       * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
54       * 
55       * @see #US_ENGLISH_MAPPING
56       */
57      public static final Soundex US_ENGLISH = new Soundex();
58  
59      /**
60       * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
61       * 
62       * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
63       */
64      private int maxLength = 4;
65  
66      /**
67       * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
68       * letter is mapped. This implementation contains a default map for US_ENGLISH
69       */
70      private final char[] soundexMapping;
71  
72      /**
73       * Creates an instance using US_ENGLISH_MAPPING
74       * 
75       * @see Soundex#Soundex(char[])
76       * @see Soundex#US_ENGLISH_MAPPING
77       */
78      public Soundex() {
79          this.soundexMapping = US_ENGLISH_MAPPING;
80      }
81  
82      /**
83       * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
84       * mapping for a non-Western character set.
85       * 
86       * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
87       * letter is mapped. This implementation contains a default map for US_ENGLISH
88       * 
89       * @param mapping
90       *                  Mapping array to use when finding the corresponding code for a given character
91       */
92      public Soundex(char[] mapping) {
93          this.soundexMapping = new char[mapping.length];
94          System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
95      }
96  
97      /**
98       * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
99       * and/or possibly provide an internationalized mapping for a non-Western character set.
100      * 
101      * @param mapping
102      *            Mapping string to use when finding the corresponding code for a given character
103      * @since 1.4
104      */
105     public Soundex(String mapping) {
106         this.soundexMapping = mapping.toCharArray();
107     }
108 
109     /**
110      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
111      * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
112      * identical values.
113      * 
114      * @param s1
115      *                  A String that will be encoded and compared.
116      * @param s2
117      *                  A String that will be encoded and compared.
118      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
119      * 
120      * @see SoundexUtils#difference(StringEncoder,String,String)
121      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
122      *          T-SQL DIFFERENCE </a>
123      * 
124      * @throws EncoderException
125      *                  if an error occurs encoding one of the strings
126      * @since 1.3
127      */
128     public int difference(String s1, String s2) throws EncoderException {
129         return SoundexUtils.difference(this, s1, s2);
130     }
131 
132     /**
133      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
134      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
135      * 
136      * @param pObject
137      *                  Object to encode
138      * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
139      *             supplied.
140      * @throws EncoderException
141      *                  if the parameter supplied is not of type java.lang.String
142      * @throws IllegalArgumentException
143      *                  if a character is not mapped
144      */
145     public Object encode(Object pObject) throws EncoderException {
146         if (!(pObject instanceof String)) {
147             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
148         }
149         return soundex((String) pObject);
150     }
151 
152     /**
153      * Encodes a String using the soundex algorithm.
154      * 
155      * @param pString
156      *                  A String object to encode
157      * @return A Soundex code corresponding to the String supplied
158      * @throws IllegalArgumentException
159      *                  if a character is not mapped
160      */
161     public String encode(String pString) {
162         return soundex(pString);
163     }
164 
165     /**
166      * Used internally by the SoundEx algorithm.
167      * 
168      * Consonants from the same code group separated by W or H are treated as one.
169      * 
170      * @param str
171      *                  the cleaned working string to encode (in upper case).
172      * @param index
173      *                  the character position to encode
174      * @return Mapping code for a particular character
175      * @throws IllegalArgumentException
176      *                  if the character is not mapped
177      */
178     private char getMappingCode(String str, int index) {
179         // map() throws IllegalArgumentException
180         char mappedChar = this.map(str.charAt(index));
181         // HW rule check
182         if (index > 1 && mappedChar != '0') {
183             char hwChar = str.charAt(index - 1);
184             if ('H' == hwChar || 'W' == hwChar) {
185                 char preHWChar = str.charAt(index - 2);
186                 char firstCode = this.map(preHWChar);
187                 if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) {
188                     return 0;
189                 }
190             }
191         }
192         return mappedChar;
193     }
194 
195     /**
196      * Returns the maxLength. Standard Soundex
197      * 
198      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
199      * @return int
200      */
201     public int getMaxLength() {
202         return this.maxLength;
203     }
204 
205     /**
206      * Returns the soundex mapping.
207      * 
208      * @return soundexMapping.
209      */
210     private char[] getSoundexMapping() {
211         return this.soundexMapping;
212     }
213 
214     /**
215      * Maps the given upper-case character to its Soundex code.
216      * 
217      * @param ch
218      *                  An upper-case character.
219      * @return A Soundex code.
220      * @throws IllegalArgumentException
221      *                  Thrown if <code>ch</code> is not mapped.
222      */
223     private char map(char ch) {
224         int index = ch - 'A';
225         if (index < 0 || index >= this.getSoundexMapping().length) {
226             throw new IllegalArgumentException("The character is not mapped: " + ch);
227         }
228         return this.getSoundexMapping()[index];
229     }
230 
231     /**
232      * Sets the maxLength.
233      * 
234      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
235      * @param maxLength
236      *                  The maxLength to set
237      */
238     public void setMaxLength(int maxLength) {
239         this.maxLength = maxLength;
240     }
241     
242     /**
243      * Retrieves the Soundex code for a given String object.
244      * 
245      * @param str
246      *                  String to encode using the Soundex algorithm
247      * @return A soundex code for the String supplied
248      * @throws IllegalArgumentException
249      *                  if a character is not mapped
250      */
251     public String soundex(String str) {
252         if (str == null) {
253             return null;
254         }
255         str = SoundexUtils.clean(str);
256         if (str.length() == 0) {
257             return str;
258         }
259         char out[] = {'0', '0', '0', '0'};
260         char last, mapped;
261         int incount = 1, count = 1;
262         out[0] = str.charAt(0);
263         // getMappingCode() throws IllegalArgumentException
264         last = getMappingCode(str, 0);
265         while ((incount < str.length()) && (count < out.length)) {
266             mapped = getMappingCode(str, incount++);
267             if (mapped != 0) {
268                 if ((mapped != '0') && (mapped != last)) {
269                     out[count++] = mapped;
270                 }
271                 last = mapped;
272             }
273         }
274         return new String(out);
275     }
276 
277 }