1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a Refined Soundex value. A refined soundex code is
25 * optimized for spell checking words. Soundex method originally developed by
26 * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
27 *
28 * @author Apache Software Foundation
29 * @version $Id: RefinedSoundex.java 1170351 2011-09-13 21:09:09Z ggregory $
30 */
31 public class RefinedSoundex implements StringEncoder {
32
33 /**
34 * @since 1.4
35 */
36 public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
37
38 /**
39 * RefinedSoundex is *refined* for a number of reasons one being that the
40 * mappings have been altered. This implementation contains default
41 * mappings for US English.
42 */
43 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
44
45 /**
46 * Every letter of the alphabet is "mapped" to a numerical value. This char
47 * array holds the values to which each letter is mapped. This
48 * implementation contains a default map for US_ENGLISH
49 */
50 private final char[] soundexMapping;
51
52 /**
53 * This static variable contains an instance of the RefinedSoundex using
54 * the US_ENGLISH mapping.
55 */
56 public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
57
58 /**
59 * Creates an instance of the RefinedSoundex object using the default US
60 * English mapping.
61 */
62 public RefinedSoundex() {
63 this.soundexMapping = US_ENGLISH_MAPPING;
64 }
65
66 /**
67 * Creates a refined soundex instance using a custom mapping. This
68 * constructor can be used to customize the mapping, and/or possibly
69 * provide an internationalized mapping for a non-Western character set.
70 *
71 * @param mapping
72 * Mapping array to use when finding the corresponding code for
73 * a given character
74 */
75 public RefinedSoundex(char[] mapping) {
76 this.soundexMapping = new char[mapping.length];
77 System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
78 }
79
80 /**
81 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
82 * and/or possibly provide an internationalized mapping for a non-Western character set.
83 *
84 * @param mapping
85 * Mapping string to use when finding the corresponding code for a given character
86 * @since 1.4
87 */
88 public RefinedSoundex(String mapping) {
89 this.soundexMapping = mapping.toCharArray();
90 }
91
92 /**
93 * Returns the number of characters in the two encoded Strings that are the
94 * same. This return value ranges from 0 to the length of the shortest
95 * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
96 * example) indicates strong similarity or identical values. For refined
97 * Soundex, the return value can be greater than 4.
98 *
99 * @param s1
100 * A String that will be encoded and compared.
101 * @param s2
102 * A String that will be encoded and compared.
103 * @return The number of characters in the two encoded Strings that are the
104 * same from 0 to to the length of the shortest encoded String.
105 *
106 * @see SoundexUtils#difference(StringEncoder,String,String)
107 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108 * MS T-SQL DIFFERENCE</a>
109 *
110 * @throws EncoderException
111 * if an error occurs encoding one of the strings
112 * @since 1.3
113 */
114 public int difference(String s1, String s2) throws EncoderException {
115 return SoundexUtils.difference(this, s1, s2);
116 }
117
118 /**
119 * Encodes an Object using the refined soundex algorithm. This method is
120 * provided in order to satisfy the requirements of the Encoder interface,
121 * and will throw an EncoderException if the supplied object is not of type
122 * java.lang.String.
123 *
124 * @param pObject
125 * Object to encode
126 * @return An object (or type java.lang.String) containing the refined
127 * soundex code which corresponds to the String supplied.
128 * @throws EncoderException
129 * if the parameter supplied is not of type java.lang.String
130 */
131 public Object encode(Object pObject) throws EncoderException {
132 if (!(pObject instanceof String)) {
133 throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
134 }
135 return soundex((String) pObject);
136 }
137
138 /**
139 * Encodes a String using the refined soundex algorithm.
140 *
141 * @param pString
142 * A String object to encode
143 * @return A Soundex code corresponding to the String supplied
144 */
145 public String encode(String pString) {
146 return soundex(pString);
147 }
148
149 /**
150 * Returns the mapping code for a given character. The mapping codes are
151 * maintained in an internal char array named soundexMapping, and the
152 * default values of these mappings are US English.
153 *
154 * @param c
155 * char to get mapping for
156 * @return A character (really a numeral) to return for the given char
157 */
158 char getMappingCode(char c) {
159 if (!Character.isLetter(c)) {
160 return 0;
161 }
162 return this.soundexMapping[Character.toUpperCase(c) - 'A'];
163 }
164
165 /**
166 * Retrieves the Refined Soundex code for a given String object.
167 *
168 * @param str
169 * String to encode using the Refined Soundex algorithm
170 * @return A soundex code for the String supplied
171 */
172 public String soundex(String str) {
173 if (str == null) {
174 return null;
175 }
176 str = SoundexUtils.clean(str);
177 if (str.length() == 0) {
178 return str;
179 }
180
181 StringBuffer sBuf = new StringBuffer();
182 sBuf.append(str.charAt(0));
183
184 char last, current;
185 last = '*';
186
187 for (int i = 0; i < str.length(); i++) {
188
189 current = getMappingCode(str.charAt(i));
190 if (current == last) {
191 continue;
192 } else if (current != 0) {
193 sBuf.append(current);
194 }
195
196 last = current;
197
198 }
199
200 return sBuf.toString();
201 }
202 }