| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Soundex |
|
| 2.8333333333333335;2.833 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.commons.codec.language; | |
| 19 | ||
| 20 | import org.apache.commons.codec.EncoderException; | |
| 21 | import org.apache.commons.codec.StringEncoder; | |
| 22 | ||
| 23 | /** | |
| 24 | * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a | |
| 25 | * general purpose scheme to find word with similar phonemes. | |
| 26 | * | |
| 27 | * This class is thread-safe. | |
| 28 | * Although not strictly immutable, the {@link #maxLength} field is not actually used. | |
| 29 | * | |
| 30 | * @version $Id: Soundex.java 1429868 2013-01-07 16:08:05Z ggregory $ | |
| 31 | */ | |
| 32 | public class Soundex implements StringEncoder { | |
| 33 | ||
| 34 | /** | |
| 35 | * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position | |
| 36 | * means do not encode. | |
| 37 | * <p> | |
| 38 | * (This constant is provided as both an implementation convenience and to allow Javadoc to pick | |
| 39 | * up the value for the constant values page.) | |
| 40 | * </p> | |
| 41 | * | |
| 42 | * @see #US_ENGLISH_MAPPING | |
| 43 | */ | |
| 44 | public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; | |
| 45 | ||
| 46 | /** | |
| 47 | * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position | |
| 48 | * means do not encode. | |
| 49 | * | |
| 50 | * @see Soundex#Soundex(char[]) | |
| 51 | */ | |
| 52 | 1 | private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); |
| 53 | ||
| 54 | /** | |
| 55 | * An instance of Soundex using the US_ENGLISH_MAPPING mapping. | |
| 56 | * | |
| 57 | * @see #US_ENGLISH_MAPPING | |
| 58 | */ | |
| 59 | 1 | public static final Soundex US_ENGLISH = new Soundex(); |
| 60 | ||
| 61 | /** | |
| 62 | * The maximum length of a Soundex code - Soundex codes are only four characters by definition. | |
| 63 | * | |
| 64 | * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. | |
| 65 | */ | |
| 66 | 33 | @Deprecated |
| 67 | private int maxLength = 4; | |
| 68 | ||
| 69 | /** | |
| 70 | * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each | |
| 71 | * letter is mapped. This implementation contains a default map for US_ENGLISH | |
| 72 | */ | |
| 73 | private final char[] soundexMapping; | |
| 74 | ||
| 75 | /** | |
| 76 | * Creates an instance using US_ENGLISH_MAPPING | |
| 77 | * | |
| 78 | * @see Soundex#Soundex(char[]) | |
| 79 | * @see Soundex#US_ENGLISH_MAPPING | |
| 80 | */ | |
| 81 | 31 | public Soundex() { |
| 82 | 31 | this.soundexMapping = US_ENGLISH_MAPPING; |
| 83 | 31 | } |
| 84 | ||
| 85 | /** | |
| 86 | * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized | |
| 87 | * mapping for a non-Western character set. | |
| 88 | * | |
| 89 | * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each | |
| 90 | * letter is mapped. This implementation contains a default map for US_ENGLISH | |
| 91 | * | |
| 92 | * @param mapping | |
| 93 | * Mapping array to use when finding the corresponding code for a given character | |
| 94 | */ | |
| 95 | 1 | public Soundex(final char[] mapping) { |
| 96 | 1 | this.soundexMapping = new char[mapping.length]; |
| 97 | 1 | System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); |
| 98 | 1 | } |
| 99 | ||
| 100 | /** | |
| 101 | * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, | |
| 102 | * and/or possibly provide an internationalized mapping for a non-Western character set. | |
| 103 | * | |
| 104 | * @param mapping | |
| 105 | * Mapping string to use when finding the corresponding code for a given character | |
| 106 | * @since 1.4 | |
| 107 | */ | |
| 108 | 1 | public Soundex(final String mapping) { |
| 109 | 1 | this.soundexMapping = mapping.toCharArray(); |
| 110 | 1 | } |
| 111 | ||
| 112 | /** | |
| 113 | * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This | |
| 114 | * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or | |
| 115 | * identical values. | |
| 116 | * | |
| 117 | * @param s1 | |
| 118 | * A String that will be encoded and compared. | |
| 119 | * @param s2 | |
| 120 | * A String that will be encoded and compared. | |
| 121 | * @return The number of characters in the two encoded Strings that are the same from 0 to 4. | |
| 122 | * | |
| 123 | * @see SoundexUtils#difference(StringEncoder,String,String) | |
| 124 | * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS | |
| 125 | * T-SQL DIFFERENCE </a> | |
| 126 | * | |
| 127 | * @throws EncoderException | |
| 128 | * if an error occurs encoding one of the strings | |
| 129 | * @since 1.3 | |
| 130 | */ | |
| 131 | public int difference(final String s1, final String s2) throws EncoderException { | |
| 132 | 12 | return SoundexUtils.difference(this, s1, s2); |
| 133 | } | |
| 134 | ||
| 135 | /** | |
| 136 | * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of | |
| 137 | * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. | |
| 138 | * | |
| 139 | * @param obj | |
| 140 | * Object to encode | |
| 141 | * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String | |
| 142 | * supplied. | |
| 143 | * @throws EncoderException | |
| 144 | * if the parameter supplied is not of type java.lang.String | |
| 145 | * @throws IllegalArgumentException | |
| 146 | * if a character is not mapped | |
| 147 | */ | |
| 148 | @Override | |
| 149 | public Object encode(final Object obj) throws EncoderException { | |
| 150 | 6 | if (!(obj instanceof String)) { |
| 151 | 1 | throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); |
| 152 | } | |
| 153 | 5 | return soundex((String) obj); |
| 154 | } | |
| 155 | ||
| 156 | /** | |
| 157 | * Encodes a String using the soundex algorithm. | |
| 158 | * | |
| 159 | * @param str | |
| 160 | * A String object to encode | |
| 161 | * @return A Soundex code corresponding to the String supplied | |
| 162 | * @throws IllegalArgumentException | |
| 163 | * if a character is not mapped | |
| 164 | */ | |
| 165 | @Override | |
| 166 | public String encode(final String str) { | |
| 167 | 175 | return soundex(str); |
| 168 | } | |
| 169 | ||
| 170 | /** | |
| 171 | * Used internally by the SoundEx algorithm. | |
| 172 | * | |
| 173 | * Consonants from the same code group separated by W or H are treated as one. | |
| 174 | * | |
| 175 | * @param str | |
| 176 | * the cleaned working string to encode (in upper case). | |
| 177 | * @param index | |
| 178 | * the character position to encode | |
| 179 | * @return Mapping code for a particular character | |
| 180 | * @throws IllegalArgumentException | |
| 181 | * if the character is not mapped | |
| 182 | */ | |
| 183 | private char getMappingCode(final String str, final int index) { | |
| 184 | // map() throws IllegalArgumentException | |
| 185 | 971 | final char mappedChar = this.map(str.charAt(index)); |
| 186 | // HW rule check | |
| 187 | 969 | if (index > 1 && mappedChar != '0') { |
| 188 | 367 | final char hwChar = str.charAt(index - 1); |
| 189 | 367 | if ('H' == hwChar || 'W' == hwChar) { |
| 190 | 10 | final char preHWChar = str.charAt(index - 2); |
| 191 | 10 | final char firstCode = this.map(preHWChar); |
| 192 | 10 | if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) { |
| 193 | 4 | return 0; |
| 194 | } | |
| 195 | } | |
| 196 | } | |
| 197 | 965 | return mappedChar; |
| 198 | } | |
| 199 | ||
| 200 | /** | |
| 201 | * Returns the maxLength. Standard Soundex | |
| 202 | * | |
| 203 | * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. | |
| 204 | * @return int | |
| 205 | */ | |
| 206 | @Deprecated | |
| 207 | public int getMaxLength() { | |
| 208 | 0 | return this.maxLength; |
| 209 | } | |
| 210 | ||
| 211 | /** | |
| 212 | * Returns the soundex mapping. | |
| 213 | * | |
| 214 | * @return soundexMapping. | |
| 215 | */ | |
| 216 | private char[] getSoundexMapping() { | |
| 217 | 1960 | return this.soundexMapping; |
| 218 | } | |
| 219 | ||
| 220 | /** | |
| 221 | * Maps the given upper-case character to its Soundex code. | |
| 222 | * | |
| 223 | * @param ch | |
| 224 | * An upper-case character. | |
| 225 | * @return A Soundex code. | |
| 226 | * @throws IllegalArgumentException | |
| 227 | * Thrown if <code>ch</code> is not mapped. | |
| 228 | */ | |
| 229 | private char map(final char ch) { | |
| 230 | 981 | final int index = ch - 'A'; |
| 231 | 981 | if (index < 0 || index >= this.getSoundexMapping().length) { |
| 232 | 2 | throw new IllegalArgumentException("The character is not mapped: " + ch); |
| 233 | } | |
| 234 | 979 | return this.getSoundexMapping()[index]; |
| 235 | } | |
| 236 | ||
| 237 | /** | |
| 238 | * Sets the maxLength. | |
| 239 | * | |
| 240 | * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. | |
| 241 | * @param maxLength | |
| 242 | * The maxLength to set | |
| 243 | */ | |
| 244 | @Deprecated | |
| 245 | public void setMaxLength(final int maxLength) { | |
| 246 | 0 | this.maxLength = maxLength; |
| 247 | 0 | } |
| 248 | ||
| 249 | /** | |
| 250 | * Retrieves the Soundex code for a given String object. | |
| 251 | * | |
| 252 | * @param str | |
| 253 | * String to encode using the Soundex algorithm | |
| 254 | * @return A soundex code for the String supplied | |
| 255 | * @throws IllegalArgumentException | |
| 256 | * if a character is not mapped | |
| 257 | */ | |
| 258 | public String soundex(String str) { | |
| 259 | 184 | if (str == null) { |
| 260 | 3 | return null; |
| 261 | } | |
| 262 | 181 | str = SoundexUtils.clean(str); |
| 263 | 181 | if (str.length() == 0) { |
| 264 | 7 | return str; |
| 265 | } | |
| 266 | 174 | final char out[] = {'0', '0', '0', '0'}; |
| 267 | char last, mapped; | |
| 268 | 174 | int incount = 1, count = 1; |
| 269 | 174 | out[0] = str.charAt(0); |
| 270 | // getMappingCode() throws IllegalArgumentException | |
| 271 | 174 | last = getMappingCode(str, 0); |
| 272 | 969 | while (incount < str.length() && count < out.length) { |
| 273 | 797 | mapped = getMappingCode(str, incount++); |
| 274 | 797 | if (mapped != 0) { |
| 275 | 793 | if (mapped != '0' && mapped != last) { |
| 276 | 383 | out[count++] = mapped; |
| 277 | } | |
| 278 | 793 | last = mapped; |
| 279 | } | |
| 280 | } | |
| 281 | 172 | return new String(out); |
| 282 | } | |
| 283 | ||
| 284 | } |