| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| RefinedSoundex |
|
| 2.375;2.375 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.commons.codec.language; | |
| 19 | ||
| 20 | import org.apache.commons.codec.EncoderException; | |
| 21 | import org.apache.commons.codec.StringEncoder; | |
| 22 | ||
| 23 | /** | |
| 24 | * Encodes a string into a Refined Soundex value. A refined soundex code is | |
| 25 | * optimized for spell checking words. Soundex method originally developed by | |
| 26 | * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>. | |
| 27 | * | |
| 28 | * <p>This class is immutable and thread-safe.</p> | |
| 29 | * | |
| 30 | * @version $Id: RefinedSoundex.java 1429868 2013-01-07 16:08:05Z ggregory $ | |
| 31 | */ | |
| 32 | public class RefinedSoundex implements StringEncoder { | |
| 33 | ||
| 34 | /** | |
| 35 | * @since 1.4 | |
| 36 | */ | |
| 37 | public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505"; | |
| 38 | ||
| 39 | /** | |
| 40 | * RefinedSoundex is *refined* for a number of reasons one being that the | |
| 41 | * mappings have been altered. This implementation contains default | |
| 42 | * mappings for US English. | |
| 43 | */ | |
| 44 | 1 | private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); |
| 45 | ||
| 46 | /** | |
| 47 | * Every letter of the alphabet is "mapped" to a numerical value. This char | |
| 48 | * array holds the values to which each letter is mapped. This | |
| 49 | * implementation contains a default map for US_ENGLISH | |
| 50 | */ | |
| 51 | private final char[] soundexMapping; | |
| 52 | ||
| 53 | /** | |
| 54 | * This static variable contains an instance of the RefinedSoundex using | |
| 55 | * the US_ENGLISH mapping. | |
| 56 | */ | |
| 57 | 1 | public static final RefinedSoundex US_ENGLISH = new RefinedSoundex(); |
| 58 | ||
| 59 | /** | |
| 60 | * Creates an instance of the RefinedSoundex object using the default US | |
| 61 | * English mapping. | |
| 62 | */ | |
| 63 | 12 | public RefinedSoundex() { |
| 64 | 12 | this.soundexMapping = US_ENGLISH_MAPPING; |
| 65 | 12 | } |
| 66 | ||
| 67 | /** | |
| 68 | * Creates a refined soundex instance using a custom mapping. This | |
| 69 | * constructor can be used to customize the mapping, and/or possibly | |
| 70 | * provide an internationalized mapping for a non-Western character set. | |
| 71 | * | |
| 72 | * @param mapping | |
| 73 | * Mapping array to use when finding the corresponding code for | |
| 74 | * a given character | |
| 75 | */ | |
| 76 | 1 | public RefinedSoundex(final char[] mapping) { |
| 77 | 1 | this.soundexMapping = new char[mapping.length]; |
| 78 | 1 | System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); |
| 79 | 1 | } |
| 80 | ||
| 81 | /** | |
| 82 | * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, | |
| 83 | * and/or possibly provide an internationalized mapping for a non-Western character set. | |
| 84 | * | |
| 85 | * @param mapping | |
| 86 | * Mapping string to use when finding the corresponding code for a given character | |
| 87 | * @since 1.4 | |
| 88 | */ | |
| 89 | 1 | public RefinedSoundex(final String mapping) { |
| 90 | 1 | this.soundexMapping = mapping.toCharArray(); |
| 91 | 1 | } |
| 92 | ||
| 93 | /** | |
| 94 | * Returns the number of characters in the two encoded Strings that are the | |
| 95 | * same. This return value ranges from 0 to the length of the shortest | |
| 96 | * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for | |
| 97 | * example) indicates strong similarity or identical values. For refined | |
| 98 | * Soundex, the return value can be greater than 4. | |
| 99 | * | |
| 100 | * @param s1 | |
| 101 | * A String that will be encoded and compared. | |
| 102 | * @param s2 | |
| 103 | * A String that will be encoded and compared. | |
| 104 | * @return The number of characters in the two encoded Strings that are the | |
| 105 | * same from 0 to to the length of the shortest encoded String. | |
| 106 | * | |
| 107 | * @see SoundexUtils#difference(StringEncoder,String,String) | |
| 108 | * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> | |
| 109 | * MS T-SQL DIFFERENCE</a> | |
| 110 | * | |
| 111 | * @throws EncoderException | |
| 112 | * if an error occurs encoding one of the strings | |
| 113 | * @since 1.3 | |
| 114 | */ | |
| 115 | public int difference(final String s1, final String s2) throws EncoderException { | |
| 116 | 12 | return SoundexUtils.difference(this, s1, s2); |
| 117 | } | |
| 118 | ||
| 119 | /** | |
| 120 | * Encodes an Object using the refined soundex algorithm. This method is | |
| 121 | * provided in order to satisfy the requirements of the Encoder interface, | |
| 122 | * and will throw an EncoderException if the supplied object is not of type | |
| 123 | * java.lang.String. | |
| 124 | * | |
| 125 | * @param obj | |
| 126 | * Object to encode | |
| 127 | * @return An object (or type java.lang.String) containing the refined | |
| 128 | * soundex code which corresponds to the String supplied. | |
| 129 | * @throws EncoderException | |
| 130 | * if the parameter supplied is not of type java.lang.String | |
| 131 | */ | |
| 132 | @Override | |
| 133 | public Object encode(final Object obj) throws EncoderException { | |
| 134 | 4 | if (!(obj instanceof String)) { |
| 135 | 1 | throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String"); |
| 136 | } | |
| 137 | 3 | return soundex((String) obj); |
| 138 | } | |
| 139 | ||
| 140 | /** | |
| 141 | * Encodes a String using the refined soundex algorithm. | |
| 142 | * | |
| 143 | * @param str | |
| 144 | * A String object to encode | |
| 145 | * @return A Soundex code corresponding to the String supplied | |
| 146 | */ | |
| 147 | @Override | |
| 148 | public String encode(final String str) { | |
| 149 | 43 | return soundex(str); |
| 150 | } | |
| 151 | ||
| 152 | /** | |
| 153 | * Returns the mapping code for a given character. The mapping codes are | |
| 154 | * maintained in an internal char array named soundexMapping, and the | |
| 155 | * default values of these mappings are US English. | |
| 156 | * | |
| 157 | * @param c | |
| 158 | * char to get mapping for | |
| 159 | * @return A character (really a numeral) to return for the given char | |
| 160 | */ | |
| 161 | char getMappingCode(final char c) { | |
| 162 | 194 | if (!Character.isLetter(c)) { |
| 163 | 1 | return 0; |
| 164 | } | |
| 165 | 193 | return this.soundexMapping[Character.toUpperCase(c) - 'A']; |
| 166 | } | |
| 167 | ||
| 168 | /** | |
| 169 | * Retrieves the Refined Soundex code for a given String object. | |
| 170 | * | |
| 171 | * @param str | |
| 172 | * String to encode using the Refined Soundex algorithm | |
| 173 | * @return A soundex code for the String supplied | |
| 174 | */ | |
| 175 | public String soundex(String str) { | |
| 176 | 49 | if (str == null) { |
| 177 | 3 | return null; |
| 178 | } | |
| 179 | 46 | str = SoundexUtils.clean(str); |
| 180 | 46 | if (str.length() == 0) { |
| 181 | 7 | return str; |
| 182 | } | |
| 183 | ||
| 184 | 39 | final StringBuilder sBuf = new StringBuilder(); |
| 185 | 39 | sBuf.append(str.charAt(0)); |
| 186 | ||
| 187 | char last, current; | |
| 188 | 39 | last = '*'; |
| 189 | ||
| 190 | 232 | for (int i = 0; i < str.length(); i++) { |
| 191 | ||
| 192 | 193 | current = getMappingCode(str.charAt(i)); |
| 193 | 193 | if (current == last) { |
| 194 | 20 | continue; |
| 195 | 173 | } else if (current != 0) { |
| 196 | 173 | sBuf.append(current); |
| 197 | } | |
| 198 | ||
| 199 | 173 | last = current; |
| 200 | ||
| 201 | } | |
| 202 | ||
| 203 | 39 | return sBuf.toString(); |
| 204 | } | |
| 205 | } |