View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Refined Soundex value. A refined soundex code is
25   * optimized for spell checking words. Soundex method originally developed by
26   * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
27   * 
28   * @author Apache Software Foundation
29   * @version $Id: RefinedSoundex.java 1170351 2011-09-13 21:09:09Z ggregory $
30   */
31  public class RefinedSoundex implements StringEncoder {
32  
33      /**
34       * @since 1.4
35       */
36      public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
37  
38     /**
39       * RefinedSoundex is *refined* for a number of reasons one being that the
40       * mappings have been altered. This implementation contains default
41       * mappings for US English.
42       */
43      private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
44  
45      /**
46       * Every letter of the alphabet is "mapped" to a numerical value. This char
47       * array holds the values to which each letter is mapped. This
48       * implementation contains a default map for US_ENGLISH
49       */
50      private final char[] soundexMapping;
51  
52      /**
53       * This static variable contains an instance of the RefinedSoundex using
54       * the US_ENGLISH mapping.
55       */
56      public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
57  
58       /**
59       * Creates an instance of the RefinedSoundex object using the default US
60       * English mapping.
61       */
62      public RefinedSoundex() {
63          this.soundexMapping = US_ENGLISH_MAPPING;
64      }
65  
66      /**
67       * Creates a refined soundex instance using a custom mapping. This
68       * constructor can be used to customize the mapping, and/or possibly
69       * provide an internationalized mapping for a non-Western character set.
70       * 
71       * @param mapping
72       *                  Mapping array to use when finding the corresponding code for
73       *                  a given character
74       */
75      public RefinedSoundex(char[] mapping) {
76          this.soundexMapping = new char[mapping.length];
77          System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
78      }
79  
80      /**
81       * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
82       * and/or possibly provide an internationalized mapping for a non-Western character set.
83       * 
84       * @param mapping
85       *            Mapping string to use when finding the corresponding code for a given character
86       * @since 1.4
87       */
88      public RefinedSoundex(String mapping) {
89          this.soundexMapping = mapping.toCharArray();
90      }
91  
92      /**
93       * Returns the number of characters in the two encoded Strings that are the
94       * same. This return value ranges from 0 to the length of the shortest
95       * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
96       * example) indicates strong similarity or identical values. For refined
97       * Soundex, the return value can be greater than 4.
98       * 
99       * @param s1
100      *                  A String that will be encoded and compared.
101      * @param s2
102      *                  A String that will be encoded and compared.
103      * @return The number of characters in the two encoded Strings that are the
104      *             same from 0 to to the length of the shortest encoded String.
105      * 
106      * @see SoundexUtils#difference(StringEncoder,String,String)
107      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108      *          MS T-SQL DIFFERENCE</a>
109      * 
110      * @throws EncoderException
111      *                  if an error occurs encoding one of the strings
112      * @since 1.3
113      */
114     public int difference(String s1, String s2) throws EncoderException {
115         return SoundexUtils.difference(this, s1, s2);
116     }
117 
118     /**
119      * Encodes an Object using the refined soundex algorithm. This method is
120      * provided in order to satisfy the requirements of the Encoder interface,
121      * and will throw an EncoderException if the supplied object is not of type
122      * java.lang.String.
123      * 
124      * @param pObject
125      *                  Object to encode
126      * @return An object (or type java.lang.String) containing the refined
127      *             soundex code which corresponds to the String supplied.
128      * @throws EncoderException
129      *                  if the parameter supplied is not of type java.lang.String
130      */
131     public Object encode(Object pObject) throws EncoderException {
132         if (!(pObject instanceof String)) {
133             throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
134         }
135         return soundex((String) pObject);
136     }
137 
138     /**
139      * Encodes a String using the refined soundex algorithm.
140      * 
141      * @param pString
142      *                  A String object to encode
143      * @return A Soundex code corresponding to the String supplied
144      */
145     public String encode(String pString) {
146         return soundex(pString);
147     }
148 
149     /**
150      * Returns the mapping code for a given character. The mapping codes are
151      * maintained in an internal char array named soundexMapping, and the
152      * default values of these mappings are US English.
153      * 
154      * @param c
155      *                  char to get mapping for
156      * @return A character (really a numeral) to return for the given char
157      */
158     char getMappingCode(char c) {
159         if (!Character.isLetter(c)) {
160             return 0;
161         }
162         return this.soundexMapping[Character.toUpperCase(c) - 'A'];
163     }
164 
165     /**
166      * Retrieves the Refined Soundex code for a given String object.
167      * 
168      * @param str
169      *                  String to encode using the Refined Soundex algorithm
170      * @return A soundex code for the String supplied
171      */
172     public String soundex(String str) {
173         if (str == null) {
174             return null;
175         }
176         str = SoundexUtils.clean(str);
177         if (str.length() == 0) {
178             return str;
179         }
180 
181         StringBuffer sBuf = new StringBuffer();
182         sBuf.append(str.charAt(0));
183 
184         char last, current;
185         last = '*';
186 
187         for (int i = 0; i < str.length(); i++) {
188 
189             current = getMappingCode(str.charAt(i));
190             if (current == last) {
191                 continue;
192             } else if (current != 0) {
193                 sBuf.append(current);
194             }
195 
196             last = current;
197 
198         }
199 
200         return sBuf.toString();
201     }
202 }