View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */ 
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
25   * 
26   * @author Apache Software Foundation
27   * @version $Id: SoundexUtils.java 480406 2006-11-29 04:56:58Z bayard $
28   * @since 1.3
29   */
30  final class SoundexUtils {
31  
32      /**
33  	 * Cleans up the input string before Soundex processing by only returning
34  	 * upper case letters.
35  	 * 
36  	 * @param str
37  	 *                  The String to clean.
38  	 * @return A clean String.
39  	 */
40      static String clean(String str) {
41          if (str == null || str.length() == 0) {
42              return str;
43          }
44          int len = str.length();
45          char[] chars = new char[len];
46          int count = 0;
47          for (int i = 0; i < len; i++) {
48              if (Character.isLetter(str.charAt(i))) {
49                  chars[count++] = str.charAt(i);
50              }
51          }
52          if (count == len) {
53              return str.toUpperCase();
54          }
55          return new String(chars, 0, count).toUpperCase();
56      }
57  
58      /**
59  	 * Encodes the Strings and returns the number of characters in the two
60  	 * encoded Strings that are the same.
61  	 * <ul>
62  	 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
63  	 * little or no similarity, and 4 indicates strong similarity or identical
64  	 * values.</li>
65  	 * <li>For refined Soundex, the return value can be greater than 4.</li>
66  	 * </ul>
67  	 * 
68  	 * @param encoder
69  	 *                  The encoder to use to encode the Strings.
70  	 * @param s1
71  	 *                  A String that will be encoded and compared.
72  	 * @param s2
73  	 *                  A String that will be encoded and compared.
74  	 * @return The number of characters in the two Soundex encoded Strings that
75  	 *             are the same.
76  	 * 
77  	 * @see #differenceEncoded(String,String)
78  	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
79  	 *          MS T-SQL DIFFERENCE</a>
80  	 * 
81  	 * @throws EncoderException
82  	 *                  if an error occurs encoding one of the strings
83  	 */
84      static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
85          return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
86      }
87  
88      /**
89  	 * Returns the number of characters in the two Soundex encoded Strings that
90  	 * are the same.
91  	 * <ul>
92  	 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
93  	 * little or no similarity, and 4 indicates strong similarity or identical
94  	 * values.</li>
95  	 * <li>For refined Soundex, the return value can be greater than 4.</li>
96  	 * </ul>
97  	 * 
98  	 * @param es1
99  	 *                  An encoded String.
100 	 * @param es2
101 	 *                  An encoded String.
102 	 * @return The number of characters in the two Soundex encoded Strings that
103 	 *             are the same.
104 	 * 
105 	 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
106 	 *          MS T-SQL DIFFERENCE</a>
107 	 */
108     static int differenceEncoded(String es1, String es2) {
109 
110         if (es1 == null || es2 == null) {
111             return 0;
112         }
113         int lengthToMatch = Math.min(es1.length(), es2.length());
114         int diff = 0;
115         for (int i = 0; i < lengthToMatch; i++) {
116             if (es1.charAt(i) == es2.charAt(i)) {
117                 diff++;
118             }
119         }
120         return diff;
121     }
122 
123 }