View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
25   * 
26   * @author Apache Software Foundation
27   * @version $Id: SoundexUtils.java 1157192 2011-08-12 17:27:38Z ggregory $
28   * @since 1.3
29   */
30  final class SoundexUtils {
31  
32      /**
33       * Cleans up the input string before Soundex processing by only returning
34       * upper case letters.
35       * 
36       * @param str
37       *                  The String to clean.
38       * @return A clean String.
39       */
40      static String clean(String str) {
41          if (str == null || str.length() == 0) {
42              return str;
43          }
44          int len = str.length();
45          char[] chars = new char[len];
46          int count = 0;
47          for (int i = 0; i < len; i++) {
48              if (Character.isLetter(str.charAt(i))) {
49                  chars[count++] = str.charAt(i);
50              }
51          }
52          if (count == len) {
53              return str.toUpperCase(java.util.Locale.ENGLISH);
54          }
55          return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
56      }
57  
58      /**
59       * Encodes the Strings and returns the number of characters in the two
60       * encoded Strings that are the same.
61       * <ul>
62       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
63       * little or no similarity, and 4 indicates strong similarity or identical
64       * values.</li>
65       * <li>For refined Soundex, the return value can be greater than 4.</li>
66       * </ul>
67       * 
68       * @param encoder
69       *                  The encoder to use to encode the Strings.
70       * @param s1
71       *                  A String that will be encoded and compared.
72       * @param s2
73       *                  A String that will be encoded and compared.
74       * @return The number of characters in the two Soundex encoded Strings that
75       *             are the same.
76       * 
77       * @see #differenceEncoded(String,String)
78       * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
79       *          MS T-SQL DIFFERENCE</a>
80       * 
81       * @throws EncoderException
82       *                  if an error occurs encoding one of the strings
83       */
84      static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
85          return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
86      }
87  
88      /**
89       * Returns the number of characters in the two Soundex encoded Strings that
90       * are the same.
91       * <ul>
92       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
93       * little or no similarity, and 4 indicates strong similarity or identical
94       * values.</li>
95       * <li>For refined Soundex, the return value can be greater than 4.</li>
96       * </ul>
97       * 
98       * @param es1
99       *                  An encoded String.
100      * @param es2
101      *                  An encoded String.
102      * @return The number of characters in the two Soundex encoded Strings that
103      *             are the same.
104      * 
105      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
106      *          MS T-SQL DIFFERENCE</a>
107      */
108     static int differenceEncoded(String es1, String es2) {
109 
110         if (es1 == null || es2 == null) {
111             return 0;
112         }
113         int lengthToMatch = Math.min(es1.length(), es2.length());
114         int diff = 0;
115         for (int i = 0; i < lengthToMatch; i++) {
116             if (es1.charAt(i) == es2.charAt(i)) {
117                 diff++;
118             }
119         }
120         return diff;
121     }
122 
123 }