/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.Locale;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

23  /**
24   * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
25   *
26   * <p>This class is immutable and thread-safe.</p>
27   *
28   * @version $Id: SoundexUtils.java 1429868 2013-01-07 16:08:05Z ggregory $
29   * @since 1.3
30   */
31  final class SoundexUtils {
32  
33      /**
34       * Cleans up the input string before Soundex processing by only returning
35       * upper case letters.
36       *
37       * @param str
38       *                  The String to clean.
39       * @return A clean String.
40       */
41      static String clean(final String str) {
42          if (str == null || str.length() == 0) {
43              return str;
44          }
45          final int len = str.length();
46          final char[] chars = new char[len];
47          int count = 0;
48          for (int i = 0; i < len; i++) {
49              if (Character.isLetter(str.charAt(i))) {
50                  chars[count++] = str.charAt(i);
51              }
52          }
53          if (count == len) {
54              return str.toUpperCase(java.util.Locale.ENGLISH);
55          }
56          return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
57      }
58  
59      /**
60       * Encodes the Strings and returns the number of characters in the two
61       * encoded Strings that are the same.
62       * <ul>
63       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
64       * little or no similarity, and 4 indicates strong similarity or identical
65       * values.</li>
66       * <li>For refined Soundex, the return value can be greater than 4.</li>
67       * </ul>
68       *
69       * @param encoder
70       *                  The encoder to use to encode the Strings.
71       * @param s1
72       *                  A String that will be encoded and compared.
73       * @param s2
74       *                  A String that will be encoded and compared.
75       * @return The number of characters in the two Soundex encoded Strings that
76       *             are the same.
77       *
78       * @see #differenceEncoded(String,String)
79       * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
80       *          MS T-SQL DIFFERENCE</a>
81       *
82       * @throws EncoderException
83       *                  if an error occurs encoding one of the strings
84       */
85      static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
86          return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
87      }
88  
89      /**
90       * Returns the number of characters in the two Soundex encoded Strings that
91       * are the same.
92       * <ul>
93       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
94       * little or no similarity, and 4 indicates strong similarity or identical
95       * values.</li>
96       * <li>For refined Soundex, the return value can be greater than 4.</li>
97       * </ul>
98       *
99       * @param es1
100      *                  An encoded String.
101      * @param es2
102      *                  An encoded String.
103      * @return The number of characters in the two Soundex encoded Strings that
104      *             are the same.
105      *
106      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
107      *          MS T-SQL DIFFERENCE</a>
108      */
109     static int differenceEncoded(final String es1, final String es2) {
110 
111         if (es1 == null || es2 == null) {
112             return 0;
113         }
114         final int lengthToMatch = Math.min(es1.length(), es2.length());
115         int diff = 0;
116         for (int i = 0; i < lengthToMatch; i++) {
117             if (es1.charAt(i) == es2.charAt(i)) {
118                 diff++;
119             }
120         }
121         return diff;
122     }
123 
124 }