View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import java.util.Locale;
21  
22  import org.apache.commons.codec.EncoderException;
23  import org.apache.commons.codec.StringEncoder;
24  
25  /**
26   * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
27   *
28   * <p>This class is immutable and thread-safe.</p>
29   *
30   * @since 1.3
31   */
32  final class SoundexUtils {
33  
34      /**
35       * Cleans up the input string before Soundex processing by only returning
36       * upper case letters.
37       *
38       * @param str
39       *                  The String to clean.
40       * @return A clean String.
41       */
42      static String clean(final String str) {
43          if (isEmpty(str)) {
44              return str;
45          }
46          final int len = str.length();
47          final char[] chars = new char[len];
48          int count = 0;
49          for (int i = 0; i < len; i++) {
50              if (Character.isLetter(str.charAt(i))) {
51                  chars[count++] = str.charAt(i);
52              }
53          }
54          if (count == len) {
55              return str.toUpperCase(Locale.ENGLISH);
56          }
57          return new String(chars, 0, count).toUpperCase(Locale.ENGLISH);
58      }
59  
60      /**
61       * Encodes the Strings and returns the number of characters in the two
62       * encoded Strings that are the same.
63       * <ul>
64       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
65       * little or no similarity, and 4 indicates strong similarity or identical
66       * values.</li>
67       * <li>For refined Soundex, the return value can be greater than 4.</li>
68       * </ul>
69       *
70       * @param encoder
71       *                  The encoder to use to encode the Strings.
72       * @param s1
73       *                  A String that will be encoded and compared.
74       * @param s2
75       *                  A String that will be encoded and compared.
76       * @return The number of characters in the two Soundex encoded Strings that
77       *             are the same.
78       *
79       * @see #differenceEncoded(String,String)
80       * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
81       *          MS T-SQL DIFFERENCE</a>
82       *
83       * @throws EncoderException
84       *                  if an error occurs encoding one of the strings
85       */
86      static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
87          return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
88      }
89  
90      /**
91       * Returns the number of characters in the two Soundex encoded Strings that
92       * are the same.
93       * <ul>
94       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
95       * little or no similarity, and 4 indicates strong similarity or identical
96       * values.</li>
97       * <li>For refined Soundex, the return value can be greater than 4.</li>
98       * </ul>
99       *
100      * @param es1
101      *                  An encoded String.
102      * @param es2
103      *                  An encoded String.
104      * @return The number of characters in the two Soundex encoded Strings that
105      *             are the same.
106      *
107      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108      *          MS T-SQL DIFFERENCE</a>
109      */
110     static int differenceEncoded(final String es1, final String es2) {
111 
112         if (es1 == null || es2 == null) {
113             return 0;
114         }
115         final int lengthToMatch = Math.min(es1.length(), es2.length());
116         int diff = 0;
117         for (int i = 0; i < lengthToMatch; i++) {
118             if (es1.charAt(i) == es2.charAt(i)) {
119                 diff++;
120             }
121         }
122         return diff;
123     }
124 
125     /**
126      * <p>Checks if a CharSequence is empty ("") or null.</p>
127      *
128      * <pre>
129      * StringUtils.isEmpty(null)      = true
130      * StringUtils.isEmpty("")        = true
131      * StringUtils.isEmpty(" ")       = false
132      * StringUtils.isEmpty("bob")     = false
133      * StringUtils.isEmpty("  bob  ") = false
134      * </pre>
135      *
136      * @param cs  the CharSequence to check, may be null.
137      * @return {@code true} if the CharSequence is empty or null.
138      */
139     static boolean isEmpty(final CharSequence cs) {
140         return cs == null || cs.length() == 0;
141     }
142 
143 }