View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
25   *
26   * <p>This class is immutable and thread-safe.</p>
27   *
28   * @since 1.3
29   */
30  final class SoundexUtils {
31  
32      /**
33       * Cleans up the input string before Soundex processing by only returning
34       * upper case letters.
35       *
36       * @param str
37       *                  The String to clean.
38       * @return A clean String.
39       */
40      static String clean(final String str) {
41          if (isEmpty(str)) {
42              return str;
43          }
44          final int len = str.length();
45          final char[] chars = new char[len];
46          int count = 0;
47          for (int i = 0; i < len; i++) {
48              if (Character.isLetter(str.charAt(i))) {
49                  chars[count++] = str.charAt(i);
50              }
51          }
52          if (count == len) {
53              return str.toUpperCase(java.util.Locale.ENGLISH);
54          }
55          return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
56      }
57  
58      /**
59       * Encodes the Strings and returns the number of characters in the two
60       * encoded Strings that are the same.
61       * <ul>
62       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
63       * little or no similarity, and 4 indicates strong similarity or identical
64       * values.</li>
65       * <li>For refined Soundex, the return value can be greater than 4.</li>
66       * </ul>
67       *
68       * @param encoder
69       *                  The encoder to use to encode the Strings.
70       * @param s1
71       *                  A String that will be encoded and compared.
72       * @param s2
73       *                  A String that will be encoded and compared.
74       * @return The number of characters in the two Soundex encoded Strings that
75       *             are the same.
76       *
77       * @see #differenceEncoded(String,String)
78       * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
79       *          MS T-SQL DIFFERENCE</a>
80       *
81       * @throws EncoderException
82       *                  if an error occurs encoding one of the strings
83       */
84      static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
85          return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
86      }
87  
88      /**
89       * Returns the number of characters in the two Soundex encoded Strings that
90       * are the same.
91       * <ul>
92       * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
93       * little or no similarity, and 4 indicates strong similarity or identical
94       * values.</li>
95       * <li>For refined Soundex, the return value can be greater than 4.</li>
96       * </ul>
97       *
98       * @param es1
99       *                  An encoded String.
100      * @param es2
101      *                  An encoded String.
102      * @return The number of characters in the two Soundex encoded Strings that
103      *             are the same.
104      *
105      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
106      *          MS T-SQL DIFFERENCE</a>
107      */
108     static int differenceEncoded(final String es1, final String es2) {
109 
110         if (es1 == null || es2 == null) {
111             return 0;
112         }
113         final int lengthToMatch = Math.min(es1.length(), es2.length());
114         int diff = 0;
115         for (int i = 0; i < lengthToMatch; i++) {
116             if (es1.charAt(i) == es2.charAt(i)) {
117                 diff++;
118             }
119         }
120         return diff;
121     }
122 
123     /**
124      * <p>Checks if a CharSequence is empty ("") or null.</p>
125      *
126      * <pre>
127      * StringUtils.isEmpty(null)      = true
128      * StringUtils.isEmpty("")        = true
129      * StringUtils.isEmpty(" ")       = false
130      * StringUtils.isEmpty("bob")     = false
131      * StringUtils.isEmpty("  bob  ") = false
132      * </pre>
133      *
134      * @param cs  the CharSequence to check, may be null
135      * @return {@code true} if the CharSequence is empty or null
136      */
137     static boolean isEmpty(final CharSequence cs) {
138         return cs == null || cs.length() == 0;
139     }
140 
141 }