1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import java.util.Locale;
21
22 import org.apache.commons.codec.EncoderException;
23 import org.apache.commons.codec.StringEncoder;
24
25 /**
26 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
27 *
28 * <p>This class is immutable and thread-safe.</p>
29 *
30 * @since 1.3
31 */
32 final class SoundexUtils {
33
34 /**
35 * Cleans up the input string before Soundex processing by only returning
36 * upper case letters.
37 *
38 * @param str
39 * The String to clean.
40 * @return A clean String.
41 */
42 static String clean(final String str) {
43 if (isEmpty(str)) {
44 return str;
45 }
46 final int len = str.length();
47 final char[] chars = new char[len];
48 int count = 0;
49 for (int i = 0; i < len; i++) {
50 if (Character.isLetter(str.charAt(i))) {
51 chars[count++] = str.charAt(i);
52 }
53 }
54 if (count == len) {
55 return str.toUpperCase(Locale.ENGLISH);
56 }
57 return new String(chars, 0, count).toUpperCase(Locale.ENGLISH);
58 }
59
60 /**
61 * Encodes the Strings and returns the number of characters in the two
62 * encoded Strings that are the same.
63 * <ul>
64 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
65 * little or no similarity, and 4 indicates strong similarity or identical
66 * values.</li>
67 * <li>For refined Soundex, the return value can be greater than 4.</li>
68 * </ul>
69 *
70 * @param encoder
71 * The encoder to use to encode the Strings.
72 * @param s1
73 * A String that will be encoded and compared.
74 * @param s2
75 * A String that will be encoded and compared.
76 * @return The number of characters in the two Soundex encoded Strings that
77 * are the same.
78 *
79 * @see #differenceEncoded(String,String)
80 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
81 * MS T-SQL DIFFERENCE</a>
82 *
83 * @throws EncoderException
84 * if an error occurs encoding one of the strings
85 */
86 static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
87 return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
88 }
89
90 /**
91 * Returns the number of characters in the two Soundex encoded Strings that
92 * are the same.
93 * <ul>
94 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
95 * little or no similarity, and 4 indicates strong similarity or identical
96 * values.</li>
97 * <li>For refined Soundex, the return value can be greater than 4.</li>
98 * </ul>
99 *
100 * @param es1
101 * An encoded String.
102 * @param es2
103 * An encoded String.
104 * @return The number of characters in the two Soundex encoded Strings that
105 * are the same.
106 *
107 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108 * MS T-SQL DIFFERENCE</a>
109 */
110 static int differenceEncoded(final String es1, final String es2) {
111
112 if (es1 == null || es2 == null) {
113 return 0;
114 }
115 final int lengthToMatch = Math.min(es1.length(), es2.length());
116 int diff = 0;
117 for (int i = 0; i < lengthToMatch; i++) {
118 if (es1.charAt(i) == es2.charAt(i)) {
119 diff++;
120 }
121 }
122 return diff;
123 }
124
125 /**
126 * <p>Checks if a CharSequence is empty ("") or null.</p>
127 *
128 * <pre>
129 * StringUtils.isEmpty(null) = true
130 * StringUtils.isEmpty("") = true
131 * StringUtils.isEmpty(" ") = false
132 * StringUtils.isEmpty("bob") = false
133 * StringUtils.isEmpty(" bob ") = false
134 * </pre>
135 *
136 * @param cs the CharSequence to check, may be null.
137 * @return {@code true} if the CharSequence is empty or null.
138 */
139 static boolean isEmpty(final CharSequence cs) {
140 return cs == null || cs.length() == 0;
141 }
142
143 }