1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
25 *
26 * <p>This class is immutable and thread-safe.</p>
27 *
28 * @version $Id: SoundexUtils.java 1429868 2013-01-07 16:08:05Z ggregory $
29 * @since 1.3
30 */
31 final class SoundexUtils {
32
33 /**
34 * Cleans up the input string before Soundex processing by only returning
35 * upper case letters.
36 *
37 * @param str
38 * The String to clean.
39 * @return A clean String.
40 */
41 static String clean(final String str) {
42 if (str == null || str.length() == 0) {
43 return str;
44 }
45 final int len = str.length();
46 final char[] chars = new char[len];
47 int count = 0;
48 for (int i = 0; i < len; i++) {
49 if (Character.isLetter(str.charAt(i))) {
50 chars[count++] = str.charAt(i);
51 }
52 }
53 if (count == len) {
54 return str.toUpperCase(java.util.Locale.ENGLISH);
55 }
56 return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
57 }
58
59 /**
60 * Encodes the Strings and returns the number of characters in the two
61 * encoded Strings that are the same.
62 * <ul>
63 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
64 * little or no similarity, and 4 indicates strong similarity or identical
65 * values.</li>
66 * <li>For refined Soundex, the return value can be greater than 4.</li>
67 * </ul>
68 *
69 * @param encoder
70 * The encoder to use to encode the Strings.
71 * @param s1
72 * A String that will be encoded and compared.
73 * @param s2
74 * A String that will be encoded and compared.
75 * @return The number of characters in the two Soundex encoded Strings that
76 * are the same.
77 *
78 * @see #differenceEncoded(String,String)
79 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
80 * MS T-SQL DIFFERENCE</a>
81 *
82 * @throws EncoderException
83 * if an error occurs encoding one of the strings
84 */
85 static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
86 return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
87 }
88
89 /**
90 * Returns the number of characters in the two Soundex encoded Strings that
91 * are the same.
92 * <ul>
93 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
94 * little or no similarity, and 4 indicates strong similarity or identical
95 * values.</li>
96 * <li>For refined Soundex, the return value can be greater than 4.</li>
97 * </ul>
98 *
99 * @param es1
100 * An encoded String.
101 * @param es2
102 * An encoded String.
103 * @return The number of characters in the two Soundex encoded Strings that
104 * are the same.
105 *
106 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
107 * MS T-SQL DIFFERENCE</a>
108 */
109 static int differenceEncoded(final String es1, final String es2) {
110
111 if (es1 == null || es2 == null) {
112 return 0;
113 }
114 final int lengthToMatch = Math.min(es1.length(), es2.length());
115 int diff = 0;
116 for (int i = 0; i < lengthToMatch; i++) {
117 if (es1.charAt(i) == es2.charAt(i)) {
118 diff++;
119 }
120 }
121 return diff;
122 }
123
124 }