1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
25 *
26 * @author Apache Software Foundation
27 * @version $Id: SoundexUtils.java 480406 2006-11-29 04:56:58Z bayard $
28 * @since 1.3
29 */
30 final class SoundexUtils {
31
32 /**
33 * Cleans up the input string before Soundex processing by only returning
34 * upper case letters.
35 *
36 * @param str
37 * The String to clean.
38 * @return A clean String.
39 */
40 static String clean(String str) {
41 if (str == null || str.length() == 0) {
42 return str;
43 }
44 int len = str.length();
45 char[] chars = new char[len];
46 int count = 0;
47 for (int i = 0; i < len; i++) {
48 if (Character.isLetter(str.charAt(i))) {
49 chars[count++] = str.charAt(i);
50 }
51 }
52 if (count == len) {
53 return str.toUpperCase();
54 }
55 return new String(chars, 0, count).toUpperCase();
56 }
57
58 /**
59 * Encodes the Strings and returns the number of characters in the two
60 * encoded Strings that are the same.
61 * <ul>
62 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
63 * little or no similarity, and 4 indicates strong similarity or identical
64 * values.</li>
65 * <li>For refined Soundex, the return value can be greater than 4.</li>
66 * </ul>
67 *
68 * @param encoder
69 * The encoder to use to encode the Strings.
70 * @param s1
71 * A String that will be encoded and compared.
72 * @param s2
73 * A String that will be encoded and compared.
74 * @return The number of characters in the two Soundex encoded Strings that
75 * are the same.
76 *
77 * @see #differenceEncoded(String,String)
78 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
79 * MS T-SQL DIFFERENCE</a>
80 *
81 * @throws EncoderException
82 * if an error occurs encoding one of the strings
83 */
84 static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
85 return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
86 }
87
88 /**
89 * Returns the number of characters in the two Soundex encoded Strings that
90 * are the same.
91 * <ul>
92 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
93 * little or no similarity, and 4 indicates strong similarity or identical
94 * values.</li>
95 * <li>For refined Soundex, the return value can be greater than 4.</li>
96 * </ul>
97 *
98 * @param es1
99 * An encoded String.
100 * @param es2
101 * An encoded String.
102 * @return The number of characters in the two Soundex encoded Strings that
103 * are the same.
104 *
105 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
106 * MS T-SQL DIFFERENCE</a>
107 */
108 static int differenceEncoded(String es1, String es2) {
109
110 if (es1 == null || es2 == null) {
111 return 0;
112 }
113 int lengthToMatch = Math.min(es1.length(), es2.length());
114 int diff = 0;
115 for (int i = 0; i < lengthToMatch; i++) {
116 if (es1.charAt(i) == es2.charAt(i)) {
117 diff++;
118 }
119 }
120 return diff;
121 }
122
123 }