1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import java.util.Locale;
21
22 import org.apache.commons.codec.EncoderException;
23 import org.apache.commons.codec.StringEncoder;
24
25 /**
26 * Encodes a string into a Metaphone value.
27 * <p>
28 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
29 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
30 * </p>
31 * <p>
32 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
33 * p 39.</CITE>
34 * </p>
35 * <p>
36 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
37 * </p>
38 * <ul>
39 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
40 * (broken link 4/30/2013) </li>
41 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
42 * (link checked 4/30/2013) </li>
43 * </ul>
44 * <p>
45 * They have had undocumented changes from the originally published algorithm.
46 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
47 * </p>
48 * <p>
49 * This class is conditionally thread-safe.
50 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
51 * but is not volatile, and accesses are not synchronized.
52 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
53 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
54 * after initial setup.
55 * </p>
56 */
57 public class Metaphone implements StringEncoder {
58
59 /**
60 * Five values in the English language.
61 */
62 private static final String VOWELS = "AEIOU";
63
64 /**
65 * Variable used in Metaphone algorithm.
66 */
67 private static final String FRONTV = "EIY";
68
69 /**
70 * Variable used in Metaphone algorithm.
71 */
72 private static final String VARSON = "CSPTG";
73
74 /**
75 * The max code length for Metaphone is 4.
76 */
77 private int maxCodeLen = 4;
78
79 /**
80 * Constructs a new instance.
81 */
82 public Metaphone() {
83 // empty
84 }
85
86 /**
87 * Encodes an Object using the Metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
88 * EncoderException if the supplied object is not of type {@link String}.
89 *
90 * @param obj Object to encode.
91 * @return An object (or type {@link String}) containing the Metaphone code which corresponds to the String supplied.
92 * @throws EncoderException if the parameter supplied is not of type {@link String}.
93 */
94 @Override
95 public Object encode(final Object obj) throws EncoderException {
96 if (!(obj instanceof String)) {
97 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
98 }
99 return metaphone((String) obj);
100 }
101
102 /**
103 * Encodes a String using the Metaphone algorithm.
104 *
105 * @param str String object to encode.
106 * @return The Metaphone code corresponding to the String supplied.
107 */
108 @Override
109 public String encode(final String str) {
110 return metaphone(str);
111 }
112
113 /**
114 * Gets the maxCodeLen.
115 *
116 * @return the maxCodeLen.
117 */
118 public int getMaxCodeLen() {
119 return this.maxCodeLen;
120 }
121
122 private boolean isLastChar(final int wdsz, final int n) {
123 return n + 1 == wdsz;
124 }
125
126 /**
127 * Tests is the Metaphones of two strings are identical.
128 *
129 * @param str1 First of two strings to compare.
130 * @param str2 Second of two strings to compare.
131 * @return {@code true} if the Metaphones of these strings are identical, {@code false} otherwise.
132 */
133 public boolean isMetaphoneEqual(final String str1, final String str2) {
134 return metaphone(str1).equals(metaphone(str2));
135 }
136
137 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
138 boolean matches = false;
139 if (index >= 0 && index < string.length() - 1) {
140 matches = string.charAt(index + 1) == c;
141 }
142 return matches;
143 }
144
145 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
146 boolean matches = false;
147 if (index > 0 && index < string.length()) {
148 matches = string.charAt(index - 1) == c;
149 }
150 return matches;
151 }
152
153 private boolean isVowel(final StringBuilder string, final int index) {
154 return VOWELS.indexOf(string.charAt(index)) >= 0;
155 }
156
157 /**
158 * Find the Metaphone value of a String. This is similar to the
159 * Soundex algorithm, but better at finding similar sounding words.
160 * All input is converted to upper case.
161 * Limitations: Input format is expected to be a single ASCII word
162 * with only characters in the A - Z range, no punctuation or numbers.
163 *
164 * @param txt String to find the Metaphone code for.
165 * @return A Metaphone code corresponding to the String supplied.
166 */
167 public String metaphone(final String txt) {
168 boolean hard = false;
169 final int txtLength;
170 if (txt == null || (txtLength = txt.length()) == 0) {
171 return "";
172 }
173 // single character is itself
174 if (txtLength == 1) {
175 return txt.toUpperCase(Locale.ENGLISH);
176 }
177
178 final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray();
179
180 final StringBuilder local = new StringBuilder(40); // manipulate
181 final StringBuilder code = new StringBuilder(10); // output
182 // handle initial 2 characters exceptions
183 switch (inwd[0]) {
184 case 'K':
185 case 'G':
186 case 'P': /* looking for KN, etc */
187 if (inwd[1] == 'N') {
188 local.append(inwd, 1, inwd.length - 1);
189 } else {
190 local.append(inwd);
191 }
192 break;
193 case 'A': /* looking for AE */
194 if (inwd[1] == 'E') {
195 local.append(inwd, 1, inwd.length - 1);
196 } else {
197 local.append(inwd);
198 }
199 break;
200 case 'W': /* looking for WR or WH */
201 if (inwd[1] == 'R') { // WR -> R
202 local.append(inwd, 1, inwd.length - 1);
203 break;
204 }
205 if (inwd[1] == 'H') {
206 local.append(inwd, 1, inwd.length - 1);
207 local.setCharAt(0, 'W'); // WH -> W
208 } else {
209 local.append(inwd);
210 }
211 break;
212 case 'X': /* initial X becomes S */
213 inwd[0] = 'S';
214 local.append(inwd);
215 break;
216 default:
217 local.append(inwd);
218 } // now local has working string with initials fixed
219
220 final int wdsz = local.length();
221 int n = 0;
222
223 while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
224 final char symb = local.charAt(n);
225 // remove duplicate letters except C
226 if (symb == 'C' || !isPreviousChar(local, n, symb)) {
227 // not dup
228 switch (symb) {
229 case 'A':
230 case 'E':
231 case 'I':
232 case 'O':
233 case 'U':
234 if (n == 0) {
235 code.append(symb);
236 }
237 break; // only use vowel if leading char
238 case 'B':
239 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
240 break;
241 }
242 code.append(symb);
243 break;
244 case 'C': // lots of C special cases
245 /* discard if SCI, SCE or SCY */
246 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247 break;
248 }
249 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
250 code.append('K');
251 break;
252 }
253 if (regionMatch(local, n, "CIA") || isNextChar(local, n, 'H')) { // "CIA" -> X or CH -> X
254 code.append('X');
255 break;
256 }
257 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
258 code.append('S');
259 break; // CI,CE,CY -> S
260 }
261 code.append('K'); // default C -> K
262 break;
263 case 'D':
264 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
265 code.append('J');
266 n += 2;
267 } else {
268 code.append('T');
269 }
270 break;
271 case 'G': // GH silent at end or before consonant
272 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
273 break;
274 }
275 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
276 break;
277 }
278 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
279 break; // silent G
280 }
281 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
282 hard = isPreviousChar(local, n, 'G');
283 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
284 code.append('J');
285 } else {
286 code.append('K');
287 }
288 break;
289 case 'H':
290 if (isLastChar(wdsz, n)) {
291 break; // terminal H
292 }
293 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
294 break;
295 }
296 if (isVowel(local, n + 1)) {
297 code.append('H'); // Hvowel
298 }
299 break;
300 case 'F':
301 case 'J':
302 case 'L':
303 case 'M':
304 case 'N':
305 case 'R':
306 code.append(symb);
307 break;
308 case 'K':
309 if (n > 0) { // not initial
310 if (!isPreviousChar(local, n, 'C')) {
311 code.append(symb);
312 }
313 } else {
314 code.append(symb); // initial K
315 }
316 break;
317 case 'P':
318 if (isNextChar(local, n, 'H')) {
319 // PH -> F
320 code.append('F');
321 } else {
322 code.append(symb);
323 }
324 break;
325 case 'Q':
326 code.append('K');
327 break;
328 case 'S':
329 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
330 code.append('X');
331 } else {
332 code.append('S');
333 }
334 break;
335 case 'T':
336 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
337 code.append('X');
338 break;
339 }
340 if (regionMatch(local, n, "TCH")) {
341 // Silent if in "TCH"
342 break;
343 }
344 // substitute numeral 0 for TH (resembles theta after all)
345 if (regionMatch(local, n, "TH")) {
346 code.append('0');
347 } else {
348 code.append('T');
349 }
350 break;
351 case 'V':
352 code.append('F');
353 break;
354 case 'W':
355 case 'Y': // silent if not followed by vowel
356 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
357 code.append(symb);
358 }
359 break;
360 case 'X':
361 code.append('K');
362 code.append('S');
363 break;
364 case 'Z':
365 code.append('S');
366 break;
367 default:
368 // do nothing
369 break;
370 } // end switch
371 } // end else from symb != 'C'
372 n++;
373 if (code.length() > getMaxCodeLen()) {
374 code.setLength(getMaxCodeLen());
375 }
376 }
377 return code.toString();
378 }
379
380 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
381 boolean matches = false;
382 if (index >= 0 && index + test.length() - 1 < string.length()) {
383 final String substring = string.substring(index, index + test.length());
384 matches = substring.equals(test);
385 }
386 return matches;
387 }
388
389 /**
390 * Sets the maxCodeLen.
391 *
392 * @param maxCodeLen The maxCodeLen to set.
393 */
394 public void setMaxCodeLen(final int maxCodeLen) {
395 this.maxCodeLen = maxCodeLen;
396 }
397
398 }