1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import java.util.Locale;
21
22 import org.apache.commons.codec.EncoderException;
23 import org.apache.commons.codec.StringEncoder;
24
25 /**
26 * Encodes a string into a Metaphone value.
27 * <p>
28 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
29 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
30 * </p>
31 * <p>
32 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
33 * p 39.</CITE>
34 * </p>
35 * <p>
36 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
37 * </p>
38 * <ul>
39 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
40 * (broken link 4/30/2013) </li>
41 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
42 * (link checked 4/30/2013) </li>
43 * </ul>
44 * <p>
45 * They have had undocumented changes from the originally published algorithm.
46 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
47 * </p>
48 * <p>
49 * This class is conditionally thread-safe.
50 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
51 * but is not volatile, and accesses are not synchronized.
52 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
53 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
54 * after initial setup.
55 * </p>
56 */
57 public class Metaphone implements StringEncoder {
58
59 /**
60 * Five values in the English language.
61 */
62 private static final String VOWELS = "AEIOU";
63
64 /**
65 * Variable used in Metaphone algorithm.
66 */
67 private static final String FRONTV = "EIY";
68
69 /**
70 * Variable used in Metaphone algorithm.
71 */
72 private static final String VARSON = "CSPTG";
73
74 /**
75 * The max code length for Metaphone is 4.
76 */
77 private int maxCodeLen = 4;
78
79 /**
80 * Constructs a new instance.
81 */
82 public Metaphone() {
83 // empty
84 }
85
86 /**
87 * Encodes an Object using the Metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
88 * EncoderException if the supplied object is not of type {@link String}.
89 *
90 * @param obj Object to encode.
91 * @return An object (or type {@link String}) containing the Metaphone code which corresponds to the String supplied.
92 * @throws EncoderException if the parameter supplied is not of type {@link String}.
93 */
94 @Override
95 public Object encode(final Object obj) throws EncoderException {
96 if (!(obj instanceof String)) {
97 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
98 }
99 return metaphone((String) obj);
100 }
101
102 /**
103 * Encodes a String using the Metaphone algorithm.
104 *
105 * @param str String object to encode
106 * @return The Metaphone code corresponding to the String supplied
107 */
108 @Override
109 public String encode(final String str) {
110 return metaphone(str);
111 }
112
113 /**
114 * Gets the maxCodeLen.
115 *
116 * @return the maxCodeLen.
117 */
118 public int getMaxCodeLen() {
119 return this.maxCodeLen;
120 }
121
122 private boolean isLastChar(final int wdsz, final int n) {
123 return n + 1 == wdsz;
124 }
125
126 /**
127 * Tests is the Metaphones of two strings are identical.
128 *
129 * @param str1 First of two strings to compare.
130 * @param str2 Second of two strings to compare.
131 * @return {@code true} if the Metaphones of these strings are identical, {@code false} otherwise.
132 */
133 public boolean isMetaphoneEqual(final String str1, final String str2) {
134 return metaphone(str1).equals(metaphone(str2));
135 }
136
137 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
138 boolean matches = false;
139 if (index >= 0 && index < string.length() - 1) {
140 matches = string.charAt(index + 1) == c;
141 }
142 return matches;
143 }
144
145 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
146 boolean matches = false;
147 if (index > 0 && index < string.length()) {
148 matches = string.charAt(index - 1) == c;
149 }
150 return matches;
151 }
152
153 private boolean isVowel(final StringBuilder string, final int index) {
154 return VOWELS.indexOf(string.charAt(index)) >= 0;
155 }
156
157 /**
158 * Find the Metaphone value of a String. This is similar to the
159 * Soundex algorithm, but better at finding similar sounding words.
160 * All input is converted to upper case.
161 * Limitations: Input format is expected to be a single ASCII word
162 * with only characters in the A - Z range, no punctuation or numbers.
163 *
164 * @param txt String to find the Metaphone code for.
165 * @return A Metaphone code corresponding to the String supplied.
166 */
167 public String metaphone(final String txt) {
168 boolean hard = false;
169 final int txtLength;
170 if (txt == null || (txtLength = txt.length()) == 0) {
171 return "";
172 }
173 // single character is itself
174 if (txtLength == 1) {
175 return txt.toUpperCase(Locale.ENGLISH);
176 }
177
178 final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray();
179
180 final StringBuilder local = new StringBuilder(40); // manipulate
181 final StringBuilder code = new StringBuilder(10); // output
182 // handle initial 2 characters exceptions
183 switch (inwd[0]) {
184 case 'K':
185 case 'G':
186 case 'P': /* looking for KN, etc */
187 if (inwd[1] == 'N') {
188 local.append(inwd, 1, inwd.length - 1);
189 } else {
190 local.append(inwd);
191 }
192 break;
193 case 'A': /* looking for AE */
194 if (inwd[1] == 'E') {
195 local.append(inwd, 1, inwd.length - 1);
196 } else {
197 local.append(inwd);
198 }
199 break;
200 case 'W': /* looking for WR or WH */
201 if (inwd[1] == 'R') { // WR -> R
202 local.append(inwd, 1, inwd.length - 1);
203 break;
204 }
205 if (inwd[1] == 'H') {
206 local.append(inwd, 1, inwd.length - 1);
207 local.setCharAt(0, 'W'); // WH -> W
208 } else {
209 local.append(inwd);
210 }
211 break;
212 case 'X': /* initial X becomes S */
213 inwd[0] = 'S';
214 local.append(inwd);
215 break;
216 default:
217 local.append(inwd);
218 } // now local has working string with initials fixed
219
220 final int wdsz = local.length();
221 int n = 0;
222
223 while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well
224 final char symb = local.charAt(n);
225 // remove duplicate letters except C
226 if (symb == 'C' || !isPreviousChar(local, n, symb)) {
227 // not dup
228 switch (symb) {
229 case 'A':
230 case 'E':
231 case 'I':
232 case 'O':
233 case 'U':
234 if (n == 0) {
235 code.append(symb);
236 }
237 break; // only use vowel if leading char
238 case 'B':
239 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB
240 break;
241 }
242 code.append(symb);
243 break;
244 case 'C': // lots of C special cases
245 /* discard if SCI, SCE or SCY */
246 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
247 break;
248 }
249 if (regionMatch(local, n, "CIA")) { // "CIA" -> X
250 code.append('X');
251 break;
252 }
253 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
254 code.append('S');
255 break; // CI,CE,CY -> S
256 }
257 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
258 code.append('K');
259 break;
260 }
261 if (!isNextChar(local, n, 'H') || n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
262 code.append('K');
263 } else {
264 code.append('X'); // CHvowel -> X
265 }
266 break;
267 case 'D':
268 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
269 code.append('J');
270 n += 2;
271 } else {
272 code.append('T');
273 }
274 break;
275 case 'G': // GH silent at end or before consonant
276 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
277 break;
278 }
279 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
280 break;
281 }
282 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
283 break; // silent G
284 }
285 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
286 hard = isPreviousChar(local, n, 'G');
287 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
288 code.append('J');
289 } else {
290 code.append('K');
291 }
292 break;
293 case 'H':
294 if (isLastChar(wdsz, n)) {
295 break; // terminal H
296 }
297 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
298 break;
299 }
300 if (isVowel(local, n + 1)) {
301 code.append('H'); // Hvowel
302 }
303 break;
304 case 'F':
305 case 'J':
306 case 'L':
307 case 'M':
308 case 'N':
309 case 'R':
310 code.append(symb);
311 break;
312 case 'K':
313 if (n > 0) { // not initial
314 if (!isPreviousChar(local, n, 'C')) {
315 code.append(symb);
316 }
317 } else {
318 code.append(symb); // initial K
319 }
320 break;
321 case 'P':
322 if (isNextChar(local, n, 'H')) {
323 // PH -> F
324 code.append('F');
325 } else {
326 code.append(symb);
327 }
328 break;
329 case 'Q':
330 code.append('K');
331 break;
332 case 'S':
333 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
334 code.append('X');
335 } else {
336 code.append('S');
337 }
338 break;
339 case 'T':
340 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
341 code.append('X');
342 break;
343 }
344 if (regionMatch(local, n, "TCH")) {
345 // Silent if in "TCH"
346 break;
347 }
348 // substitute numeral 0 for TH (resembles theta after all)
349 if (regionMatch(local, n, "TH")) {
350 code.append('0');
351 } else {
352 code.append('T');
353 }
354 break;
355 case 'V':
356 code.append('F');
357 break;
358 case 'W':
359 case 'Y': // silent if not followed by vowel
360 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
361 code.append(symb);
362 }
363 break;
364 case 'X':
365 code.append('K');
366 code.append('S');
367 break;
368 case 'Z':
369 code.append('S');
370 break;
371 default:
372 // do nothing
373 break;
374 } // end switch
375 } // end else from symb != 'C'
376 n++;
377 if (code.length() > getMaxCodeLen()) {
378 code.setLength(getMaxCodeLen());
379 }
380 }
381 return code.toString();
382 }
383
384 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
385 boolean matches = false;
386 if (index >= 0 && index + test.length() - 1 < string.length()) {
387 final String substring = string.substring(index, index + test.length());
388 matches = substring.equals(test);
389 }
390 return matches;
391 }
392
393 /**
394 * Sets the maxCodeLen.
395 *
396 * @param maxCodeLen The maxCodeLen to set.
397 */
398 public void setMaxCodeLen(final int maxCodeLen) {
399 this.maxCodeLen = maxCodeLen;
400 }
401
402 }