1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a Metaphone value.
25 * <p>
26 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28 * <p>
29 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
30 * p 39.</CITE>
31 * <p>
32 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
33 * </p>
34 * <ul>
35 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </li>
36 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </li>
37 * </ul>
38 * <p>
39 * They have had undocumented changes from the originally published algorithm.
40 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
41 * <p>
42 * This class is conditionally thread-safe.
43 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
44 * but is not volatile, and accesses are not synchronized.
45 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
46 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
47 * after initial setup.
48 *
49 * @version $Id: Metaphone.java 1477596 2013-04-30 12:42:29Z ggregory $
50 */
51 public class Metaphone implements StringEncoder {
52
53 /**
54 * Five values in the English language
55 */
56 private static final String VOWELS = "AEIOU";
57
58 /**
59 * Variable used in Metaphone algorithm
60 */
61 private static final String FRONTV = "EIY";
62
63 /**
64 * Variable used in Metaphone algorithm
65 */
66 private static final String VARSON = "CSPTG";
67
68 /**
69 * The max code length for metaphone is 4
70 */
71 private int maxCodeLen = 4;
72
73 /**
74 * Creates an instance of the Metaphone encoder
75 */
76 public Metaphone() {
77 super();
78 }
79
80 /**
81 * Find the metaphone value of a String. This is similar to the
82 * soundex algorithm, but better at finding similar sounding words.
83 * All input is converted to upper case.
84 * Limitations: Input format is expected to be a single ASCII word
85 * with only characters in the A - Z range, no punctuation or numbers.
86 *
87 * @param txt String to find the metaphone code for
88 * @return A metaphone code corresponding to the String supplied
89 */
90 public String metaphone(final String txt) {
91 boolean hard = false;
92 if (txt == null || txt.length() == 0) {
93 return "";
94 }
95 // single character is itself
96 if (txt.length() == 1) {
97 return txt.toUpperCase(java.util.Locale.ENGLISH);
98 }
99
100 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
101
102 final StringBuilder local = new StringBuilder(40); // manipulate
103 final StringBuilder code = new StringBuilder(10); // output
104 // handle initial 2 characters exceptions
105 switch(inwd[0]) {
106 case 'K':
107 case 'G':
108 case 'P': /* looking for KN, etc*/
109 if (inwd[1] == 'N') {
110 local.append(inwd, 1, inwd.length - 1);
111 } else {
112 local.append(inwd);
113 }
114 break;
115 case 'A': /* looking for AE */
116 if (inwd[1] == 'E') {
117 local.append(inwd, 1, inwd.length - 1);
118 } else {
119 local.append(inwd);
120 }
121 break;
122 case 'W': /* looking for WR or WH */
123 if (inwd[1] == 'R') { // WR -> R
124 local.append(inwd, 1, inwd.length - 1);
125 break;
126 }
127 if (inwd[1] == 'H') {
128 local.append(inwd, 1, inwd.length - 1);
129 local.setCharAt(0, 'W'); // WH -> W
130 } else {
131 local.append(inwd);
132 }
133 break;
134 case 'X': /* initial X becomes S */
135 inwd[0] = 'S';
136 local.append(inwd);
137 break;
138 default:
139 local.append(inwd);
140 } // now local has working string with initials fixed
141
142 final int wdsz = local.length();
143 int n = 0;
144
145 while (code.length() < this.getMaxCodeLen() &&
146 n < wdsz ) { // max code size of 4 works well
147 final char symb = local.charAt(n);
148 // remove duplicate letters except C
149 if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
150 n++;
151 } else { // not dup
152 switch(symb) {
153 case 'A':
154 case 'E':
155 case 'I':
156 case 'O':
157 case 'U':
158 if (n == 0) {
159 code.append(symb);
160 }
161 break; // only use vowel if leading char
162 case 'B':
163 if ( isPreviousChar(local, n, 'M') &&
164 isLastChar(wdsz, n) ) { // B is silent if word ends in MB
165 break;
166 }
167 code.append(symb);
168 break;
169 case 'C': // lots of C special cases
170 /* discard if SCI, SCE or SCY */
171 if ( isPreviousChar(local, n, 'S') &&
172 !isLastChar(wdsz, n) &&
173 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
174 break;
175 }
176 if (regionMatch(local, n, "CIA")) { // "CIA" -> X
177 code.append('X');
178 break;
179 }
180 if (!isLastChar(wdsz, n) &&
181 FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
182 code.append('S');
183 break; // CI,CE,CY -> S
184 }
185 if (isPreviousChar(local, n, 'S') &&
186 isNextChar(local, n, 'H') ) { // SCH->sk
187 code.append('K');
188 break;
189 }
190 if (isNextChar(local, n, 'H')) { // detect CH
191 if (n == 0 &&
192 wdsz >= 3 &&
193 isVowel(local,2) ) { // CH consonant -> K consonant
194 code.append('K');
195 } else {
196 code.append('X'); // CHvowel -> X
197 }
198 } else {
199 code.append('K');
200 }
201 break;
202 case 'D':
203 if (!isLastChar(wdsz, n + 1) &&
204 isNextChar(local, n, 'G') &&
205 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
206 code.append('J'); n += 2;
207 } else {
208 code.append('T');
209 }
210 break;
211 case 'G': // GH silent at end or before consonant
212 if (isLastChar(wdsz, n + 1) &&
213 isNextChar(local, n, 'H')) {
214 break;
215 }
216 if (!isLastChar(wdsz, n + 1) &&
217 isNextChar(local,n,'H') &&
218 !isVowel(local,n+2)) {
219 break;
220 }
221 if (n > 0 &&
222 ( regionMatch(local, n, "GN") ||
223 regionMatch(local, n, "GNED") ) ) {
224 break; // silent G
225 }
226 if (isPreviousChar(local, n, 'G')) {
227 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
228 hard = true;
229 } else {
230 hard = false;
231 }
232 if (!isLastChar(wdsz, n) &&
233 FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
234 !hard) {
235 code.append('J');
236 } else {
237 code.append('K');
238 }
239 break;
240 case 'H':
241 if (isLastChar(wdsz, n)) {
242 break; // terminal H
243 }
244 if (n > 0 &&
245 VARSON.indexOf(local.charAt(n - 1)) >= 0) {
246 break;
247 }
248 if (isVowel(local,n+1)) {
249 code.append('H'); // Hvowel
250 }
251 break;
252 case 'F':
253 case 'J':
254 case 'L':
255 case 'M':
256 case 'N':
257 case 'R':
258 code.append(symb);
259 break;
260 case 'K':
261 if (n > 0) { // not initial
262 if (!isPreviousChar(local, n, 'C')) {
263 code.append(symb);
264 }
265 } else {
266 code.append(symb); // initial K
267 }
268 break;
269 case 'P':
270 if (isNextChar(local,n,'H')) {
271 // PH -> F
272 code.append('F');
273 } else {
274 code.append(symb);
275 }
276 break;
277 case 'Q':
278 code.append('K');
279 break;
280 case 'S':
281 if (regionMatch(local,n,"SH") ||
282 regionMatch(local,n,"SIO") ||
283 regionMatch(local,n,"SIA")) {
284 code.append('X');
285 } else {
286 code.append('S');
287 }
288 break;
289 case 'T':
290 if (regionMatch(local,n,"TIA") ||
291 regionMatch(local,n,"TIO")) {
292 code.append('X');
293 break;
294 }
295 if (regionMatch(local,n,"TCH")) {
296 // Silent if in "TCH"
297 break;
298 }
299 // substitute numeral 0 for TH (resembles theta after all)
300 if (regionMatch(local,n,"TH")) {
301 code.append('0');
302 } else {
303 code.append('T');
304 }
305 break;
306 case 'V':
307 code.append('F'); break;
308 case 'W':
309 case 'Y': // silent if not followed by vowel
310 if (!isLastChar(wdsz,n) &&
311 isVowel(local,n+1)) {
312 code.append(symb);
313 }
314 break;
315 case 'X':
316 code.append('K');
317 code.append('S');
318 break;
319 case 'Z':
320 code.append('S');
321 break;
322 default:
323 // do nothing
324 break;
325 } // end switch
326 n++;
327 } // end else from symb != 'C'
328 if (code.length() > this.getMaxCodeLen()) {
329 code.setLength(this.getMaxCodeLen());
330 }
331 }
332 return code.toString();
333 }
334
335 private boolean isVowel(final StringBuilder string, final int index) {
336 return VOWELS.indexOf(string.charAt(index)) >= 0;
337 }
338
339 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
340 boolean matches = false;
341 if( index > 0 &&
342 index < string.length() ) {
343 matches = string.charAt(index - 1) == c;
344 }
345 return matches;
346 }
347
348 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
349 boolean matches = false;
350 if( index >= 0 &&
351 index < string.length() - 1 ) {
352 matches = string.charAt(index + 1) == c;
353 }
354 return matches;
355 }
356
357 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
358 boolean matches = false;
359 if( index >= 0 &&
360 index + test.length() - 1 < string.length() ) {
361 final String substring = string.substring( index, index + test.length());
362 matches = substring.equals( test );
363 }
364 return matches;
365 }
366
367 private boolean isLastChar(final int wdsz, final int n) {
368 return n + 1 == wdsz;
369 }
370
371
372 /**
373 * Encodes an Object using the metaphone algorithm. This method
374 * is provided in order to satisfy the requirements of the
375 * Encoder interface, and will throw an EncoderException if the
376 * supplied object is not of type java.lang.String.
377 *
378 * @param obj Object to encode
379 * @return An object (or type java.lang.String) containing the
380 * metaphone code which corresponds to the String supplied.
381 * @throws EncoderException if the parameter supplied is not
382 * of type java.lang.String
383 */
384 @Override
385 public Object encode(final Object obj) throws EncoderException {
386 if (!(obj instanceof String)) {
387 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
388 }
389 return metaphone((String) obj);
390 }
391
392 /**
393 * Encodes a String using the Metaphone algorithm.
394 *
395 * @param str String object to encode
396 * @return The metaphone code corresponding to the String supplied
397 */
398 @Override
399 public String encode(final String str) {
400 return metaphone(str);
401 }
402
403 /**
404 * Tests is the metaphones of two strings are identical.
405 *
406 * @param str1 First of two strings to compare
407 * @param str2 Second of two strings to compare
408 * @return {@code true} if the metaphones of these strings are identical,
409 * {@code false} otherwise.
410 */
411 public boolean isMetaphoneEqual(final String str1, final String str2) {
412 return metaphone(str1).equals(metaphone(str2));
413 }
414
415 /**
416 * Returns the maxCodeLen.
417 * @return int
418 */
419 public int getMaxCodeLen() { return this.maxCodeLen; }
420
421 /**
422 * Sets the maxCodeLen.
423 * @param maxCodeLen The maxCodeLen to set
424 */
425 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
426
427 }