1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a Metaphone value.
25 * <p>
26 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28 * <p>
29 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
30 * p 39.</CITE>
31 * <p>
32 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl
33 * <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
34 * They have had undocumented changes from the originally published algorithm.
35 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
36 * <p>
37 * This class is conditionally thread-safe.
38 * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
39 * but is not volatile, and accesses are not synchronised.
40 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronisation
41 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
42 * after initial setup.
43 *
44 * @version $Id: Metaphone.html 889935 2013-12-11 05:05:13Z ggregory $
45 */
46 public class Metaphone implements StringEncoder {
47
48 /**
49 * Five values in the English language
50 */
51 private static final String VOWELS = "AEIOU";
52
53 /**
54 * Variable used in Metaphone algorithm
55 */
56 private static final String FRONTV = "EIY";
57
58 /**
59 * Variable used in Metaphone algorithm
60 */
61 private static final String VARSON = "CSPTG";
62
63 /**
64 * The max code length for metaphone is 4
65 */
66 private int maxCodeLen = 4;
67
68 /**
69 * Creates an instance of the Metaphone encoder
70 */
71 public Metaphone() {
72 super();
73 }
74
75 /**
76 * Find the metaphone value of a String. This is similar to the
77 * soundex algorithm, but better at finding similar sounding words.
78 * All input is converted to upper case.
79 * Limitations: Input format is expected to be a single ASCII word
80 * with only characters in the A - Z range, no punctuation or numbers.
81 *
82 * @param txt String to find the metaphone code for
83 * @return A metaphone code corresponding to the String supplied
84 */
85 public String metaphone(final String txt) {
86 boolean hard = false;
87 if (txt == null || txt.length() == 0) {
88 return "";
89 }
90 // single character is itself
91 if (txt.length() == 1) {
92 return txt.toUpperCase(java.util.Locale.ENGLISH);
93 }
94
95 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
96
97 final StringBuilder local = new StringBuilder(40); // manipulate
98 final StringBuilder code = new StringBuilder(10); // output
99 // handle initial 2 characters exceptions
100 switch(inwd[0]) {
101 case 'K':
102 case 'G':
103 case 'P': /* looking for KN, etc*/
104 if (inwd[1] == 'N') {
105 local.append(inwd, 1, inwd.length - 1);
106 } else {
107 local.append(inwd);
108 }
109 break;
110 case 'A': /* looking for AE */
111 if (inwd[1] == 'E') {
112 local.append(inwd, 1, inwd.length - 1);
113 } else {
114 local.append(inwd);
115 }
116 break;
117 case 'W': /* looking for WR or WH */
118 if (inwd[1] == 'R') { // WR -> R
119 local.append(inwd, 1, inwd.length - 1);
120 break;
121 }
122 if (inwd[1] == 'H') {
123 local.append(inwd, 1, inwd.length - 1);
124 local.setCharAt(0, 'W'); // WH -> W
125 } else {
126 local.append(inwd);
127 }
128 break;
129 case 'X': /* initial X becomes S */
130 inwd[0] = 'S';
131 local.append(inwd);
132 break;
133 default:
134 local.append(inwd);
135 } // now local has working string with initials fixed
136
137 final int wdsz = local.length();
138 int n = 0;
139
140 while (code.length() < this.getMaxCodeLen() &&
141 n < wdsz ) { // max code size of 4 works well
142 final char symb = local.charAt(n);
143 // remove duplicate letters except C
144 if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
145 n++;
146 } else { // not dup
147 switch(symb) {
148 case 'A':
149 case 'E':
150 case 'I':
151 case 'O':
152 case 'U':
153 if (n == 0) {
154 code.append(symb);
155 }
156 break; // only use vowel if leading char
157 case 'B':
158 if ( isPreviousChar(local, n, 'M') &&
159 isLastChar(wdsz, n) ) { // B is silent if word ends in MB
160 break;
161 }
162 code.append(symb);
163 break;
164 case 'C': // lots of C special cases
165 /* discard if SCI, SCE or SCY */
166 if ( isPreviousChar(local, n, 'S') &&
167 !isLastChar(wdsz, n) &&
168 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
169 break;
170 }
171 if (regionMatch(local, n, "CIA")) { // "CIA" -> X
172 code.append('X');
173 break;
174 }
175 if (!isLastChar(wdsz, n) &&
176 FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
177 code.append('S');
178 break; // CI,CE,CY -> S
179 }
180 if (isPreviousChar(local, n, 'S') &&
181 isNextChar(local, n, 'H') ) { // SCH->sk
182 code.append('K');
183 break;
184 }
185 if (isNextChar(local, n, 'H')) { // detect CH
186 if (n == 0 &&
187 wdsz >= 3 &&
188 isVowel(local,2) ) { // CH consonant -> K consonant
189 code.append('K');
190 } else {
191 code.append('X'); // CHvowel -> X
192 }
193 } else {
194 code.append('K');
195 }
196 break;
197 case 'D':
198 if (!isLastChar(wdsz, n + 1) &&
199 isNextChar(local, n, 'G') &&
200 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
201 code.append('J'); n += 2;
202 } else {
203 code.append('T');
204 }
205 break;
206 case 'G': // GH silent at end or before consonant
207 if (isLastChar(wdsz, n + 1) &&
208 isNextChar(local, n, 'H')) {
209 break;
210 }
211 if (!isLastChar(wdsz, n + 1) &&
212 isNextChar(local,n,'H') &&
213 !isVowel(local,n+2)) {
214 break;
215 }
216 if (n > 0 &&
217 ( regionMatch(local, n, "GN") ||
218 regionMatch(local, n, "GNED") ) ) {
219 break; // silent G
220 }
221 if (isPreviousChar(local, n, 'G')) {
222 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
223 hard = true;
224 } else {
225 hard = false;
226 }
227 if (!isLastChar(wdsz, n) &&
228 FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
229 !hard) {
230 code.append('J');
231 } else {
232 code.append('K');
233 }
234 break;
235 case 'H':
236 if (isLastChar(wdsz, n)) {
237 break; // terminal H
238 }
239 if (n > 0 &&
240 VARSON.indexOf(local.charAt(n - 1)) >= 0) {
241 break;
242 }
243 if (isVowel(local,n+1)) {
244 code.append('H'); // Hvowel
245 }
246 break;
247 case 'F':
248 case 'J':
249 case 'L':
250 case 'M':
251 case 'N':
252 case 'R':
253 code.append(symb);
254 break;
255 case 'K':
256 if (n > 0) { // not initial
257 if (!isPreviousChar(local, n, 'C')) {
258 code.append(symb);
259 }
260 } else {
261 code.append(symb); // initial K
262 }
263 break;
264 case 'P':
265 if (isNextChar(local,n,'H')) {
266 // PH -> F
267 code.append('F');
268 } else {
269 code.append(symb);
270 }
271 break;
272 case 'Q':
273 code.append('K');
274 break;
275 case 'S':
276 if (regionMatch(local,n,"SH") ||
277 regionMatch(local,n,"SIO") ||
278 regionMatch(local,n,"SIA")) {
279 code.append('X');
280 } else {
281 code.append('S');
282 }
283 break;
284 case 'T':
285 if (regionMatch(local,n,"TIA") ||
286 regionMatch(local,n,"TIO")) {
287 code.append('X');
288 break;
289 }
290 if (regionMatch(local,n,"TCH")) {
291 // Silent if in "TCH"
292 break;
293 }
294 // substitute numeral 0 for TH (resembles theta after all)
295 if (regionMatch(local,n,"TH")) {
296 code.append('0');
297 } else {
298 code.append('T');
299 }
300 break;
301 case 'V':
302 code.append('F'); break;
303 case 'W':
304 case 'Y': // silent if not followed by vowel
305 if (!isLastChar(wdsz,n) &&
306 isVowel(local,n+1)) {
307 code.append(symb);
308 }
309 break;
310 case 'X':
311 code.append('K');
312 code.append('S');
313 break;
314 case 'Z':
315 code.append('S');
316 break;
317 default:
318 // do nothing
319 break;
320 } // end switch
321 n++;
322 } // end else from symb != 'C'
323 if (code.length() > this.getMaxCodeLen()) {
324 code.setLength(this.getMaxCodeLen());
325 }
326 }
327 return code.toString();
328 }
329
330 private boolean isVowel(final StringBuilder string, final int index) {
331 return VOWELS.indexOf(string.charAt(index)) >= 0;
332 }
333
334 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
335 boolean matches = false;
336 if( index > 0 &&
337 index < string.length() ) {
338 matches = string.charAt(index - 1) == c;
339 }
340 return matches;
341 }
342
343 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
344 boolean matches = false;
345 if( index >= 0 &&
346 index < string.length() - 1 ) {
347 matches = string.charAt(index + 1) == c;
348 }
349 return matches;
350 }
351
352 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
353 boolean matches = false;
354 if( index >= 0 &&
355 index + test.length() - 1 < string.length() ) {
356 final String substring = string.substring( index, index + test.length());
357 matches = substring.equals( test );
358 }
359 return matches;
360 }
361
362 private boolean isLastChar(final int wdsz, final int n) {
363 return n + 1 == wdsz;
364 }
365
366
367 /**
368 * Encodes an Object using the metaphone algorithm. This method
369 * is provided in order to satisfy the requirements of the
370 * Encoder interface, and will throw an EncoderException if the
371 * supplied object is not of type java.lang.String.
372 *
373 * @param obj Object to encode
374 * @return An object (or type java.lang.String) containing the
375 * metaphone code which corresponds to the String supplied.
376 * @throws EncoderException if the parameter supplied is not
377 * of type java.lang.String
378 */
379 @Override
380 public Object encode(final Object obj) throws EncoderException {
381 if (!(obj instanceof String)) {
382 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
383 }
384 return metaphone((String) obj);
385 }
386
387 /**
388 * Encodes a String using the Metaphone algorithm.
389 *
390 * @param str String object to encode
391 * @return The metaphone code corresponding to the String supplied
392 */
393 @Override
394 public String encode(final String str) {
395 return metaphone(str);
396 }
397
398 /**
399 * Tests is the metaphones of two strings are identical.
400 *
401 * @param str1 First of two strings to compare
402 * @param str2 Second of two strings to compare
403 * @return {@code true} if the metaphones of these strings are identical,
404 * {@code false} otherwise.
405 */
406 public boolean isMetaphoneEqual(final String str1, final String str2) {
407 return metaphone(str1).equals(metaphone(str2));
408 }
409
410 /**
411 * Returns the maxCodeLen.
412 * @return int
413 */
414 public int getMaxCodeLen() { return this.maxCodeLen; }
415
416 /**
417 * Sets the maxCodeLen.
418 * @param maxCodeLen The maxCodeLen to set
419 */
420 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
421
422 }