1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a metaphone value.
25 * <p>
26 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
27 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
28 * </p>
29 * <p>
30 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
31 * 39.</CITE>
32 * </p>
33 *
34 * @author Apache Software Foundation
35 * @version $Id: Metaphone.java 582447 2007-10-06 04:14:43Z bayard $
36 */
37 public class Metaphone implements StringEncoder {
38
39 /**
40 * Five values in the English language
41 */
42 private static final String VOWELS = "AEIOU" ;
43
44 /**
45 * Variable used in Metaphone algorithm
46 */
47 private static final String FRONTV = "EIY" ;
48
49 /**
50 * Variable used in Metaphone algorithm
51 */
52 private static final String VARSON = "CSPTG" ;
53
54 /**
55 * The max code length for metaphone is 4
56 */
57 private int maxCodeLen = 4 ;
58
59 /**
60 * Creates an instance of the Metaphone encoder
61 */
62 public Metaphone() {
63 super();
64 }
65
66 /**
67 * Find the metaphone value of a String. This is similar to the
68 * soundex algorithm, but better at finding similar sounding words.
69 * All input is converted to upper case.
70 * Limitations: Input format is expected to be a single ASCII word
71 * with only characters in the A - Z range, no punctuation or numbers.
72 *
73 * @param txt String to find the metaphone code for
74 * @return A metaphone code corresponding to the String supplied
75 */
76 public String metaphone(String txt) {
77 boolean hard = false ;
78 if ((txt == null) || (txt.length() == 0)) {
79 return "" ;
80 }
81 // single character is itself
82 if (txt.length() == 1) {
83 return txt.toUpperCase() ;
84 }
85
86 char[] inwd = txt.toUpperCase().toCharArray() ;
87
88 StringBuffer local = new StringBuffer(40); // manipulate
89 StringBuffer code = new StringBuffer(10) ; // output
90 // handle initial 2 characters exceptions
91 switch(inwd[0]) {
92 case 'K' :
93 case 'G' :
94 case 'P' : /* looking for KN, etc*/
95 if (inwd[1] == 'N') {
96 local.append(inwd, 1, inwd.length - 1);
97 } else {
98 local.append(inwd);
99 }
100 break;
101 case 'A': /* looking for AE */
102 if (inwd[1] == 'E') {
103 local.append(inwd, 1, inwd.length - 1);
104 } else {
105 local.append(inwd);
106 }
107 break;
108 case 'W' : /* looking for WR or WH */
109 if (inwd[1] == 'R') { // WR -> R
110 local.append(inwd, 1, inwd.length - 1);
111 break ;
112 }
113 if (inwd[1] == 'H') {
114 local.append(inwd, 1, inwd.length - 1);
115 local.setCharAt(0, 'W'); // WH -> W
116 } else {
117 local.append(inwd);
118 }
119 break;
120 case 'X' : /* initial X becomes S */
121 inwd[0] = 'S';
122 local.append(inwd);
123 break ;
124 default :
125 local.append(inwd);
126 } // now local has working string with initials fixed
127
128 int wdsz = local.length();
129 int n = 0 ;
130
131 while ((code.length() < this.getMaxCodeLen()) &&
132 (n < wdsz) ) { // max code size of 4 works well
133 char symb = local.charAt(n) ;
134 // remove duplicate letters except C
135 if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
136 n++ ;
137 } else { // not dup
138 switch(symb) {
139 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
140 if (n == 0) {
141 code.append(symb);
142 }
143 break ; // only use vowel if leading char
144 case 'B' :
145 if ( isPreviousChar(local, n, 'M') &&
146 isLastChar(wdsz, n) ) { // B is silent if word ends in MB
147 break;
148 }
149 code.append(symb);
150 break;
151 case 'C' : // lots of C special cases
152 /* discard if SCI, SCE or SCY */
153 if ( isPreviousChar(local, n, 'S') &&
154 !isLastChar(wdsz, n) &&
155 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) {
156 break;
157 }
158 if (regionMatch(local, n, "CIA")) { // "CIA" -> X
159 code.append('X');
160 break;
161 }
162 if (!isLastChar(wdsz, n) &&
163 (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
164 code.append('S');
165 break; // CI,CE,CY -> S
166 }
167 if (isPreviousChar(local, n, 'S') &&
168 isNextChar(local, n, 'H') ) { // SCH->sk
169 code.append('K') ;
170 break ;
171 }
172 if (isNextChar(local, n, 'H')) { // detect CH
173 if ((n == 0) &&
174 (wdsz >= 3) &&
175 isVowel(local,2) ) { // CH consonant -> K consonant
176 code.append('K');
177 } else {
178 code.append('X'); // CHvowel -> X
179 }
180 } else {
181 code.append('K');
182 }
183 break ;
184 case 'D' :
185 if (!isLastChar(wdsz, n + 1) &&
186 isNextChar(local, n, 'G') &&
187 (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
188 code.append('J'); n += 2 ;
189 } else {
190 code.append('T');
191 }
192 break ;
193 case 'G' : // GH silent at end or before consonant
194 if (isLastChar(wdsz, n + 1) &&
195 isNextChar(local, n, 'H')) {
196 break;
197 }
198 if (!isLastChar(wdsz, n + 1) &&
199 isNextChar(local,n,'H') &&
200 !isVowel(local,n+2)) {
201 break;
202 }
203 if ((n > 0) &&
204 ( regionMatch(local, n, "GN") ||
205 regionMatch(local, n, "GNED") ) ) {
206 break; // silent G
207 }
208 if (isPreviousChar(local, n, 'G')) {
209 hard = true ;
210 } else {
211 hard = false ;
212 }
213 if (!isLastChar(wdsz, n) &&
214 (FRONTV.indexOf(local.charAt(n + 1)) >= 0) &&
215 (!hard)) {
216 code.append('J');
217 } else {
218 code.append('K');
219 }
220 break ;
221 case 'H':
222 if (isLastChar(wdsz, n)) {
223 break ; // terminal H
224 }
225 if ((n > 0) &&
226 (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
227 break;
228 }
229 if (isVowel(local,n+1)) {
230 code.append('H'); // Hvowel
231 }
232 break;
233 case 'F':
234 case 'J' :
235 case 'L' :
236 case 'M':
237 case 'N' :
238 case 'R' :
239 code.append(symb);
240 break;
241 case 'K' :
242 if (n > 0) { // not initial
243 if (!isPreviousChar(local, n, 'C')) {
244 code.append(symb);
245 }
246 } else {
247 code.append(symb); // initial K
248 }
249 break ;
250 case 'P' :
251 if (isNextChar(local,n,'H')) {
252 // PH -> F
253 code.append('F');
254 } else {
255 code.append(symb);
256 }
257 break ;
258 case 'Q' :
259 code.append('K');
260 break;
261 case 'S' :
262 if (regionMatch(local,n,"SH") ||
263 regionMatch(local,n,"SIO") ||
264 regionMatch(local,n,"SIA")) {
265 code.append('X');
266 } else {
267 code.append('S');
268 }
269 break;
270 case 'T' :
271 if (regionMatch(local,n,"TIA") ||
272 regionMatch(local,n,"TIO")) {
273 code.append('X');
274 break;
275 }
276 if (regionMatch(local,n,"TCH")) {
277 // Silent if in "TCH"
278 break;
279 }
280 // substitute numeral 0 for TH (resembles theta after all)
281 if (regionMatch(local,n,"TH")) {
282 code.append('0');
283 } else {
284 code.append('T');
285 }
286 break ;
287 case 'V' :
288 code.append('F'); break ;
289 case 'W' : case 'Y' : // silent if not followed by vowel
290 if (!isLastChar(wdsz,n) &&
291 isVowel(local,n+1)) {
292 code.append(symb);
293 }
294 break ;
295 case 'X' :
296 code.append('K'); code.append('S');
297 break ;
298 case 'Z' :
299 code.append('S'); break ;
300 } // end switch
301 n++ ;
302 } // end else from symb != 'C'
303 if (code.length() > this.getMaxCodeLen()) {
304 code.setLength(this.getMaxCodeLen());
305 }
306 }
307 return code.toString();
308 }
309
310 private boolean isVowel(StringBuffer string, int index) {
311 return VOWELS.indexOf(string.charAt(index)) >= 0;
312 }
313
314 private boolean isPreviousChar(StringBuffer string, int index, char c) {
315 boolean matches = false;
316 if( index > 0 &&
317 index < string.length() ) {
318 matches = string.charAt(index - 1) == c;
319 }
320 return matches;
321 }
322
323 private boolean isNextChar(StringBuffer string, int index, char c) {
324 boolean matches = false;
325 if( index >= 0 &&
326 index < string.length() - 1 ) {
327 matches = string.charAt(index + 1) == c;
328 }
329 return matches;
330 }
331
332 private boolean regionMatch(StringBuffer string, int index, String test) {
333 boolean matches = false;
334 if( index >= 0 &&
335 (index + test.length() - 1) < string.length() ) {
336 String substring = string.substring( index, index + test.length());
337 matches = substring.equals( test );
338 }
339 return matches;
340 }
341
342 private boolean isLastChar(int wdsz, int n) {
343 return n + 1 == wdsz;
344 }
345
346
347 /**
348 * Encodes an Object using the metaphone algorithm. This method
349 * is provided in order to satisfy the requirements of the
350 * Encoder interface, and will throw an EncoderException if the
351 * supplied object is not of type java.lang.String.
352 *
353 * @param pObject Object to encode
354 * @return An object (or type java.lang.String) containing the
355 * metaphone code which corresponds to the String supplied.
356 * @throws EncoderException if the parameter supplied is not
357 * of type java.lang.String
358 */
359 public Object encode(Object pObject) throws EncoderException {
360 if (!(pObject instanceof java.lang.String)) {
361 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
362 }
363 return metaphone((String) pObject);
364 }
365
366 /**
367 * Encodes a String using the Metaphone algorithm.
368 *
369 * @param pString String object to encode
370 * @return The metaphone code corresponding to the String supplied
371 */
372 public String encode(String pString) {
373 return metaphone(pString);
374 }
375
376 /**
377 * Tests is the metaphones of two strings are identical.
378 *
379 * @param str1 First of two strings to compare
380 * @param str2 Second of two strings to compare
381 * @return <code>true</code> if the metaphones of these strings are identical,
382 * <code>false</code> otherwise.
383 */
384 public boolean isMetaphoneEqual(String str1, String str2) {
385 return metaphone(str1).equals(metaphone(str2));
386 }
387
388 /**
389 * Returns the maxCodeLen.
390 * @return int
391 */
392 public int getMaxCodeLen() { return this.maxCodeLen; }
393
394 /**
395 * Sets the maxCodeLen.
396 * @param maxCodeLen The maxCodeLen to set
397 */
398 public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
399
400 }