1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.util.Locale;
20
21 import org.apache.commons.codec.EncoderException;
22 import org.apache.commons.codec.StringEncoder;
23
24
25
26
27
28
29
30
31
32 public class MatchRatingApproachEncoder implements StringEncoder {
33
34 private static final String SPACE = " ";
35
36 private static final String EMPTY = "";
37
38
39
40
41 private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
42 ELEVEN = 11, TWELVE = 12;
43
44
45
46
47 private static final String PLAIN_ASCII = "AaEeIiOoUu" +
48 "AaEeIiOoUuYy" +
49 "AaEeIiOoUuYy" +
50 "AaOoNn" +
51 "AaEeIiOoUuYy" +
52 "Aa" +
53 "Cc" +
54 "OoUu";
55
56
57
58
59 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
60 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
61 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
62 "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
63 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
64 "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
65
66 private static final String[] DOUBLE_CONSONANT =
67 new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
68 "TT", "VV", "WW", "XX", "YY", "ZZ" };
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83 String cleanName(final String name) {
84 String upperName = name.toUpperCase(Locale.ENGLISH);
85
86 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
87 for (final String str : charsToTrim) {
88 upperName = upperName.replaceAll(str, EMPTY);
89 }
90
91 upperName = removeAccents(upperName);
92 upperName = upperName.replaceAll("\\s+", EMPTY);
93
94 return upperName;
95 }
96
97
98
99
100
101
102
103
104
105
106
107
108 @Override
109 public final Object encode(final Object pObject) throws EncoderException {
110 if (!(pObject instanceof String)) {
111 throw new EncoderException(
112 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
113 }
114 return encode((String) pObject);
115 }
116
117
118
119
120
121
122
123
124 @Override
125 public final String encode(String name) {
126
127 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
128 return EMPTY;
129 }
130
131
132 name = cleanName(name);
133
134
135
136 name = removeVowels(name);
137
138
139 name = removeDoubleConsonants(name);
140
141
142 name = getFirst3Last3(name);
143
144 return name;
145 }
146
147
148
149
150
151
152
153
154
155
156
157
158
159 String getFirst3Last3(final String name) {
160 final int nameLength = name.length();
161
162 if (nameLength > SIX) {
163 final String firstThree = name.substring(0, THREE);
164 final String lastThree = name.substring(nameLength - THREE, nameLength);
165 return firstThree + lastThree;
166 } else {
167 return name;
168 }
169 }
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184 int getMinRating(final int sumLength) {
185 int minRating = 0;
186
187 if (sumLength <= FOUR) {
188 minRating = FIVE;
189 } else if (sumLength >= FIVE && sumLength <= SEVEN) {
190 minRating = FOUR;
191 } else if (sumLength >= EIGHT && sumLength <= ELEVEN) {
192 minRating = THREE;
193 } else if (sumLength == TWELVE) {
194 minRating = TWO;
195 } else {
196 minRating = ONE;
197 }
198
199 return minRating;
200 }
201
202
203
204
205
206
207
208
209
210
211
212 public boolean isEncodeEquals(String name1, String name2) {
213
214 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
215 return false;
216 } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
217 return false;
218 } else if (name1.length() == 1 || name2.length() == 1) {
219 return false;
220 } else if (name1.equalsIgnoreCase(name2)) {
221 return true;
222 }
223
224
225 name1 = cleanName(name1);
226 name2 = cleanName(name2);
227
228
229
230
231 name1 = removeVowels(name1);
232 name2 = removeVowels(name2);
233
234
235 name1 = removeDoubleConsonants(name1);
236 name2 = removeDoubleConsonants(name2);
237
238
239 name1 = getFirst3Last3(name1);
240 name2 = getFirst3Last3(name2);
241
242
243
244 if (Math.abs(name1.length() - name2.length()) >= THREE) {
245 return false;
246 }
247
248
249
250 final int sumLength = Math.abs(name1.length() + name2.length());
251 int minRating = 0;
252 minRating = getMinRating(sumLength);
253
254
255
256 final int count = leftToRightThenRightToLeftProcessing(name1, name2);
257
258
259
260 return count >= minRating;
261
262 }
263
264
265
266
267
268
269
270
271
272
273
274
275
276 boolean isVowel(final String letter) {
277 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
278 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
279 }
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
295 final char[] name1Char = name1.toCharArray();
296 final char[] name2Char = name2.toCharArray();
297
298 final int name1Size = name1.length() - 1;
299 final int name2Size = name2.length() - 1;
300
301 String name1LtRStart = EMPTY;
302 String name1LtREnd = EMPTY;
303
304 String name2RtLStart = EMPTY;
305 String name2RtLEnd = EMPTY;
306
307 for (int i = 0; i < name1Char.length; i++) {
308 if (i > name2Size) {
309 break;
310 }
311
312 name1LtRStart = name1.substring(i, i + 1);
313 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
314
315 name2RtLStart = name2.substring(i, i + 1);
316 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
317
318
319 if (name1LtRStart.equals(name2RtLStart)) {
320 name1Char[i] = ' ';
321 name2Char[i] = ' ';
322 }
323
324
325 if (name1LtREnd.equals(name2RtLEnd)) {
326 name1Char[name1Size - i] = ' ';
327 name2Char[name2Size - i] = ' ';
328 }
329 }
330
331
332 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
333 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
334
335
336 if (strA.length() > strB.length()) {
337 return Math.abs(SIX - strA.length());
338 } else {
339 return Math.abs(SIX - strB.length());
340 }
341 }
342
343
344
345
346
347
348
349
350
351 String removeAccents(final String accentedWord) {
352 if (accentedWord == null) {
353 return null;
354 }
355
356 final StringBuilder sb = new StringBuilder();
357 final int n = accentedWord.length();
358
359 for (int i = 0; i < n; i++) {
360 final char c = accentedWord.charAt(i);
361 final int pos = UNICODE.indexOf(c);
362 if (pos > -1) {
363 sb.append(PLAIN_ASCII.charAt(pos));
364 } else {
365 sb.append(c);
366 }
367 }
368
369 return sb.toString();
370 }
371
372
373
374
375
376
377
378
379
380
381
382
383
384 String removeDoubleConsonants(final String name) {
385 String replacedName = name.toUpperCase();
386 for (final String dc : DOUBLE_CONSONANT) {
387 if (replacedName.contains(dc)) {
388 final String singleLetter = dc.substring(0, 1);
389 replacedName = replacedName.replace(dc, singleLetter);
390 }
391 }
392 return replacedName;
393 }
394
395
396
397
398
399
400
401
402
403
404
405
406
407 String removeVowels(String name) {
408
409 final String firstLetter = name.substring(0, 1);
410
411 name = name.replaceAll("A", EMPTY);
412 name = name.replaceAll("E", EMPTY);
413 name = name.replaceAll("I", EMPTY);
414 name = name.replaceAll("O", EMPTY);
415 name = name.replaceAll("U", EMPTY);
416
417 name = name.replaceAll("\\s{2,}\\b", SPACE);
418
419
420 if (isVowel(firstLetter)) {
421 return firstLetter + name;
422 } else {
423 return name;
424 }
425 }
426 }