1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.HashMap;
25 import java.util.LinkedHashSet;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Scanner;
29 import java.util.Set;
30
31 import org.apache.commons.codec.CharEncoding;
32 import org.apache.commons.codec.EncoderException;
33 import org.apache.commons.codec.StringEncoder;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72 public class DaitchMokotoffSoundex implements StringEncoder {
73
74
75
76
77 private static final class Branch {
78 private final StringBuilder builder;
79 private String cachedString;
80 private String lastReplacement;
81
82 private Branch() {
83 builder = new StringBuilder();
84 lastReplacement = null;
85 cachedString = null;
86 }
87
88
89
90
91
92
93 public Branch createBranch() {
94 final Branch branch = new Branch();
95 branch.builder.append(toString());
96 branch.lastReplacement = this.lastReplacement;
97 return branch;
98 }
99
100 @Override
101 public boolean equals(final Object other) {
102 if (this == other) {
103 return true;
104 }
105 if (!(other instanceof Branch)) {
106 return false;
107 }
108
109 return toString().equals(((Branch) other).toString());
110 }
111
112
113
114
115 public void finish() {
116 while (builder.length() < MAX_LENGTH) {
117 builder.append('0');
118 cachedString = null;
119 }
120 }
121
122 @Override
123 public int hashCode() {
124 return toString().hashCode();
125 }
126
127
128
129
130
131
132
133
134
135 public void processNextReplacement(final String replacement, final boolean forceAppend) {
136 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
137
138 if (append && builder.length() < MAX_LENGTH) {
139 builder.append(replacement);
140
141 if (builder.length() > MAX_LENGTH) {
142 builder.delete(MAX_LENGTH, builder.length());
143 }
144 cachedString = null;
145 }
146
147 lastReplacement = replacement;
148 }
149
150 @Override
151 public String toString() {
152 if (cachedString == null) {
153 cachedString = builder.toString();
154 }
155 return cachedString;
156 }
157 }
158
159
160
161
162 private static final class Rule {
163 private final String pattern;
164 private final String[] replacementAtStart;
165 private final String[] replacementBeforeVowel;
166 private final String[] replacementDefault;
167
168 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
169 final String replacementDefault) {
170 this.pattern = pattern;
171 this.replacementAtStart = replacementAtStart.split("\\|");
172 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
173 this.replacementDefault = replacementDefault.split("\\|");
174 }
175
176 public int getPatternLength() {
177 return pattern.length();
178 }
179
180 public String[] getReplacements(final String context, final boolean atStart) {
181 if (atStart) {
182 return replacementAtStart;
183 }
184
185 final int nextIndex = getPatternLength();
186 final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
187 if (nextCharIsVowel) {
188 return replacementBeforeVowel;
189 }
190
191 return replacementDefault;
192 }
193
194 private boolean isVowel(final char ch) {
195 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
196 }
197
198 public boolean matches(final String context) {
199 return context.startsWith(pattern);
200 }
201
202 @Override
203 public String toString() {
204 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
205 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
206 }
207 }
208
209 private static final String COMMENT = "//";
210 private static final String DOUBLE_QUOTE = "\"";
211
212 private static final String MULTILINE_COMMENT_END = "*/";
213
214 private static final String MULTILINE_COMMENT_START = "/*";
215
216
217 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
218
219
220 private static final int MAX_LENGTH = 6;
221
222
223 private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
224
225
226 private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
227
228 static {
229 final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
230 if (rulesIS == null) {
231 throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
232 }
233
234 final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
235 try {
236 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
237 } finally {
238 scanner.close();
239 }
240
241
242 for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
243 final List<Rule> ruleList = rule.getValue();
244 Collections.sort(ruleList, new Comparator<Rule>() {
245 @Override
246 public int compare(final Rule rule1, final Rule rule2) {
247 return rule2.getPatternLength() - rule1.getPatternLength();
248 }
249 });
250 }
251 }
252
253 private static void parseRules(final Scanner scanner, final String location,
254 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
255 int currentLine = 0;
256 boolean inMultilineComment = false;
257
258 while (scanner.hasNextLine()) {
259 currentLine++;
260 final String rawLine = scanner.nextLine();
261 String line = rawLine;
262
263 if (inMultilineComment) {
264 if (line.endsWith(MULTILINE_COMMENT_END)) {
265 inMultilineComment = false;
266 }
267 continue;
268 }
269
270 if (line.startsWith(MULTILINE_COMMENT_START)) {
271 inMultilineComment = true;
272 } else {
273
274 final int cmtI = line.indexOf(COMMENT);
275 if (cmtI >= 0) {
276 line = line.substring(0, cmtI);
277 }
278
279
280 line = line.trim();
281
282 if (line.length() == 0) {
283 continue;
284 }
285
286 if (line.contains("=")) {
287
288 final String[] parts = line.split("=");
289 if (parts.length != 2) {
290 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
291 " parts: " + rawLine + " in " + location);
292 }
293 final String leftCharacter = parts[0];
294 final String rightCharacter = parts[1];
295
296 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
297 throw new IllegalArgumentException("Malformed folding statement - " +
298 "patterns are not single characters: " + rawLine + " in " + location);
299 }
300
301 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
302 } else {
303
304 final String[] parts = line.split("\\s+");
305 if (parts.length != 4) {
306 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
307 " parts: " + rawLine + " in " + location);
308 }
309 try {
310 final String pattern = stripQuotes(parts[0]);
311 final String replacement1 = stripQuotes(parts[1]);
312 final String replacement2 = stripQuotes(parts[2]);
313 final String replacement3 = stripQuotes(parts[3]);
314
315 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
316 final char patternKey = r.pattern.charAt(0);
317 List<Rule> rules = ruleMapping.get(patternKey);
318 if (rules == null) {
319 rules = new ArrayList<Rule>();
320 ruleMapping.put(patternKey, rules);
321 }
322 rules.add(r);
323 } catch (final IllegalArgumentException e) {
324 throw new IllegalStateException(
325 "Problem parsing line '" + currentLine + "' in " + location, e);
326 }
327 }
328 }
329 }
330 }
331
332 private static String stripQuotes(String str) {
333 if (str.startsWith(DOUBLE_QUOTE)) {
334 str = str.substring(1);
335 }
336
337 if (str.endsWith(DOUBLE_QUOTE)) {
338 str = str.substring(0, str.length() - 1);
339 }
340
341 return str;
342 }
343
344
345 private final boolean folding;
346
347
348
349
350 public DaitchMokotoffSoundex() {
351 this(true);
352 }
353
354
355
356
357
358
359
360
361
362
363
364 public DaitchMokotoffSoundex(final boolean folding) {
365 this.folding = folding;
366 }
367
368
369
370
371
372
373
374
375
376
377
378 private String cleanup(final String input) {
379 final StringBuilder sb = new StringBuilder();
380 for (char ch : input.toCharArray()) {
381 if (Character.isWhitespace(ch)) {
382 continue;
383 }
384
385 ch = Character.toLowerCase(ch);
386 if (folding && FOLDINGS.containsKey(ch)) {
387 ch = FOLDINGS.get(ch);
388 }
389 sb.append(ch);
390 }
391 return sb.toString();
392 }
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412 @Override
413 public Object encode(final Object obj) throws EncoderException {
414 if (!(obj instanceof String)) {
415 throw new EncoderException(
416 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
417 }
418 return encode((String) obj);
419 }
420
421
422
423
424
425
426
427
428
429
430
431
432 @Override
433 public String encode(final String source) {
434 if (source == null) {
435 return null;
436 }
437 return soundex(source, false)[0];
438 }
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463 public String soundex(final String source) {
464 final String[] branches = soundex(source, true);
465 final StringBuilder sb = new StringBuilder();
466 int index = 0;
467 for (final String branch : branches) {
468 sb.append(branch);
469 if (++index < branches.length) {
470 sb.append('|');
471 }
472 }
473 return sb.toString();
474 }
475
476
477
478
479
480
481
482
483
484
485
486 private String[] soundex(final String source, final boolean branching) {
487 if (source == null) {
488 return null;
489 }
490
491 final String input = cleanup(source);
492
493 final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
494 currentBranches.add(new Branch());
495
496 char lastChar = '\0';
497 for (int index = 0; index < input.length(); index++) {
498 final char ch = input.charAt(index);
499
500
501 if (Character.isWhitespace(ch)) {
502 continue;
503 }
504
505 final String inputContext = input.substring(index);
506 final List<Rule> rules = RULES.get(ch);
507 if (rules == null) {
508 continue;
509 }
510
511
512 @SuppressWarnings("unchecked")
513 final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
514
515 for (final Rule rule : rules) {
516 if (rule.matches(inputContext)) {
517 if (branching) {
518 nextBranches.clear();
519 }
520 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
521 final boolean branchingRequired = replacements.length > 1 && branching;
522
523 for (final Branch branch : currentBranches) {
524 for (final String nextReplacement : replacements) {
525
526 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
527
528
529 final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
530
531 nextBranch.processNextReplacement(nextReplacement, force);
532
533 if (branching) {
534 nextBranches.add(nextBranch);
535 } else {
536 break;
537 }
538 }
539 }
540
541 if (branching) {
542 currentBranches.clear();
543 currentBranches.addAll(nextBranches);
544 }
545 index += rule.getPatternLength() - 1;
546 break;
547 }
548 }
549
550 lastChar = ch;
551 }
552
553 final String[] result = new String[currentBranches.size()];
554 int index = 0;
555 for (final Branch branch : currentBranches) {
556 branch.finish();
557 result[index++] = branch.toString();
558 }
559
560 return result;
561 }
562 }