1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.io.InputStream;
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.HashMap;
25 import java.util.LinkedHashSet;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Scanner;
29 import java.util.Set;
30
31 import org.apache.commons.codec.CharEncoding;
32 import org.apache.commons.codec.EncoderException;
33 import org.apache.commons.codec.StringEncoder;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72 public class DaitchMokotoffSoundex implements StringEncoder {
73
74
75
76
77 private static final class Branch {
78 private final StringBuilder builder;
79 private String cachedString;
80 private String lastReplacement;
81
82 private Branch() {
83 builder = new StringBuilder();
84 lastReplacement = null;
85 cachedString = null;
86 }
87
88
89
90
91
92
93 public Branch createBranch() {
94 final Branch branch = new Branch();
95 branch.builder.append(toString());
96 branch.lastReplacement = this.lastReplacement;
97 return branch;
98 }
99
100 @Override
101 public boolean equals(final Object other) {
102 if (this == other) {
103 return true;
104 }
105 if (!(other instanceof Branch)) {
106 return false;
107 }
108
109 return toString().equals(((Branch) other).toString());
110 }
111
112
113
114
115 public void finish() {
116 while (builder.length() < MAX_LENGTH) {
117 builder.append('0');
118 cachedString = null;
119 }
120 }
121
122 @Override
123 public int hashCode() {
124 return toString().hashCode();
125 }
126
127
128
129
130
131
132
133
134
135 public void processNextReplacement(final String replacement, final boolean forceAppend) {
136 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
137
138 if (append && builder.length() < MAX_LENGTH) {
139 builder.append(replacement);
140
141 if (builder.length() > MAX_LENGTH) {
142 builder.delete(MAX_LENGTH, builder.length());
143 }
144 cachedString = null;
145 }
146
147 lastReplacement = replacement;
148 }
149
150 @Override
151 public String toString() {
152 if (cachedString == null) {
153 cachedString = builder.toString();
154 }
155 return cachedString;
156 }
157 }
158
159
160
161
162 private static final class Rule {
163 private final String pattern;
164 private final String[] replacementAtStart;
165 private final String[] replacementBeforeVowel;
166 private final String[] replacementDefault;
167
168 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
169 final String replacementDefault) {
170 this.pattern = pattern;
171 this.replacementAtStart = replacementAtStart.split("\\|");
172 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
173 this.replacementDefault = replacementDefault.split("\\|");
174 }
175
176 public int getPatternLength() {
177 return pattern.length();
178 }
179
180 public String[] getReplacements(final String context, final boolean atStart) {
181 if (atStart) {
182 return replacementAtStart;
183 }
184
185 final int nextIndex = getPatternLength();
186 final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
187 if (nextCharIsVowel) {
188 return replacementBeforeVowel;
189 }
190
191 return replacementDefault;
192 }
193
194 private boolean isVowel(final char ch) {
195 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
196 }
197
198 public boolean matches(final String context) {
199 return context.startsWith(pattern);
200 }
201
202 @Override
203 public String toString() {
204 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
205 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
206 }
207 }
208
209 private static final String COMMENT = "//";
210 private static final String DOUBLE_QUOTE = "\"";
211
212 private static final String MULTILINE_COMMENT_END = "*/";
213
214 private static final String MULTILINE_COMMENT_START = "/*";
215
216
217 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
218
219
220 private static final int MAX_LENGTH = 6;
221
222
223 private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
224
225
226 private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
227
228 static {
229 final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
230 if (rulesIS == null) {
231 throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
232 }
233
234 final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
235 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
236 scanner.close();
237
238
239 for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
240 final List<Rule> ruleList = rule.getValue();
241 Collections.sort(ruleList, new Comparator<Rule>() {
242 @Override
243 public int compare(final Rule rule1, final Rule rule2) {
244 return rule2.getPatternLength() - rule1.getPatternLength();
245 }
246 });
247 }
248 }
249
250 private static void parseRules(final Scanner scanner, final String location,
251 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
252 int currentLine = 0;
253 boolean inMultilineComment = false;
254
255 while (scanner.hasNextLine()) {
256 currentLine++;
257 final String rawLine = scanner.nextLine();
258 String line = rawLine;
259
260 if (inMultilineComment) {
261 if (line.endsWith(MULTILINE_COMMENT_END)) {
262 inMultilineComment = false;
263 }
264 continue;
265 }
266
267 if (line.startsWith(MULTILINE_COMMENT_START)) {
268 inMultilineComment = true;
269 } else {
270
271 final int cmtI = line.indexOf(COMMENT);
272 if (cmtI >= 0) {
273 line = line.substring(0, cmtI);
274 }
275
276
277 line = line.trim();
278
279 if (line.length() == 0) {
280 continue;
281 }
282
283 if (line.contains("=")) {
284
285 final String[] parts = line.split("=");
286 if (parts.length != 2) {
287 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
288 " parts: " + rawLine + " in " + location);
289 } else {
290 final String leftCharacter = parts[0];
291 final String rightCharacter = parts[1];
292
293 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
294 throw new IllegalArgumentException("Malformed folding statement - " +
295 "patterns are not single characters: " + rawLine + " in " + location);
296 }
297
298 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
299 }
300 } else {
301
302 final String[] parts = line.split("\\s+");
303 if (parts.length != 4) {
304 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
305 " parts: " + rawLine + " in " + location);
306 } else {
307 try {
308 final String pattern = stripQuotes(parts[0]);
309 final String replacement1 = stripQuotes(parts[1]);
310 final String replacement2 = stripQuotes(parts[2]);
311 final String replacement3 = stripQuotes(parts[3]);
312
313 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
314 final char patternKey = r.pattern.charAt(0);
315 List<Rule> rules = ruleMapping.get(patternKey);
316 if (rules == null) {
317 rules = new ArrayList<Rule>();
318 ruleMapping.put(patternKey, rules);
319 }
320 rules.add(r);
321 } catch (final IllegalArgumentException e) {
322 throw new IllegalStateException(
323 "Problem parsing line '" + currentLine + "' in " + location, e);
324 }
325 }
326 }
327 }
328 }
329 }
330
331 private static String stripQuotes(String str) {
332 if (str.startsWith(DOUBLE_QUOTE)) {
333 str = str.substring(1);
334 }
335
336 if (str.endsWith(DOUBLE_QUOTE)) {
337 str = str.substring(0, str.length() - 1);
338 }
339
340 return str;
341 }
342
343
344 private final boolean folding;
345
346
347
348
349 public DaitchMokotoffSoundex() {
350 this(true);
351 }
352
353
354
355
356
357
358
359
360
361
362
363 public DaitchMokotoffSoundex(final boolean folding) {
364 this.folding = folding;
365 }
366
367
368
369
370
371
372
373
374
375
376
377 private String cleanup(final String input) {
378 final StringBuilder sb = new StringBuilder();
379 for (char ch : input.toCharArray()) {
380 if (Character.isWhitespace(ch)) {
381 continue;
382 }
383
384 ch = Character.toLowerCase(ch);
385 if (folding && FOLDINGS.containsKey(ch)) {
386 ch = FOLDINGS.get(ch);
387 }
388 sb.append(ch);
389 }
390 return sb.toString();
391 }
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411 @Override
412 public Object encode(final Object obj) throws EncoderException {
413 if (!(obj instanceof String)) {
414 throw new EncoderException(
415 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
416 }
417 return encode((String) obj);
418 }
419
420
421
422
423
424
425
426
427
428
429
430
431 @Override
432 public String encode(final String source) {
433 if (source == null) {
434 return null;
435 }
436 return soundex(source, false)[0];
437 }
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462 public String soundex(final String source) {
463 final String[] branches = soundex(source, true);
464 final StringBuilder sb = new StringBuilder();
465 int index = 0;
466 for (final String branch : branches) {
467 sb.append(branch);
468 if (++index < branches.length) {
469 sb.append('|');
470 }
471 }
472 return sb.toString();
473 }
474
475
476
477
478
479
480
481
482
483
484
485 private String[] soundex(final String source, final boolean branching) {
486 if (source == null) {
487 return null;
488 }
489
490 final String input = cleanup(source);
491
492 final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
493 currentBranches.add(new Branch());
494
495 char lastChar = '\0';
496 for (int index = 0; index < input.length(); index++) {
497 final char ch = input.charAt(index);
498
499
500 if (Character.isWhitespace(ch)) {
501 continue;
502 }
503
504 final String inputContext = input.substring(index);
505 final List<Rule> rules = RULES.get(ch);
506 if (rules == null) {
507 continue;
508 }
509
510
511 @SuppressWarnings("unchecked")
512 final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
513
514 for (final Rule rule : rules) {
515 if (rule.matches(inputContext)) {
516 if (branching) {
517 nextBranches.clear();
518 }
519 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
520 final boolean branchingRequired = replacements.length > 1 && branching;
521
522 for (final Branch branch : currentBranches) {
523 for (final String nextReplacement : replacements) {
524
525 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
526
527
528 final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
529
530 nextBranch.processNextReplacement(nextReplacement, force);
531
532 if (branching) {
533 nextBranches.add(nextBranch);
534 } else {
535 break;
536 }
537 }
538 }
539
540 if (branching) {
541 currentBranches.clear();
542 currentBranches.addAll(nextBranches);
543 }
544 index += rule.getPatternLength() - 1;
545 break;
546 }
547 }
548
549 lastChar = ch;
550 }
551
552 final String[] result = new String[currentBranches.size()];
553 int index = 0;
554 for (final Branch branch : currentBranches) {
555 branch.finish();
556 result[index++] = branch.toString();
557 }
558
559 return result;
560 }
561 }