1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.codec.language;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashMap;
23 import java.util.LinkedHashSet;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.Scanner;
27 import java.util.Set;
28 import java.util.regex.Pattern;
29
30 import org.apache.commons.codec.CharEncoding;
31 import org.apache.commons.codec.EncoderException;
32 import org.apache.commons.codec.Resources;
33 import org.apache.commons.codec.StringEncoder;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public class DaitchMokotoffSoundex implements StringEncoder {
71
72
73
74
75 private static final class Branch {
76 private final StringBuilder builder;
77 private String cachedString;
78 private String lastReplacement;
79
80 private Branch() {
81 builder = new StringBuilder();
82 }
83
84
85
86
87
88
89 private Branch createBranch() {
90 final Branch branch = new Branch();
91 branch.builder.append(toString());
92 branch.lastReplacement = this.lastReplacement;
93 return branch;
94 }
95
96 @Override
97 public boolean equals(final Object other) {
98 if (this == other) {
99 return true;
100 }
101 if (!(other instanceof Branch)) {
102 return false;
103 }
104 return toString().equals(((Branch) other).toString());
105 }
106
107
108
109
110 private void finish() {
111 while (builder.length() < MAX_LENGTH) {
112 builder.append('0');
113 cachedString = null;
114 }
115 }
116
117 @Override
118 public int hashCode() {
119 return toString().hashCode();
120 }
121
122
123
124
125
126
127
128
129
130 private void processNextReplacement(final String replacement, final boolean forceAppend) {
131 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
132 if (append && builder.length() < MAX_LENGTH) {
133 builder.append(replacement);
134
135 if (builder.length() > MAX_LENGTH) {
136 builder.delete(MAX_LENGTH, builder.length());
137 }
138 cachedString = null;
139 }
140 lastReplacement = replacement;
141 }
142
143 @Override
144 public String toString() {
145 if (cachedString == null) {
146 cachedString = builder.toString();
147 }
148 return cachedString;
149 }
150 }
151
152
153
154
155 private static final class Rule {
156 private static final Pattern PIPE = Pattern.compile("\\|");
157 private final String pattern;
158 private final String[] replacementAtStart;
159 private final String[] replacementBeforeVowel;
160 private final String[] replacementDefault;
161
162 private Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
163 final String replacementDefault) {
164 this.pattern = pattern;
165 this.replacementAtStart = PIPE.split(replacementAtStart);
166 this.replacementBeforeVowel = PIPE.split(replacementBeforeVowel);
167 this.replacementDefault = PIPE.split(replacementDefault);
168 }
169
170 private int getPatternLength() {
171 return pattern.length();
172 }
173
174 private String[] getReplacements(final String context, final boolean atStart) {
175 if (atStart) {
176 return replacementAtStart;
177 }
178
179 final int nextIndex = getPatternLength();
180 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
181 if (nextCharIsVowel) {
182 return replacementBeforeVowel;
183 }
184
185 return replacementDefault;
186 }
187
188 private boolean isVowel(final char ch) {
189 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
190 }
191
192 private boolean matches(final String context) {
193 return context.startsWith(pattern);
194 }
195
196 @Override
197 public String toString() {
198 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
199 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
200 }
201 }
202
203
204
205
206 private static final char NUL = '\0';
207
208 private static final String COMMENT = "//";
209
210 private static final String DOUBLE_QUOTE = "\"";
211
212 private static final String MULTILINE_COMMENT_END = "*/";
213
214 private static final String MULTILINE_COMMENT_START = "/*";
215
216
217 private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";
218
219
220 private static final int MAX_LENGTH = 6;
221
222
223 private static final Map<Character, List<Rule>> RULES = new HashMap<>();
224
225
226 private static final Map<Character, Character> FOLDINGS = new HashMap<>();
227
228 private static final Pattern EQUAL = Pattern.compile("=");
229
230 private static final Pattern SPACES = Pattern.compile("\\s+");
231
232 static {
233 try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
234 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
235 }
236
237 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
238 }
239
240 private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping,
241 final Map<Character, Character> asciiFoldings) {
242 int currentLine = 0;
243 boolean inMultilineComment = false;
244 while (scanner.hasNextLine()) {
245 currentLine++;
246 final String rawLine = scanner.nextLine();
247 String line = rawLine;
248 if (inMultilineComment) {
249 if (line.endsWith(MULTILINE_COMMENT_END)) {
250 inMultilineComment = false;
251 }
252 continue;
253 }
254 if (line.startsWith(MULTILINE_COMMENT_START)) {
255 inMultilineComment = true;
256 } else {
257
258 final int cmtI = line.indexOf(COMMENT);
259 if (cmtI >= 0) {
260 line = line.substring(0, cmtI);
261 }
262
263 line = line.trim();
264 if (line.isEmpty()) {
265 continue;
266 }
267 if (line.contains("=")) {
268
269 final String[] parts = EQUAL.split(line);
270 if (parts.length != 2) {
271 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location);
272 }
273 final String leftCharacter = parts[0];
274 final String rightCharacter = parts[1];
275 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
276 throw new IllegalArgumentException(
277 "Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location);
278 }
279 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
280 } else {
281
282 final String[] parts = SPACES.split(line);
283 if (parts.length != 4) {
284 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
285 }
286 try {
287 final String pattern = stripQuotes(parts[0]);
288 final String replacement1 = stripQuotes(parts[1]);
289 final String replacement2 = stripQuotes(parts[2]);
290 final String replacement3 = stripQuotes(parts[3]);
291 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
292 final char patternKey = r.pattern.charAt(0);
293 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
294 rules.add(r);
295 } catch (final IllegalArgumentException e) {
296 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
297 }
298 }
299 }
300 }
301 }
302
303 private static String stripQuotes(String str) {
304 if (str.startsWith(DOUBLE_QUOTE)) {
305 str = str.substring(1);
306 }
307 if (str.endsWith(DOUBLE_QUOTE)) {
308 str = str.substring(0, str.length() - 1);
309 }
310 return str;
311 }
312
313
314 private final boolean folding;
315
316
317
318
319 public DaitchMokotoffSoundex() {
320 this(true);
321 }
322
323
324
325
326
327
328
329
330
331
332
333 public DaitchMokotoffSoundex(final boolean folding) {
334 this.folding = folding;
335 }
336
337
338
339
340
341
342
343
344
345
346
347 private String cleanup(final String input) {
348 final StringBuilder sb = new StringBuilder();
349 for (char ch : input.toCharArray()) {
350 if (Character.isWhitespace(ch) || !Character.isLetter(ch)) {
351 continue;
352 }
353 ch = Character.toLowerCase(ch);
354 final Character character = FOLDINGS.get(ch);
355 if (folding && character != null) {
356 ch = character;
357 }
358 sb.append(ch);
359 }
360 return sb.toString();
361 }
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380 @Override
381 public Object encode(final Object obj) throws EncoderException {
382 if (!(obj instanceof String)) {
383 throw new EncoderException(
384 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
385 }
386 return encode((String) obj);
387 }
388
389
390
391
392
393
394
395
396
397
398
399 @Override
400 public String encode(final String source) {
401 if (source == null) {
402 return null;
403 }
404 return soundex(source, false)[0];
405 }
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430 public String soundex(final String source) {
431 return String.join("|", soundex(source, true));
432 }
433
434
435
436
437
438
439
440
441
442
443
444 private String[] soundex(final String source, final boolean branching) {
445 if (source == null) {
446 return null;
447 }
448 final String input = cleanup(source);
449 final Set<Branch> currentBranches = new LinkedHashSet<>();
450 currentBranches.add(new Branch());
451 char lastChar = NUL;
452 for (int index = 0; index < input.length(); index++) {
453 final char ch = input.charAt(index);
454 final String inputContext = input.substring(index);
455 final List<Rule> rules = RULES.get(ch);
456 if (rules == null) {
457 continue;
458 }
459
460 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
461 for (final Rule rule : rules) {
462 if (rule.matches(inputContext)) {
463 if (branching) {
464 nextBranches.clear();
465 }
466 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
467 final boolean branchingRequired = replacements.length > 1 && branching;
468 for (final Branch branch : currentBranches) {
469 for (final String nextReplacement : replacements) {
470
471 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
472
473 final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
474 nextBranch.processNextReplacement(nextReplacement, force);
475 if (!branching) {
476 break;
477 }
478 nextBranches.add(nextBranch);
479 }
480 }
481 if (branching) {
482 currentBranches.clear();
483 currentBranches.addAll(nextBranches);
484 }
485 index += rule.getPatternLength() - 1;
486 break;
487 }
488 }
489 lastChar = ch;
490 }
491 final String[] result = new String[currentBranches.size()];
492 int index = 0;
493 for (final Branch branch : currentBranches) {
494 branch.finish();
495 result[index++] = branch.toString();
496 }
497 return result;
498 }
499 }