View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.LinkedHashSet;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.Scanner;
27  import java.util.Set;
28  
29  import org.apache.commons.codec.CharEncoding;
30  import org.apache.commons.codec.EncoderException;
31  import org.apache.commons.codec.Resources;
32  import org.apache.commons.codec.StringEncoder;
33  
34  /**
35   * Encodes a string into a Daitch-Mokotoff Soundex value.
36   * <p>
37   * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
38   * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
39   * </p>
40   * <p>
41   * The main differences compared to the other soundex variants are:
42   * </p>
43   * <ul>
44   * <li>coded names are 6 digits long
45   * <li>the initial character of the name is coded
46   * <li>rules to encoded multi-character n-grams
47   * <li>multiple possible encodings for the same name (branching)
48   * </ul>
49   * <p>
50   * This implementation supports branching, depending on the used method:
51   * <ul>
52   * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
53   * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
54   * </ul>
55   * <p>
56   * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
57   * rules can be customized by overriding the default rules contained in the resource file
58   * {@code org/apache/commons/codec/language/dmrules.txt}.
59   * </p>
60   * <p>
61   * This class is thread-safe.
62   * </p>
63   *
64   * @see Soundex
65   * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
66   * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
67   *
68   * @since 1.10
69   */
70  public class DaitchMokotoffSoundex implements StringEncoder {
71  
72      /**
73       * Inner class representing a branch during DM soundex encoding.
74       */
75      private static final class Branch {
76          private final StringBuilder builder;
77          private String cachedString;
78          private String lastReplacement;
79  
80          private Branch() {
81              builder = new StringBuilder();
82              lastReplacement = null;
83              cachedString = null;
84          }
85  
86          /**
87           * Creates a new branch, identical to this branch.
88           *
89           * @return a new, identical branch
90           */
91          public Branch createBranch() {
92              final Branch branch = new Branch();
93              branch.builder.append(toString());
94              branch.lastReplacement = this.lastReplacement;
95              return branch;
96          }
97  
98          @Override
99          public boolean equals(final Object other) {
100             if (this == other) {
101                 return true;
102             }
103             if (!(other instanceof Branch)) {
104                 return false;
105             }
106 
107             return toString().equals(((Branch) other).toString());
108         }
109 
110         /**
111          * Finish this branch by appending '0's until the maximum code length has been reached.
112          */
113         public void finish() {
114             while (builder.length() < MAX_LENGTH) {
115                 builder.append('0');
116                 cachedString = null;
117             }
118         }
119 
120         @Override
121         public int hashCode() {
122             return toString().hashCode();
123         }
124 
125         /**
126          * Process the next replacement to be added to this branch.
127          *
128          * @param replacement
129          *            the next replacement to append
130          * @param forceAppend
131          *            indicates if the default processing shall be overridden
132          */
133         public void processNextReplacement(final String replacement, final boolean forceAppend) {
134             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
135 
136             if (append && builder.length() < MAX_LENGTH) {
137                 builder.append(replacement);
138                 // remove all characters after the maximum length
139                 if (builder.length() > MAX_LENGTH) {
140                     builder.delete(MAX_LENGTH, builder.length());
141                 }
142                 cachedString = null;
143             }
144 
145             lastReplacement = replacement;
146         }
147 
148         @Override
149         public String toString() {
150             if (cachedString == null) {
151                 cachedString = builder.toString();
152             }
153             return cachedString;
154         }
155     }
156 
157     /**
158      * Inner class for storing rules.
159      */
160     private static final class Rule {
161         private final String pattern;
162         private final String[] replacementAtStart;
163         private final String[] replacementBeforeVowel;
164         private final String[] replacementDefault;
165 
166         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
167                 final String replacementDefault) {
168             this.pattern = pattern;
169             this.replacementAtStart = replacementAtStart.split("\\|");
170             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
171             this.replacementDefault = replacementDefault.split("\\|");
172         }
173 
174         public int getPatternLength() {
175             return pattern.length();
176         }
177 
178         public String[] getReplacements(final String context, final boolean atStart) {
179             if (atStart) {
180                 return replacementAtStart;
181             }
182 
183             final int nextIndex = getPatternLength();
184             final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
185             if (nextCharIsVowel) {
186                 return replacementBeforeVowel;
187             }
188 
189             return replacementDefault;
190         }
191 
192         private boolean isVowel(final char ch) {
193             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
194         }
195 
196         public boolean matches(final String context) {
197             return context.startsWith(pattern);
198         }
199 
200         @Override
201         public String toString() {
202             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
203                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
204         }
205     }
206 
207     private static final String COMMENT = "//";
208 
209     private static final String DOUBLE_QUOTE = "\"";
210 
211     private static final String MULTILINE_COMMENT_END = "*/";
212 
213     private static final String MULTILINE_COMMENT_START = "/*";
214 
215     /** The resource file containing the replacement and folding rules */
216     private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
217 
218     /** The code length of a DM soundex value. */
219     private static final int MAX_LENGTH = 6;
220 
221     /** Transformation rules indexed by the first character of their pattern. */
222     private static final Map<Character, List<Rule>> RULES = new HashMap<>();
223 
224     /** Folding rules. */
225     private static final Map<Character, Character> FOLDINGS = new HashMap<>();
226 
227     static {
228         try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
229             parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
230         }
231 
232         // sort RULES by pattern length in descending order
233         RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
234     }
235 
236     private static void parseRules(final Scanner scanner, final String location,
237             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
238         int currentLine = 0;
239         boolean inMultilineComment = false;
240 
241         while (scanner.hasNextLine()) {
242             currentLine++;
243             final String rawLine = scanner.nextLine();
244             String line = rawLine;
245 
246             if (inMultilineComment) {
247                 if (line.endsWith(MULTILINE_COMMENT_END)) {
248                     inMultilineComment = false;
249                 }
250                 continue;
251             }
252 
253             if (line.startsWith(MULTILINE_COMMENT_START)) {
254                 inMultilineComment = true;
255             } else {
256                 // discard comments
257                 final int cmtI = line.indexOf(COMMENT);
258                 if (cmtI >= 0) {
259                     line = line.substring(0, cmtI);
260                 }
261 
262                 // trim leading-trailing whitespace
263                 line = line.trim();
264 
265                 if (line.isEmpty()) {
266                     continue; // empty lines can be safely skipped
267                 }
268 
269                 if (line.contains("=")) {
270                     // folding
271                     final String[] parts = line.split("=");
272                     if (parts.length != 2) {
273                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
274                                 " parts: " + rawLine + " in " + location);
275                     }
276                     final String leftCharacter = parts[0];
277                     final String rightCharacter = parts[1];
278 
279                     if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
280                         throw new IllegalArgumentException("Malformed folding statement - " +
281                                 "patterns are not single characters: " + rawLine + " in " + location);
282                     }
283 
284                     asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
285                 } else {
286                     // rule
287                     final String[] parts = line.split("\\s+");
288                     if (parts.length != 4) {
289                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
290                                 " parts: " + rawLine + " in " + location);
291                     }
292                     try {
293                         final String pattern = stripQuotes(parts[0]);
294                         final String replacement1 = stripQuotes(parts[1]);
295                         final String replacement2 = stripQuotes(parts[2]);
296                         final String replacement3 = stripQuotes(parts[3]);
297 
298                         final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
299                         final char patternKey = r.pattern.charAt(0);
300                         final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
301                         rules.add(r);
302                     } catch (final IllegalArgumentException e) {
303                         throw new IllegalStateException(
304                                 "Problem parsing line '" + currentLine + "' in " + location, e);
305                     }
306                 }
307             }
308         }
309     }
310 
311     private static String stripQuotes(String str) {
312         if (str.startsWith(DOUBLE_QUOTE)) {
313             str = str.substring(1);
314         }
315 
316         if (str.endsWith(DOUBLE_QUOTE)) {
317             str = str.substring(0, str.length() - 1);
318         }
319 
320         return str;
321     }
322 
323     /** Whether to use ASCII folding prior to encoding. */
324     private final boolean folding;
325 
326     /**
327      * Creates a new instance with ASCII-folding enabled.
328      */
329     public DaitchMokotoffSoundex() {
330         this(true);
331     }
332 
333     /**
334      * Creates a new instance.
335      * <p>
336      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
337      * รจ -&gt; e.
338      * </p>
339      *
340      * @param folding
341      *            if ASCII-folding shall be performed before encoding
342      */
343     public DaitchMokotoffSoundex(final boolean folding) {
344         this.folding = folding;
345     }
346 
347     /**
348      * Performs a cleanup of the input string before the actual soundex transformation.
349      * <p>
350      * Removes all whitespace characters and performs ASCII folding if enabled.
351      * </p>
352      *
353      * @param input
354      *            the input string to clean up
355      * @return a cleaned up string
356      */
357     private String cleanup(final String input) {
358         final StringBuilder sb = new StringBuilder();
359         for (char ch : input.toCharArray()) {
360             if (Character.isWhitespace(ch)) {
361                 continue;
362             }
363 
364             ch = Character.toLowerCase(ch);
365             final Character character = FOLDINGS.get(ch);
366             if (folding && character != null) {
367                 ch = character;
368             }
369             sb.append(ch);
370         }
371         return sb.toString();
372     }
373 
374     /**
375      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
376      * <p>
377      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
378      * EncoderException if the supplied object is not of type {@link String}.
379      * </p>
380      *
381      * @see #soundex(String)
382      *
383      * @param obj
384      *            Object to encode
385      * @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String
386      *         supplied.
387      * @throws EncoderException
388      *             if the parameter supplied is not of type {@link String}
389      * @throws IllegalArgumentException
390      *             if a character is not mapped
391      */
392     @Override
393     public Object encode(final Object obj) throws EncoderException {
394         if (!(obj instanceof String)) {
395             throw new EncoderException(
396                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
397         }
398         return encode((String) obj);
399     }
400 
401     /**
402      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
403      *
404      * @see #soundex(String)
405      *
406      * @param source
407      *            A String object to encode
408      * @return A DM Soundex code corresponding to the String supplied
409      * @throws IllegalArgumentException
410      *             if a character is not mapped
411      */
412     @Override
413     public String encode(final String source) {
414         if (source == null) {
415             return null;
416         }
417         return soundex(source, false)[0];
418     }
419 
420     /**
421      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
422      * <p>
423      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
424      * separated by '|'.
425      * </p>
426      * <p>
427      * Example: the name "AUERBACH" is encoded as both
428      * </p>
429      * <ul>
430      * <li>097400</li>
431      * <li>097500</li>
432      * </ul>
433      * <p>
434      * Thus the result will be "097400|097500".
435      * </p>
436      *
437      * @param source
438      *            A String object to encode
439      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
440      * @throws IllegalArgumentException
441      *             if a character is not mapped
442      */
443     public String soundex(final String source) {
444         final String[] branches = soundex(source, true);
445         final StringBuilder sb = new StringBuilder();
446         int index = 0;
447         for (final String branch : branches) {
448             sb.append(branch);
449             if (++index < branches.length) {
450                 sb.append('|');
451             }
452         }
453         return sb.toString();
454     }
455 
456     /**
457      * Perform the actual DM Soundex algorithm on the input string.
458      *
459      * @param source
460      *            A String object to encode
461      * @param branching
462      *            If branching shall be performed
463      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
464      *         selected branching mode
465      */
466     private String[] soundex(final String source, final boolean branching) {
467         if (source == null) {
468             return null;
469         }
470 
471         final String input = cleanup(source);
472 
473         final Set<Branch> currentBranches = new LinkedHashSet<>();
474         currentBranches.add(new Branch());
475 
476         char lastChar = '\0';
477         for (int index = 0; index < input.length(); index++) {
478             final char ch = input.charAt(index);
479 
480             // ignore whitespace inside a name
481             if (Character.isWhitespace(ch)) {
482                 continue;
483             }
484 
485             final String inputContext = input.substring(index);
486             final List<Rule> rules = RULES.get(ch);
487             if (rules == null) {
488                 continue;
489             }
490 
491             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
492             final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
493 
494             for (final Rule rule : rules) {
495                 if (rule.matches(inputContext)) {
496                     if (branching) {
497                         nextBranches.clear();
498                     }
499                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
500                     final boolean branchingRequired = replacements.length > 1 && branching;
501 
502                     for (final Branch branch : currentBranches) {
503                         for (final String nextReplacement : replacements) {
504                             // if we have multiple replacements, always create a new branch
505                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
506 
507                             // special rule: occurrences of mn or nm are treated differently
508                             final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
509 
510                             nextBranch.processNextReplacement(nextReplacement, force);
511 
512                             if (!branching) {
513                                 break;
514                             }
515                             nextBranches.add(nextBranch);
516                         }
517                     }
518 
519                     if (branching) {
520                         currentBranches.clear();
521                         currentBranches.addAll(nextBranches);
522                     }
523                     index += rule.getPatternLength() - 1;
524                     break;
525                 }
526             }
527 
528             lastChar = ch;
529         }
530 
531         final String[] result = new String[currentBranches.size()];
532         int index = 0;
533         for (final Branch branch : currentBranches) {
534             branch.finish();
535             result[index++] = branch.toString();
536         }
537 
538         return result;
539     }
540 }