View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.LinkedHashSet;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.Scanner;
27  import java.util.Set;
28  import java.util.regex.Pattern;
29  
30  import org.apache.commons.codec.CharEncoding;
31  import org.apache.commons.codec.EncoderException;
32  import org.apache.commons.codec.Resources;
33  import org.apache.commons.codec.StringEncoder;
34  
35  /**
36   * Encodes a string into a Daitch-Mokotoff Soundex value.
37   * <p>
38   * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
39   * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
40   * </p>
41   * <p>
42   * The main differences compared to the other Soundex variants are:
43   * </p>
44   * <ul>
45   * <li>coded names are 6 digits long
46   * <li>the initial character of the name is coded
47   * <li>rules to encoded multi-character n-grams
48   * <li>multiple possible encodings for the same name (branching)
49   * </ul>
50   * <p>
51   * This implementation supports branching, depending on the used method:
52   * <ul>
53   * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
54   * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
55   * </ul>
56   * <p>
57   * Note: This implementation has additional branching rules compared to the original description of the algorithm. The
58   * rules can be customized by overriding the default rules contained in the resource file
59   * {@code org/apache/commons/codec/language/dmrules.txt}.
60   * </p>
61   * <p>
62   * This class is thread-safe.
63   * </p>
64   *
65   * @see Soundex
66   * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
67   * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
68   * @since 1.10
69   */
70  public class DaitchMokotoffSoundex implements StringEncoder {
71  
72      /**
73       * Inner class representing a branch during DM Soundex encoding.
74       */
75      private static final class Branch {
76          private final StringBuilder builder;
77          private String cachedString;
78          private String lastReplacement;
79  
80          private Branch() {
81              builder = new StringBuilder();
82          }
83  
84          /**
85           * Creates a new branch, identical to this branch.
86           *
87           * @return a new, identical branch
88           */
89          private Branch createBranch() {
90              final Branch branch = new Branch();
91              branch.builder.append(toString());
92              branch.lastReplacement = this.lastReplacement;
93              return branch;
94          }
95  
96          @Override
97          public boolean equals(final Object other) {
98              if (this == other) {
99                  return true;
100             }
101             if (!(other instanceof Branch)) {
102                 return false;
103             }
104             return toString().equals(((Branch) other).toString());
105         }
106 
107         /**
108          * Finish this branch by appending '0's until the maximum code length has been reached.
109          */
110         private void finish() {
111             while (builder.length() < MAX_LENGTH) {
112                 builder.append('0');
113                 cachedString = null;
114             }
115         }
116 
117         @Override
118         public int hashCode() {
119             return toString().hashCode();
120         }
121 
122         /**
123          * Process the next replacement to be added to this branch.
124          *
125          * @param replacement
126          *            the next replacement to append.
127          * @param forceAppend
128          *            indicates if the default processing shall be overridden.
129          */
130         private void processNextReplacement(final String replacement, final boolean forceAppend) {
131             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
132             if (append && builder.length() < MAX_LENGTH) {
133                 builder.append(replacement);
134                 // remove all characters after the maximum length
135                 if (builder.length() > MAX_LENGTH) {
136                     builder.delete(MAX_LENGTH, builder.length());
137                 }
138                 cachedString = null;
139             }
140             lastReplacement = replacement;
141         }
142 
143         @Override
144         public String toString() {
145             if (cachedString == null) {
146                 cachedString = builder.toString();
147             }
148             return cachedString;
149         }
150     }
151 
152     /**
153      * Inner class for storing rules.
154      */
155     private static final class Rule {
156         private static final Pattern PIPE = Pattern.compile("\\|");
157         private final String pattern;
158         private final String[] replacementAtStart;
159         private final String[] replacementBeforeVowel;
160         private final String[] replacementDefault;
161 
162         private Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
163                 final String replacementDefault) {
164             this.pattern = pattern;
165             this.replacementAtStart = PIPE.split(replacementAtStart);
166             this.replacementBeforeVowel = PIPE.split(replacementBeforeVowel);
167             this.replacementDefault = PIPE.split(replacementDefault);
168         }
169 
170         private int getPatternLength() {
171             return pattern.length();
172         }
173 
174         private String[] getReplacements(final String context, final boolean atStart) {
175             if (atStart) {
176                 return replacementAtStart;
177             }
178 
179             final int nextIndex = getPatternLength();
180             final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
181             if (nextCharIsVowel) {
182                 return replacementBeforeVowel;
183             }
184 
185             return replacementDefault;
186         }
187 
188         private boolean isVowel(final char ch) {
189             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
190         }
191 
192         private boolean matches(final String context) {
193             return context.startsWith(pattern);
194         }
195 
196         @Override
197         public String toString() {
198             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
199                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
200         }
201     }
202 
203     /**
204      * The NUL character.
205      */
206     private static final char NUL = '\0';
207 
208     private static final String COMMENT = "//";
209 
210     private static final String DOUBLE_QUOTE = "\"";
211 
212     private static final String MULTILINE_COMMENT_END = "*/";
213 
214     private static final String MULTILINE_COMMENT_START = "/*";
215 
216     /** The resource file containing the replacement and folding rules */
217     private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";
218 
219     /** The code length of a DM Soundex value. */
220     private static final int MAX_LENGTH = 6;
221 
222     /** Transformation rules indexed by the first character of their pattern. */
223     private static final Map<Character, List<Rule>> RULES = new HashMap<>();
224 
225     /** Folding rules. */
226     private static final Map<Character, Character> FOLDINGS = new HashMap<>();
227 
228     private static final Pattern EQUAL = Pattern.compile("=");
229 
230     private static final Pattern SPACES = Pattern.compile("\\s+");
231 
232     static {
233         try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
234             parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
235         }
236         // sort RULES by pattern length in descending order
237         RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
238     }
239 
240     private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping,
241             final Map<Character, Character> asciiFoldings) {
242         int currentLine = 0;
243         boolean inMultilineComment = false;
244         while (scanner.hasNextLine()) {
245             currentLine++;
246             final String rawLine = scanner.nextLine();
247             String line = rawLine;
248             if (inMultilineComment) {
249                 if (line.endsWith(MULTILINE_COMMENT_END)) {
250                     inMultilineComment = false;
251                 }
252                 continue;
253             }
254             if (line.startsWith(MULTILINE_COMMENT_START)) {
255                 inMultilineComment = true;
256             } else {
257                 // discard comments
258                 final int cmtI = line.indexOf(COMMENT);
259                 if (cmtI >= 0) {
260                     line = line.substring(0, cmtI);
261                 }
262                 // trim leading-trailing whitespace
263                 line = line.trim();
264                 if (line.isEmpty()) {
265                     continue; // empty lines can be safely skipped
266                 }
267                 if (line.contains("=")) {
268                     // folding
269                     final String[] parts = EQUAL.split(line);
270                     if (parts.length != 2) {
271                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location);
272                     }
273                     final String leftCharacter = parts[0];
274                     final String rightCharacter = parts[1];
275                     if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
276                         throw new IllegalArgumentException(
277                                 "Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location);
278                     }
279                     asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
280                 } else {
281                     // rule
282                     final String[] parts = SPACES.split(line);
283                     if (parts.length != 4) {
284                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
285                     }
286                     try {
287                         final String pattern = stripQuotes(parts[0]);
288                         final String replacement1 = stripQuotes(parts[1]);
289                         final String replacement2 = stripQuotes(parts[2]);
290                         final String replacement3 = stripQuotes(parts[3]);
291                         final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
292                         final char patternKey = r.pattern.charAt(0);
293                         final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
294                         rules.add(r);
295                     } catch (final IllegalArgumentException e) {
296                         throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
297                     }
298                 }
299             }
300         }
301     }
302 
303     private static String stripQuotes(String str) {
304         if (str.startsWith(DOUBLE_QUOTE)) {
305             str = str.substring(1);
306         }
307         if (str.endsWith(DOUBLE_QUOTE)) {
308             str = str.substring(0, str.length() - 1);
309         }
310         return str;
311     }
312 
313     /** Whether to use ASCII folding prior to encoding. */
314     private final boolean folding;
315 
316     /**
317      * Creates a new instance with ASCII-folding enabled.
318      */
319     public DaitchMokotoffSoundex() {
320         this(true);
321     }
322 
323     /**
324      * Creates a new instance.
325      * <p>
326      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example
327      * è -&gt; e.
328      * </p>
329      *
330      * @param folding
331      *            if ASCII-folding shall be performed before encoding
332      */
333     public DaitchMokotoffSoundex(final boolean folding) {
334         this.folding = folding;
335     }
336 
337     /**
338      * Performs a cleanup of the input string before the actual Soundex transformation.
339      * <p>
340      * Removes all whitespace characters and performs ASCII folding if enabled.
341      * </p>
342      *
343      * @param input
344      *            the input string to clean up.
345      * @return a cleaned up string.
346      */
347     private String cleanup(final String input) {
348         final StringBuilder sb = new StringBuilder();
349         for (char ch : input.toCharArray()) {
350             if (Character.isWhitespace(ch) || !Character.isLetter(ch)) {
351                 continue;
352             }
353             ch = Character.toLowerCase(ch);
354             final Character character = FOLDINGS.get(ch);
355             if (folding && character != null) {
356                 ch = character;
357             }
358             sb.append(ch);
359         }
360         return sb.toString();
361     }
362 
363     /**
364      * Encodes an Object using the Daitch-Mokotoff Soundex algorithm without branching.
365      * <p>
366      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
367      * EncoderException if the supplied object is not of type {@link String}.
368      * </p>
369      *
370      * @see #soundex(String)
371      * @param obj
372      *            Object to encode.
373      * @return An object (of type {@link String}) containing the DM Soundex code, which corresponds to the String
374      *         supplied.
375      * @throws EncoderException
376      *             if the parameter supplied is not of type {@link String}.
377      * @throws IllegalArgumentException
378      *             if a character is not mapped.
379      */
380     @Override
381     public Object encode(final Object obj) throws EncoderException {
382         if (!(obj instanceof String)) {
383             throw new EncoderException(
384                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
385         }
386         return encode((String) obj);
387     }
388 
389     /**
390      * Encodes a String using the Daitch-Mokotoff Soundex algorithm without branching.
391      *
392      * @see #soundex(String)
393      * @param source
394      *            A String object to encode.
395      * @return A DM Soundex code corresponding to the String supplied.
396      * @throws IllegalArgumentException
397      *             if a character is not mapped.
398      */
399     @Override
400     public String encode(final String source) {
401         if (source == null) {
402             return null;
403         }
404         return soundex(source, false)[0];
405     }
406 
407     /**
408      * Encodes a String using the Daitch-Mokotoff Soundex algorithm with branching.
409      * <p>
410      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
411      * separated by '|'.
412      * </p>
413      * <p>
414      * Example: the name "AUERBACH" is encoded as both
415      * </p>
416      * <ul>
417      * <li>097400</li>
418      * <li>097500</li>
419      * </ul>
420      * <p>
421      * Thus the result will be "097400|097500".
422      * </p>
423      *
424      * @param source
425      *            A String object to encode.
426      * @return A string containing a set of DM Soundex codes corresponding to the String supplied.
427      * @throws IllegalArgumentException
428      *             if a character is not mapped.
429      */
430     public String soundex(final String source) {
431         return String.join("|", soundex(source, true));
432     }
433 
434     /**
435      * Perform the actual DM Soundex algorithm on the input string.
436      *
437      * @param source
438      *            A String object to encode.
439      * @param branching
440      *            If branching shall be performed.
441      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
442      *         selected branching mode.
443      */
444     private String[] soundex(final String source, final boolean branching) {
445         if (source == null) {
446             return null;
447         }
448         final String input = cleanup(source);
449         final Set<Branch> currentBranches = new LinkedHashSet<>();
450         currentBranches.add(new Branch());
451         char lastChar = NUL;
452         for (int index = 0; index < input.length(); index++) {
453             final char ch = input.charAt(index);
454             final String inputContext = input.substring(index);
455             final List<Rule> rules = RULES.get(ch);
456             if (rules == null) {
457                 continue;
458             }
459             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
460             final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
461             for (final Rule rule : rules) {
462                 if (rule.matches(inputContext)) {
463                     if (branching) {
464                         nextBranches.clear();
465                     }
466                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
467                     final boolean branchingRequired = replacements.length > 1 && branching;
468                     for (final Branch branch : currentBranches) {
469                         for (final String nextReplacement : replacements) {
470                             // if we have multiple replacements, always create a new branch
471                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
472                             // special rule: occurrences of mn or nm are treated differently
473                             final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
474                             nextBranch.processNextReplacement(nextReplacement, force);
475                             if (!branching) {
476                                 break;
477                             }
478                             nextBranches.add(nextBranch);
479                         }
480                     }
481                     if (branching) {
482                         currentBranches.clear();
483                         currentBranches.addAll(nextBranches);
484                     }
485                     index += rule.getPatternLength() - 1;
486                     break;
487                 }
488             }
489             lastChar = ch;
490         }
491         final String[] result = new String[currentBranches.size()];
492         int index = 0;
493         for (final Branch branch : currentBranches) {
494             branch.finish();
495             result[index++] = branch.toString();
496         }
497         return result;
498     }
499 }