View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.codec.language;
18  
19  import java.io.InputStream;
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Collections;
23  import java.util.Comparator;
24  import java.util.HashMap;
25  import java.util.LinkedHashSet;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Scanner;
29  import java.util.Set;
30  
31  import org.apache.commons.codec.CharEncoding;
32  import org.apache.commons.codec.EncoderException;
33  import org.apache.commons.codec.StringEncoder;
34  
35  /**
36   * Encodes a string into a Daitch-Mokotoff Soundex value.
37   * <p>
38   * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
39   * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
40   * </p>
41   * <p>
42   * The main differences compared to the other soundex variants are:
43   * </p>
44   * <ul>
45   * <li>coded names are 6 digits long
46   * <li>the initial character of the name is coded
47   * <li>rules to encoded multi-character n-grams
48   * <li>multiple possible encodings for the same name (branching)
49   * </ul>
50   * <p>
51   * This implementation supports branching, depending on the used method:
52   * <ul>
53   * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
54   * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
55   * </ul>
56   * <p>
57   * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
58   * rules can be customized by overriding the default rules contained in the resource file
59   * {@code org/apache/commons/codec/language/dmrules.txt}.
60   * </p>
61   * <p>
62   * This class is thread-safe.
63   * </p>
64   *
65   * @see Soundex
66   * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
67   * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
68   *
69   * @version $Id: DaitchMokotoffSoundex.html 928559 2014-11-10 02:53:54Z ggregory $
70   * @since 1.10
71   */
72  public class DaitchMokotoffSoundex implements StringEncoder {
73  
74      /**
75       * Inner class representing a branch during DM soundex encoding.
76       */
77      private static final class Branch {
78          private final StringBuilder builder;
79          private String cachedString;
80          private String lastReplacement;
81  
82          private Branch() {
83              builder = new StringBuilder();
84              lastReplacement = null;
85              cachedString = null;
86          }
87  
88          /**
89           * Creates a new branch, identical to this branch.
90           *
91           * @return a new, identical branch
92           */
93          public Branch createBranch() {
94              final Branch branch = new Branch();
95              branch.builder.append(toString());
96              branch.lastReplacement = this.lastReplacement;
97              return branch;
98          }
99  
100         @Override
101         public boolean equals(final Object other) {
102             if (this == other) {
103                 return true;
104             }
105             if (!(other instanceof Branch)) {
106                 return false;
107             }
108 
109             return toString().equals(((Branch) other).toString());
110         }
111 
112         /**
113          * Finish this branch by appending '0's until the maximum code length has been reached.
114          */
115         public void finish() {
116             while (builder.length() < MAX_LENGTH) {
117                 builder.append('0');
118                 cachedString = null;
119             }
120         }
121 
122         @Override
123         public int hashCode() {
124             return toString().hashCode();
125         }
126 
127         /**
128          * Process the next replacement to be added to this branch.
129          *
130          * @param replacement
131          *            the next replacement to append
132          * @param forceAppend
133          *            indicates if the default processing shall be overridden
134          */
135         public void processNextReplacement(final String replacement, final boolean forceAppend) {
136             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
137 
138             if (append && builder.length() < MAX_LENGTH) {
139                 builder.append(replacement);
140                 // remove all characters after the maximum length
141                 if (builder.length() > MAX_LENGTH) {
142                     builder.delete(MAX_LENGTH, builder.length());
143                 }
144                 cachedString = null;
145             }
146 
147             lastReplacement = replacement;
148         }
149 
150         @Override
151         public String toString() {
152             if (cachedString == null) {
153                 cachedString = builder.toString();
154             }
155             return cachedString;
156         }
157     }
158 
159     /**
160      * Inner class for storing rules.
161      */
162     private static final class Rule {
163         private final String pattern;
164         private final String[] replacementAtStart;
165         private final String[] replacementBeforeVowel;
166         private final String[] replacementDefault;
167 
168         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
169                 final String replacementDefault) {
170             this.pattern = pattern;
171             this.replacementAtStart = replacementAtStart.split("\\|");
172             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
173             this.replacementDefault = replacementDefault.split("\\|");
174         }
175 
176         public int getPatternLength() {
177             return pattern.length();
178         }
179 
180         public String[] getReplacements(final String context, final boolean atStart) {
181             if (atStart) {
182                 return replacementAtStart;
183             }
184 
185             final int nextIndex = getPatternLength();
186             final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
187             if (nextCharIsVowel) {
188                 return replacementBeforeVowel;
189             }
190 
191             return replacementDefault;
192         }
193 
194         private boolean isVowel(final char ch) {
195             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
196         }
197 
198         public boolean matches(final String context) {
199             return context.startsWith(pattern);
200         }
201 
202         @Override
203         public String toString() {
204             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
205                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
206         }
207     }
208 
209     private static final String COMMENT = "//";
210     private static final String DOUBLE_QUOTE = "\"";
211 
212     private static final String MULTILINE_COMMENT_END = "*/";
213 
214     private static final String MULTILINE_COMMENT_START = "/*";
215 
216     /** The resource file containing the replacement and folding rules */
217     private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
218 
219     /** The code length of a DM soundex value. */
220     private static final int MAX_LENGTH = 6;
221 
222     /** Transformation rules indexed by the first character of their pattern. */
223     private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
224 
225     /** Folding rules. */
226     private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
227 
228     static {
229         final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
230         if (rulesIS == null) {
231             throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
232         }
233 
234         final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
235         parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
236         scanner.close();
237 
238         // sort RULES by pattern length in descending order
239         for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
240             final List<Rule> ruleList = rule.getValue();
241             Collections.sort(ruleList, new Comparator<Rule>() {
242                 @Override
243                 public int compare(final Rule rule1, final Rule rule2) {
244                     return rule2.getPatternLength() - rule1.getPatternLength();
245                 }
246             });
247         }
248     }
249 
250     private static void parseRules(final Scanner scanner, final String location,
251             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
252         int currentLine = 0;
253         boolean inMultilineComment = false;
254 
255         while (scanner.hasNextLine()) {
256             currentLine++;
257             final String rawLine = scanner.nextLine();
258             String line = rawLine;
259 
260             if (inMultilineComment) {
261                 if (line.endsWith(MULTILINE_COMMENT_END)) {
262                     inMultilineComment = false;
263                 }
264                 continue;
265             }
266 
267             if (line.startsWith(MULTILINE_COMMENT_START)) {
268                 inMultilineComment = true;
269             } else {
270                 // discard comments
271                 final int cmtI = line.indexOf(COMMENT);
272                 if (cmtI >= 0) {
273                     line = line.substring(0, cmtI);
274                 }
275 
276                 // trim leading-trailing whitespace
277                 line = line.trim();
278 
279                 if (line.length() == 0) {
280                     continue; // empty lines can be safely skipped
281                 }
282 
283                 if (line.contains("=")) {
284                     // folding
285                     final String[] parts = line.split("=");
286                     if (parts.length != 2) {
287                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
288                                 " parts: " + rawLine + " in " + location);
289                     } else {
290                         final String leftCharacter = parts[0];
291                         final String rightCharacter = parts[1];
292 
293                         if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
294                             throw new IllegalArgumentException("Malformed folding statement - " +
295                                     "patterns are not single characters: " + rawLine + " in " + location);
296                         }
297 
298                         asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
299                     }
300                 } else {
301                     // rule
302                     final String[] parts = line.split("\\s+");
303                     if (parts.length != 4) {
304                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
305                                 " parts: " + rawLine + " in " + location);
306                     } else {
307                         try {
308                             final String pattern = stripQuotes(parts[0]);
309                             final String replacement1 = stripQuotes(parts[1]);
310                             final String replacement2 = stripQuotes(parts[2]);
311                             final String replacement3 = stripQuotes(parts[3]);
312 
313                             final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
314                             final char patternKey = r.pattern.charAt(0);
315                             List<Rule> rules = ruleMapping.get(patternKey);
316                             if (rules == null) {
317                                 rules = new ArrayList<Rule>();
318                                 ruleMapping.put(patternKey, rules);
319                             }
320                             rules.add(r);
321                         } catch (final IllegalArgumentException e) {
322                             throw new IllegalStateException(
323                                     "Problem parsing line '" + currentLine + "' in " + location, e);
324                         }
325                     }
326                 }
327             }
328         }
329     }
330 
331     private static String stripQuotes(String str) {
332         if (str.startsWith(DOUBLE_QUOTE)) {
333             str = str.substring(1);
334         }
335 
336         if (str.endsWith(DOUBLE_QUOTE)) {
337             str = str.substring(0, str.length() - 1);
338         }
339 
340         return str;
341     }
342 
343     /** Whether to use ASCII folding prior to encoding. */
344     private final boolean folding;
345 
346     /**
347      * Creates a new instance with ASCII-folding enabled.
348      */
349     public DaitchMokotoffSoundex() {
350         this(true);
351     }
352 
353     /**
354      * Creates a new instance.
355      * <p>
356      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
357      * รจ -&gt; e.
358      * </p>
359      *
360      * @param folding
361      *            if ASCII-folding shall be performed before encoding
362      */
363     public DaitchMokotoffSoundex(final boolean folding) {
364         this.folding = folding;
365     }
366 
367     /**
368      * Performs a cleanup of the input string before the actual soundex transformation.
369      * <p>
370      * Removes all whitespace characters and performs ASCII folding if enabled.
371      * </p>
372      *
373      * @param input
374      *            the input string to cleanup
375      * @return a cleaned up string
376      */
377     private String cleanup(final String input) {
378         final StringBuilder sb = new StringBuilder();
379         for (char ch : input.toCharArray()) {
380             if (Character.isWhitespace(ch)) {
381                 continue;
382             }
383 
384             ch = Character.toLowerCase(ch);
385             if (folding && FOLDINGS.containsKey(ch)) {
386                 ch = FOLDINGS.get(ch);
387             }
388             sb.append(ch);
389         }
390         return sb.toString();
391     }
392 
393     /**
394      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
395      * <p>
396      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
397      * EncoderException if the supplied object is not of type java.lang.String.
398      * </p>
399      *
400      * @see #soundex(String)
401      *
402      * @param obj
403      *            Object to encode
404      * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
405      *         supplied.
406      * @throws EncoderException
407      *             if the parameter supplied is not of type java.lang.String
408      * @throws IllegalArgumentException
409      *             if a character is not mapped
410      */
411     @Override
412     public Object encode(final Object obj) throws EncoderException {
413         if (!(obj instanceof String)) {
414             throw new EncoderException(
415                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
416         }
417         return encode((String) obj);
418     }
419 
420     /**
421      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
422      *
423      * @see #soundex(String)
424      *
425      * @param source
426      *            A String object to encode
427      * @return A DM Soundex code corresponding to the String supplied
428      * @throws IllegalArgumentException
429      *             if a character is not mapped
430      */
431     @Override
432     public String encode(final String source) {
433         if (source == null) {
434             return null;
435         }
436         return soundex(source, false)[0];
437     }
438 
439     /**
440      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
441      * <p>
442      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
443      * separated by '|'.
444      * </p>
445      * <p>
446      * Example: the name "AUERBACH" is encoded as both
447      * </p>
448      * <ul>
449      * <li>097400</li>
450      * <li>097500</li>
451      * </ul>
452      * <p>
453      * Thus the result will be "097400|097500".
454      * </p>
455      *
456      * @param source
457      *            A String object to encode
458      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
459      * @throws IllegalArgumentException
460      *             if a character is not mapped
461      */
462     public String soundex(final String source) {
463         final String[] branches = soundex(source, true);
464         final StringBuilder sb = new StringBuilder();
465         int index = 0;
466         for (final String branch : branches) {
467             sb.append(branch);
468             if (++index < branches.length) {
469                 sb.append('|');
470             }
471         }
472         return sb.toString();
473     }
474 
475     /**
476      * Perform the actual DM Soundex algorithm on the input string.
477      *
478      * @param source
479      *            A String object to encode
480      * @param branching
481      *            If branching shall be performed
482      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
483      *         selected branching mode
484      */
485     private String[] soundex(final String source, final boolean branching) {
486         if (source == null) {
487             return null;
488         }
489 
490         final String input = cleanup(source);
491 
492         final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
493         currentBranches.add(new Branch());
494 
495         char lastChar = '\0';
496         for (int index = 0; index < input.length(); index++) {
497             final char ch = input.charAt(index);
498 
499             // ignore whitespace inside a name
500             if (Character.isWhitespace(ch)) {
501                 continue;
502             }
503 
504             final String inputContext = input.substring(index);
505             final List<Rule> rules = RULES.get(ch);
506             if (rules == null) {
507                 continue;
508             }
509 
510             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
511             @SuppressWarnings("unchecked")
512             final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
513 
514             for (final Rule rule : rules) {
515                 if (rule.matches(inputContext)) {
516                     if (branching) {
517                         nextBranches.clear();
518                     }
519                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
520                     final boolean branchingRequired = replacements.length > 1 && branching;
521 
522                     for (final Branch branch : currentBranches) {
523                         for (final String nextReplacement : replacements) {
524                             // if we have multiple replacements, always create a new branch
525                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
526 
527                             // special rule: occurrences of mn or nm are treated differently
528                             final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
529 
530                             nextBranch.processNextReplacement(nextReplacement, force);
531 
532                             if (branching) {
533                                 nextBranches.add(nextBranch);
534                             } else {
535                                 break;
536                             }
537                         }
538                     }
539 
540                     if (branching) {
541                         currentBranches.clear();
542                         currentBranches.addAll(nextBranches);
543                     }
544                     index += rule.getPatternLength() - 1;
545                     break;
546                 }
547             }
548 
549             lastChar = ch;
550         }
551 
552         final String[] result = new String[currentBranches.size()];
553         int index = 0;
554         for (final Branch branch : currentBranches) {
555             branch.finish();
556             result[index++] = branch.toString();
557         }
558 
559         return result;
560     }
561 }