DaitchMokotoffSoundex.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.codec.language;

  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.HashMap;
  22. import java.util.LinkedHashSet;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Scanner;
  26. import java.util.Set;

  27. import org.apache.commons.codec.CharEncoding;
  28. import org.apache.commons.codec.EncoderException;
  29. import org.apache.commons.codec.Resources;
  30. import org.apache.commons.codec.StringEncoder;

  31. /**
  32.  * Encodes a string into a Daitch-Mokotoff Soundex value.
  33.  * <p>
  34.  * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
  35.  * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
  36.  * </p>
  37.  * <p>
  38.  * The main differences compared to the other soundex variants are:
  39.  * </p>
  40.  * <ul>
  41.  * <li>coded names are 6 digits long
  42.  * <li>the initial character of the name is coded
  43.  * <li>rules to encoded multi-character n-grams
  44.  * <li>multiple possible encodings for the same name (branching)
  45.  * </ul>
  46.  * <p>
  47.  * This implementation supports branching, depending on the used method:
  48.  * <ul>
  49.  * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
  50.  * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
  51.  * </ul>
  52.  * <p>
  53.  * Note: This implementation has additional branching rules compared to the original description of the algorithm. The
  54.  * rules can be customized by overriding the default rules contained in the resource file
  55.  * {@code org/apache/commons/codec/language/dmrules.txt}.
  56.  * </p>
  57.  * <p>
  58.  * This class is thread-safe.
  59.  * </p>
  60.  *
  61.  * @see Soundex
  62.  * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
  63.  * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
  64.  * @since 1.10
  65.  */
  66. public class DaitchMokotoffSoundex implements StringEncoder {

  67.     /**
  68.      * Inner class representing a branch during DM soundex encoding.
  69.      */
  70.     private static final class Branch {
  71.         private final StringBuilder builder;
  72.         private String cachedString;
  73.         private String lastReplacement;

  74.         private Branch() {
  75.             builder = new StringBuilder();
  76.             lastReplacement = null;
  77.             cachedString = null;
  78.         }

  79.         /**
  80.          * Creates a new branch, identical to this branch.
  81.          *
  82.          * @return a new, identical branch
  83.          */
  84.         public Branch createBranch() {
  85.             final Branch branch = new Branch();
  86.             branch.builder.append(toString());
  87.             branch.lastReplacement = this.lastReplacement;
  88.             return branch;
  89.         }

  90.         @Override
  91.         public boolean equals(final Object other) {
  92.             if (this == other) {
  93.                 return true;
  94.             }
  95.             if (!(other instanceof Branch)) {
  96.                 return false;
  97.             }

  98.             return toString().equals(((Branch) other).toString());
  99.         }

  100.         /**
  101.          * Finish this branch by appending '0's until the maximum code length has been reached.
  102.          */
  103.         public void finish() {
  104.             while (builder.length() < MAX_LENGTH) {
  105.                 builder.append('0');
  106.                 cachedString = null;
  107.             }
  108.         }

  109.         @Override
  110.         public int hashCode() {
  111.             return toString().hashCode();
  112.         }

  113.         /**
  114.          * Process the next replacement to be added to this branch.
  115.          *
  116.          * @param replacement
  117.          *            the next replacement to append
  118.          * @param forceAppend
  119.          *            indicates if the default processing shall be overridden
  120.          */
  121.         public void processNextReplacement(final String replacement, final boolean forceAppend) {
  122.             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;

  123.             if (append && builder.length() < MAX_LENGTH) {
  124.                 builder.append(replacement);
  125.                 // remove all characters after the maximum length
  126.                 if (builder.length() > MAX_LENGTH) {
  127.                     builder.delete(MAX_LENGTH, builder.length());
  128.                 }
  129.                 cachedString = null;
  130.             }

  131.             lastReplacement = replacement;
  132.         }

  133.         @Override
  134.         public String toString() {
  135.             if (cachedString == null) {
  136.                 cachedString = builder.toString();
  137.             }
  138.             return cachedString;
  139.         }
  140.     }

  141.     /**
  142.      * Inner class for storing rules.
  143.      */
  144.     private static final class Rule {
  145.         private final String pattern;
  146.         private final String[] replacementAtStart;
  147.         private final String[] replacementBeforeVowel;
  148.         private final String[] replacementDefault;

  149.         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
  150.                 final String replacementDefault) {
  151.             this.pattern = pattern;
  152.             this.replacementAtStart = replacementAtStart.split("\\|");
  153.             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
  154.             this.replacementDefault = replacementDefault.split("\\|");
  155.         }

  156.         public int getPatternLength() {
  157.             return pattern.length();
  158.         }

  159.         public String[] getReplacements(final String context, final boolean atStart) {
  160.             if (atStart) {
  161.                 return replacementAtStart;
  162.             }

  163.             final int nextIndex = getPatternLength();
  164.             final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
  165.             if (nextCharIsVowel) {
  166.                 return replacementBeforeVowel;
  167.             }

  168.             return replacementDefault;
  169.         }

  170.         private boolean isVowel(final char ch) {
  171.             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
  172.         }

  173.         public boolean matches(final String context) {
  174.             return context.startsWith(pattern);
  175.         }

  176.         @Override
  177.         public String toString() {
  178.             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
  179.                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
  180.         }
  181.     }

  182.     private static final String COMMENT = "//";

  183.     private static final String DOUBLE_QUOTE = "\"";

  184.     private static final String MULTILINE_COMMENT_END = "*/";

  185.     private static final String MULTILINE_COMMENT_START = "/*";

  186.     /** The resource file containing the replacement and folding rules */
  187.     private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";

  188.     /** The code length of a DM soundex value. */
  189.     private static final int MAX_LENGTH = 6;

  190.     /** Transformation rules indexed by the first character of their pattern. */
  191.     private static final Map<Character, List<Rule>> RULES = new HashMap<>();

  192.     /** Folding rules. */
  193.     private static final Map<Character, Character> FOLDINGS = new HashMap<>();

  194.     static {
  195.         try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
  196.             parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
  197.         }

  198.         // sort RULES by pattern length in descending order
  199.         RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
  200.     }

  201.     private static void parseRules(final Scanner scanner, final String location,
  202.             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
  203.         int currentLine = 0;
  204.         boolean inMultilineComment = false;

  205.         while (scanner.hasNextLine()) {
  206.             currentLine++;
  207.             final String rawLine = scanner.nextLine();
  208.             String line = rawLine;

  209.             if (inMultilineComment) {
  210.                 if (line.endsWith(MULTILINE_COMMENT_END)) {
  211.                     inMultilineComment = false;
  212.                 }
  213.                 continue;
  214.             }

  215.             if (line.startsWith(MULTILINE_COMMENT_START)) {
  216.                 inMultilineComment = true;
  217.             } else {
  218.                 // discard comments
  219.                 final int cmtI = line.indexOf(COMMENT);
  220.                 if (cmtI >= 0) {
  221.                     line = line.substring(0, cmtI);
  222.                 }

  223.                 // trim leading-trailing whitespace
  224.                 line = line.trim();

  225.                 if (line.isEmpty()) {
  226.                     continue; // empty lines can be safely skipped
  227.                 }

  228.                 if (line.contains("=")) {
  229.                     // folding
  230.                     final String[] parts = line.split("=");
  231.                     if (parts.length != 2) {
  232.                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
  233.                                 " parts: " + rawLine + " in " + location);
  234.                     }
  235.                     final String leftCharacter = parts[0];
  236.                     final String rightCharacter = parts[1];

  237.                     if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
  238.                         throw new IllegalArgumentException("Malformed folding statement - " +
  239.                                 "patterns are not single characters: " + rawLine + " in " + location);
  240.                     }

  241.                     asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
  242.                 } else {
  243.                     // rule
  244.                     final String[] parts = line.split("\\s+");
  245.                     if (parts.length != 4) {
  246.                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
  247.                                 " parts: " + rawLine + " in " + location);
  248.                     }
  249.                     try {
  250.                         final String pattern = stripQuotes(parts[0]);
  251.                         final String replacement1 = stripQuotes(parts[1]);
  252.                         final String replacement2 = stripQuotes(parts[2]);
  253.                         final String replacement3 = stripQuotes(parts[3]);

  254.                         final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
  255.                         final char patternKey = r.pattern.charAt(0);
  256.                         final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
  257.                         rules.add(r);
  258.                     } catch (final IllegalArgumentException e) {
  259.                         throw new IllegalStateException(
  260.                                 "Problem parsing line '" + currentLine + "' in " + location, e);
  261.                     }
  262.                 }
  263.             }
  264.         }
  265.     }

  266.     private static String stripQuotes(String str) {
  267.         if (str.startsWith(DOUBLE_QUOTE)) {
  268.             str = str.substring(1);
  269.         }

  270.         if (str.endsWith(DOUBLE_QUOTE)) {
  271.             str = str.substring(0, str.length() - 1);
  272.         }

  273.         return str;
  274.     }

  275.     /** Whether to use ASCII folding prior to encoding. */
  276.     private final boolean folding;

  277.     /**
  278.      * Creates a new instance with ASCII-folding enabled.
  279.      */
  280.     public DaitchMokotoffSoundex() {
  281.         this(true);
  282.     }

  283.     /**
  284.      * Creates a new instance.
  285.      * <p>
  286.      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example
  287.      * รจ -&gt; e.
  288.      * </p>
  289.      *
  290.      * @param folding
  291.      *            if ASCII-folding shall be performed before encoding
  292.      */
  293.     public DaitchMokotoffSoundex(final boolean folding) {
  294.         this.folding = folding;
  295.     }

  296.     /**
  297.      * Performs a cleanup of the input string before the actual soundex transformation.
  298.      * <p>
  299.      * Removes all whitespace characters and performs ASCII folding if enabled.
  300.      * </p>
  301.      *
  302.      * @param input
  303.      *            the input string to clean up
  304.      * @return a cleaned up string
  305.      */
  306.     private String cleanup(final String input) {
  307.         final StringBuilder sb = new StringBuilder();
  308.         for (char ch : input.toCharArray()) {
  309.             if (Character.isWhitespace(ch)) {
  310.                 continue;
  311.             }

  312.             ch = Character.toLowerCase(ch);
  313.             final Character character = FOLDINGS.get(ch);
  314.             if (folding && character != null) {
  315.                 ch = character;
  316.             }
  317.             sb.append(ch);
  318.         }
  319.         return sb.toString();
  320.     }

  321.     /**
  322.      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
  323.      * <p>
  324.      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
  325.      * EncoderException if the supplied object is not of type {@link String}.
  326.      * </p>
  327.      *
  328.      * @see #soundex(String)
  329.      * @param obj
  330.      *            Object to encode
  331.      * @return An object (of type {@link String}) containing the DM soundex code, which corresponds to the String
  332.      *         supplied.
  333.      * @throws EncoderException
  334.      *             if the parameter supplied is not of type {@link String}
  335.      * @throws IllegalArgumentException
  336.      *             if a character is not mapped
  337.      */
  338.     @Override
  339.     public Object encode(final Object obj) throws EncoderException {
  340.         if (!(obj instanceof String)) {
  341.             throw new EncoderException(
  342.                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
  343.         }
  344.         return encode((String) obj);
  345.     }

  346.     /**
  347.      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
  348.      *
  349.      * @see #soundex(String)
  350.      * @param source
  351.      *            A String object to encode
  352.      * @return A DM Soundex code corresponding to the String supplied
  353.      * @throws IllegalArgumentException
  354.      *             if a character is not mapped
  355.      */
  356.     @Override
  357.     public String encode(final String source) {
  358.         if (source == null) {
  359.             return null;
  360.         }
  361.         return soundex(source, false)[0];
  362.     }

  363.     /**
  364.      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
  365.      * <p>
  366.      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
  367.      * separated by '|'.
  368.      * </p>
  369.      * <p>
  370.      * Example: the name "AUERBACH" is encoded as both
  371.      * </p>
  372.      * <ul>
  373.      * <li>097400</li>
  374.      * <li>097500</li>
  375.      * </ul>
  376.      * <p>
  377.      * Thus the result will be "097400|097500".
  378.      * </p>
  379.      *
  380.      * @param source
  381.      *            A String object to encode
  382.      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
  383.      * @throws IllegalArgumentException
  384.      *             if a character is not mapped
  385.      */
  386.     public String soundex(final String source) {
  387.         return String.join("|", soundex(source, true));
  388.     }

  389.     /**
  390.      * Perform the actual DM Soundex algorithm on the input string.
  391.      *
  392.      * @param source
  393.      *            A String object to encode
  394.      * @param branching
  395.      *            If branching shall be performed
  396.      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
  397.      *         selected branching mode
  398.      */
  399.     private String[] soundex(final String source, final boolean branching) {
  400.         if (source == null) {
  401.             return null;
  402.         }

  403.         final String input = cleanup(source);

  404.         final Set<Branch> currentBranches = new LinkedHashSet<>();
  405.         currentBranches.add(new Branch());

  406.         char lastChar = '\0';
  407.         for (int index = 0; index < input.length(); index++) {
  408.             final char ch = input.charAt(index);

  409.             // ignore whitespace inside a name
  410.             if (Character.isWhitespace(ch)) {
  411.                 continue;
  412.             }

  413.             final String inputContext = input.substring(index);
  414.             final List<Rule> rules = RULES.get(ch);
  415.             if (rules == null) {
  416.                 continue;
  417.             }

  418.             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
  419.             final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();

  420.             for (final Rule rule : rules) {
  421.                 if (rule.matches(inputContext)) {
  422.                     if (branching) {
  423.                         nextBranches.clear();
  424.                     }
  425.                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
  426.                     final boolean branchingRequired = replacements.length > 1 && branching;

  427.                     for (final Branch branch : currentBranches) {
  428.                         for (final String nextReplacement : replacements) {
  429.                             // if we have multiple replacements, always create a new branch
  430.                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;

  431.                             // special rule: occurrences of mn or nm are treated differently
  432.                             final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';

  433.                             nextBranch.processNextReplacement(nextReplacement, force);

  434.                             if (!branching) {
  435.                                 break;
  436.                             }
  437.                             nextBranches.add(nextBranch);
  438.                         }
  439.                     }

  440.                     if (branching) {
  441.                         currentBranches.clear();
  442.                         currentBranches.addAll(nextBranches);
  443.                     }
  444.                     index += rule.getPatternLength() - 1;
  445.                     break;
  446.                 }
  447.             }

  448.             lastChar = ch;
  449.         }

  450.         final String[] result = new String[currentBranches.size()];
  451.         int index = 0;
  452.         for (final Branch branch : currentBranches) {
  453.             branch.finish();
  454.             result[index++] = branch.toString();
  455.         }

  456.         return result;
  457.     }
  458. }