DaitchMokotoffSoundex.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.codec.language;

  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.HashMap;
  22. import java.util.LinkedHashSet;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Scanner;
  26. import java.util.Set;
  27. import java.util.regex.Pattern;

  28. import org.apache.commons.codec.CharEncoding;
  29. import org.apache.commons.codec.EncoderException;
  30. import org.apache.commons.codec.Resources;
  31. import org.apache.commons.codec.StringEncoder;

  32. /**
  33.  * Encodes a string into a Daitch-Mokotoff Soundex value.
  34.  * <p>
  35.  * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
  36.  * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
  37.  * </p>
  38.  * <p>
  39.  * The main differences compared to the other Soundex variants are:
  40.  * </p>
  41.  * <ul>
  42.  * <li>coded names are 6 digits long
  43.  * <li>the initial character of the name is coded
  44.  * <li>rules to encoded multi-character n-grams
  45.  * <li>multiple possible encodings for the same name (branching)
  46.  * </ul>
  47.  * <p>
  48.  * This implementation supports branching, depending on the used method:
  49.  * <ul>
  50.  * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
  51.  * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
  52.  * </ul>
  53.  * <p>
  54.  * Note: This implementation has additional branching rules compared to the original description of the algorithm. The
  55.  * rules can be customized by overriding the default rules contained in the resource file
  56.  * {@code org/apache/commons/codec/language/dmrules.txt}.
  57.  * </p>
  58.  * <p>
  59.  * This class is thread-safe.
  60.  * </p>
  61.  *
  62.  * @see Soundex
  63.  * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
  64.  * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
  65.  * @since 1.10
  66.  */
  67. public class DaitchMokotoffSoundex implements StringEncoder {

  68.     /**
  69.      * Inner class representing a branch during DM Soundex encoding.
  70.      */
  71.     private static final class Branch {
  72.         private final StringBuilder builder;
  73.         private String cachedString;
  74.         private String lastReplacement;

  75.         private Branch() {
  76.             builder = new StringBuilder();
  77.         }

  78.         /**
  79.          * Creates a new branch, identical to this branch.
  80.          *
  81.          * @return a new, identical branch
  82.          */
  83.         private Branch createBranch() {
  84.             final Branch branch = new Branch();
  85.             branch.builder.append(toString());
  86.             branch.lastReplacement = this.lastReplacement;
  87.             return branch;
  88.         }

  89.         @Override
  90.         public boolean equals(final Object other) {
  91.             if (this == other) {
  92.                 return true;
  93.             }
  94.             if (!(other instanceof Branch)) {
  95.                 return false;
  96.             }
  97.             return toString().equals(((Branch) other).toString());
  98.         }

  99.         /**
  100.          * Finish this branch by appending '0's until the maximum code length has been reached.
  101.          */
  102.         private void finish() {
  103.             while (builder.length() < MAX_LENGTH) {
  104.                 builder.append('0');
  105.                 cachedString = null;
  106.             }
  107.         }

  108.         @Override
  109.         public int hashCode() {
  110.             return toString().hashCode();
  111.         }

  112.         /**
  113.          * Process the next replacement to be added to this branch.
  114.          *
  115.          * @param replacement
  116.          *            the next replacement to append.
  117.          * @param forceAppend
  118.          *            indicates if the default processing shall be overridden.
  119.          */
  120.         private void processNextReplacement(final String replacement, final boolean forceAppend) {
  121.             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
  122.             if (append && builder.length() < MAX_LENGTH) {
  123.                 builder.append(replacement);
  124.                 // remove all characters after the maximum length
  125.                 if (builder.length() > MAX_LENGTH) {
  126.                     builder.delete(MAX_LENGTH, builder.length());
  127.                 }
  128.                 cachedString = null;
  129.             }
  130.             lastReplacement = replacement;
  131.         }

  132.         @Override
  133.         public String toString() {
  134.             if (cachedString == null) {
  135.                 cachedString = builder.toString();
  136.             }
  137.             return cachedString;
  138.         }
  139.     }

  140.     /**
  141.      * Inner class for storing rules.
  142.      */
  143.     private static final class Rule {
  144.         private static final Pattern PIPE = Pattern.compile("\\|");
  145.         private final String pattern;
  146.         private final String[] replacementAtStart;
  147.         private final String[] replacementBeforeVowel;
  148.         private final String[] replacementDefault;

  149.         private Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
  150.                 final String replacementDefault) {
  151.             this.pattern = pattern;
  152.             this.replacementAtStart = PIPE.split(replacementAtStart);
  153.             this.replacementBeforeVowel = PIPE.split(replacementBeforeVowel);
  154.             this.replacementDefault = PIPE.split(replacementDefault);
  155.         }

  156.         private int getPatternLength() {
  157.             return pattern.length();
  158.         }

  159.         private String[] getReplacements(final String context, final boolean atStart) {
  160.             if (atStart) {
  161.                 return replacementAtStart;
  162.             }

  163.             final int nextIndex = getPatternLength();
  164.             final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
  165.             if (nextCharIsVowel) {
  166.                 return replacementBeforeVowel;
  167.             }

  168.             return replacementDefault;
  169.         }

  170.         private boolean isVowel(final char ch) {
  171.             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
  172.         }

  173.         private boolean matches(final String context) {
  174.             return context.startsWith(pattern);
  175.         }

  176.         @Override
  177.         public String toString() {
  178.             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
  179.                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
  180.         }
  181.     }

  182.     /**
  183.      * The NUL character.
  184.      */
  185.     private static final char NUL = '\0';

  186.     private static final String COMMENT = "//";

  187.     private static final String DOUBLE_QUOTE = "\"";

  188.     private static final String MULTILINE_COMMENT_END = "*/";

  189.     private static final String MULTILINE_COMMENT_START = "/*";

  190.     /** The resource file containing the replacement and folding rules */
  191.     private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";

  192.     /** The code length of a DM Soundex value. */
  193.     private static final int MAX_LENGTH = 6;

  194.     /** Transformation rules indexed by the first character of their pattern. */
  195.     private static final Map<Character, List<Rule>> RULES = new HashMap<>();

  196.     /** Folding rules. */
  197.     private static final Map<Character, Character> FOLDINGS = new HashMap<>();

  198.     private static final Pattern EQUAL = Pattern.compile("=");

  199.     private static final Pattern SPACES = Pattern.compile("\\s+");

  200.     static {
  201.         try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
  202.             parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
  203.         }
  204.         // sort RULES by pattern length in descending order
  205.         RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
  206.     }

  207.     private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping,
  208.             final Map<Character, Character> asciiFoldings) {
  209.         int currentLine = 0;
  210.         boolean inMultilineComment = false;
  211.         while (scanner.hasNextLine()) {
  212.             currentLine++;
  213.             final String rawLine = scanner.nextLine();
  214.             String line = rawLine;
  215.             if (inMultilineComment) {
  216.                 if (line.endsWith(MULTILINE_COMMENT_END)) {
  217.                     inMultilineComment = false;
  218.                 }
  219.                 continue;
  220.             }
  221.             if (line.startsWith(MULTILINE_COMMENT_START)) {
  222.                 inMultilineComment = true;
  223.             } else {
  224.                 // discard comments
  225.                 final int cmtI = line.indexOf(COMMENT);
  226.                 if (cmtI >= 0) {
  227.                     line = line.substring(0, cmtI);
  228.                 }
  229.                 // trim leading-trailing whitespace
  230.                 line = line.trim();
  231.                 if (line.isEmpty()) {
  232.                     continue; // empty lines can be safely skipped
  233.                 }
  234.                 if (line.contains("=")) {
  235.                     // folding
  236.                     final String[] parts = EQUAL.split(line);
  237.                     if (parts.length != 2) {
  238.                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location);
  239.                     }
  240.                     final String leftCharacter = parts[0];
  241.                     final String rightCharacter = parts[1];
  242.                     if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
  243.                         throw new IllegalArgumentException(
  244.                                 "Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location);
  245.                     }
  246.                     asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
  247.                 } else {
  248.                     // rule
  249.                     final String[] parts = SPACES.split(line);
  250.                     if (parts.length != 4) {
  251.                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
  252.                     }
  253.                     try {
  254.                         final String pattern = stripQuotes(parts[0]);
  255.                         final String replacement1 = stripQuotes(parts[1]);
  256.                         final String replacement2 = stripQuotes(parts[2]);
  257.                         final String replacement3 = stripQuotes(parts[3]);
  258.                         final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
  259.                         final char patternKey = r.pattern.charAt(0);
  260.                         final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
  261.                         rules.add(r);
  262.                     } catch (final IllegalArgumentException e) {
  263.                         throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
  264.                     }
  265.                 }
  266.             }
  267.         }
  268.     }

  269.     private static String stripQuotes(String str) {
  270.         if (str.startsWith(DOUBLE_QUOTE)) {
  271.             str = str.substring(1);
  272.         }
  273.         if (str.endsWith(DOUBLE_QUOTE)) {
  274.             str = str.substring(0, str.length() - 1);
  275.         }
  276.         return str;
  277.     }

  278.     /** Whether to use ASCII folding prior to encoding. */
  279.     private final boolean folding;

  280.     /**
  281.      * Creates a new instance with ASCII-folding enabled.
  282.      */
  283.     public DaitchMokotoffSoundex() {
  284.         this(true);
  285.     }

  286.     /**
  287.      * Creates a new instance.
  288.      * <p>
  289.      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example
  290.      * รจ -&gt; e.
  291.      * </p>
  292.      *
  293.      * @param folding
  294.      *            if ASCII-folding shall be performed before encoding
  295.      */
  296.     public DaitchMokotoffSoundex(final boolean folding) {
  297.         this.folding = folding;
  298.     }

  299.     /**
  300.      * Performs a cleanup of the input string before the actual Soundex transformation.
  301.      * <p>
  302.      * Removes all whitespace characters and performs ASCII folding if enabled.
  303.      * </p>
  304.      *
  305.      * @param input
  306.      *            the input string to clean up.
  307.      * @return a cleaned up string.
  308.      */
  309.     private String cleanup(final String input) {
  310.         final StringBuilder sb = new StringBuilder();
  311.         for (char ch : input.toCharArray()) {
  312.             if (Character.isWhitespace(ch) || !Character.isLetter(ch)) {
  313.                 continue;
  314.             }
  315.             ch = Character.toLowerCase(ch);
  316.             final Character character = FOLDINGS.get(ch);
  317.             if (folding && character != null) {
  318.                 ch = character;
  319.             }
  320.             sb.append(ch);
  321.         }
  322.         return sb.toString();
  323.     }

  324.     /**
  325.      * Encodes an Object using the Daitch-Mokotoff Soundex algorithm without branching.
  326.      * <p>
  327.      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
  328.      * EncoderException if the supplied object is not of type {@link String}.
  329.      * </p>
  330.      *
  331.      * @see #soundex(String)
  332.      * @param obj
  333.      *            Object to encode.
  334.      * @return An object (of type {@link String}) containing the DM Soundex code, which corresponds to the String
  335.      *         supplied.
  336.      * @throws EncoderException
  337.      *             if the parameter supplied is not of type {@link String}.
  338.      * @throws IllegalArgumentException
  339.      *             if a character is not mapped.
  340.      */
  341.     @Override
  342.     public Object encode(final Object obj) throws EncoderException {
  343.         if (!(obj instanceof String)) {
  344.             throw new EncoderException(
  345.                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
  346.         }
  347.         return encode((String) obj);
  348.     }

  349.     /**
  350.      * Encodes a String using the Daitch-Mokotoff Soundex algorithm without branching.
  351.      *
  352.      * @see #soundex(String)
  353.      * @param source
  354.      *            A String object to encode.
  355.      * @return A DM Soundex code corresponding to the String supplied.
  356.      * @throws IllegalArgumentException
  357.      *             if a character is not mapped.
  358.      */
  359.     @Override
  360.     public String encode(final String source) {
  361.         if (source == null) {
  362.             return null;
  363.         }
  364.         return soundex(source, false)[0];
  365.     }

  366.     /**
  367.      * Encodes a String using the Daitch-Mokotoff Soundex algorithm with branching.
  368.      * <p>
  369.      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
  370.      * separated by '|'.
  371.      * </p>
  372.      * <p>
  373.      * Example: the name "AUERBACH" is encoded as both
  374.      * </p>
  375.      * <ul>
  376.      * <li>097400</li>
  377.      * <li>097500</li>
  378.      * </ul>
  379.      * <p>
  380.      * Thus the result will be "097400|097500".
  381.      * </p>
  382.      *
  383.      * @param source
  384.      *            A String object to encode.
  385.      * @return A string containing a set of DM Soundex codes corresponding to the String supplied.
  386.      * @throws IllegalArgumentException
  387.      *             if a character is not mapped.
  388.      */
  389.     public String soundex(final String source) {
  390.         return String.join("|", soundex(source, true));
  391.     }

  392.     /**
  393.      * Perform the actual DM Soundex algorithm on the input string.
  394.      *
  395.      * @param source
  396.      *            A String object to encode.
  397.      * @param branching
  398.      *            If branching shall be performed.
  399.      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
  400.      *         selected branching mode.
  401.      */
  402.     private String[] soundex(final String source, final boolean branching) {
  403.         if (source == null) {
  404.             return null;
  405.         }
  406.         final String input = cleanup(source);
  407.         final Set<Branch> currentBranches = new LinkedHashSet<>();
  408.         currentBranches.add(new Branch());
  409.         char lastChar = NUL;
  410.         for (int index = 0; index < input.length(); index++) {
  411.             final char ch = input.charAt(index);
  412.             final String inputContext = input.substring(index);
  413.             final List<Rule> rules = RULES.get(ch);
  414.             if (rules == null) {
  415.                 continue;
  416.             }
  417.             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
  418.             final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
  419.             for (final Rule rule : rules) {
  420.                 if (rule.matches(inputContext)) {
  421.                     if (branching) {
  422.                         nextBranches.clear();
  423.                     }
  424.                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
  425.                     final boolean branchingRequired = replacements.length > 1 && branching;
  426.                     for (final Branch branch : currentBranches) {
  427.                         for (final String nextReplacement : replacements) {
  428.                             // if we have multiple replacements, always create a new branch
  429.                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
  430.                             // special rule: occurrences of mn or nm are treated differently
  431.                             final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
  432.                             nextBranch.processNextReplacement(nextReplacement, force);
  433.                             if (!branching) {
  434.                                 break;
  435.                             }
  436.                             nextBranches.add(nextBranch);
  437.                         }
  438.                     }
  439.                     if (branching) {
  440.                         currentBranches.clear();
  441.                         currentBranches.addAll(nextBranches);
  442.                     }
  443.                     index += rule.getPatternLength() - 1;
  444.                     break;
  445.                 }
  446.             }
  447.             lastChar = ch;
  448.         }
  449.         final String[] result = new String[currentBranches.size()];
  450.         int index = 0;
  451.         for (final Branch branch : currentBranches) {
  452.             branch.finish();
  453.             result[index++] = branch.toString();
  454.         }
  455.         return result;
  456.     }
  457. }