DaitchMokotoffSoundex.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.codec.language;

  18. import java.io.InputStream;
  19. import java.util.ArrayList;
  20. import java.util.Arrays;
  21. import java.util.Collections;
  22. import java.util.Comparator;
  23. import java.util.HashMap;
  24. import java.util.LinkedHashSet;
  25. import java.util.List;
  26. import java.util.Map;
  27. import java.util.Scanner;
  28. import java.util.Set;

  29. import org.apache.commons.codec.CharEncoding;
  30. import org.apache.commons.codec.EncoderException;
  31. import org.apache.commons.codec.StringEncoder;

  32. /**
  33.  * Encodes a string into a Daitch-Mokotoff Soundex value.
  34.  * <p>
  35.  * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
  36.  * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
  37.  * </p>
  38.  * <p>
  39.  * The main differences compared to the other soundex variants are:
  40.  * </p>
  41.  * <ul>
  42.  * <li>coded names are 6 digits long
  43.  * <li>the initial character of the name is coded
  44.  * <li>rules to encoded multi-character n-grams
  45.  * <li>multiple possible encodings for the same name (branching)
  46.  * </ul>
  47.  * <p>
  48.  * This implementation supports branching, depending on the used method:
  49.  * <ul>
  50.  * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
  51.  * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
  52.  * </ul>
  53.  * <p>
  54.  * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
  55.  * rules can be customized by overriding the default rules contained in the resource file
  56.  * {@code org/apache/commons/codec/language/dmrules.txt}.
  57.  * </p>
  58.  * <p>
  59.  * This class is thread-safe.
  60.  * </p>
  61.  *
  62.  * @see Soundex
  63.  * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
  64.  * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
  65.  *
  66.  * @version $Id: DaitchMokotoffSoundex.java 1760691 2016-09-14 12:14:26Z jochen $
  67.  * @since 1.10
  68.  */
  69. public class DaitchMokotoffSoundex implements StringEncoder {

  70.     /**
  71.      * Inner class representing a branch during DM soundex encoding.
  72.      */
  73.     private static final class Branch {
  74.         private final StringBuilder builder;
  75.         private String cachedString;
  76.         private String lastReplacement;

  77.         private Branch() {
  78.             builder = new StringBuilder();
  79.             lastReplacement = null;
  80.             cachedString = null;
  81.         }

  82.         /**
  83.          * Creates a new branch, identical to this branch.
  84.          *
  85.          * @return a new, identical branch
  86.          */
  87.         public Branch createBranch() {
  88.             final Branch branch = new Branch();
  89.             branch.builder.append(toString());
  90.             branch.lastReplacement = this.lastReplacement;
  91.             return branch;
  92.         }

  93.         @Override
  94.         public boolean equals(final Object other) {
  95.             if (this == other) {
  96.                 return true;
  97.             }
  98.             if (!(other instanceof Branch)) {
  99.                 return false;
  100.             }

  101.             return toString().equals(((Branch) other).toString());
  102.         }

  103.         /**
  104.          * Finish this branch by appending '0's until the maximum code length has been reached.
  105.          */
  106.         public void finish() {
  107.             while (builder.length() < MAX_LENGTH) {
  108.                 builder.append('0');
  109.                 cachedString = null;
  110.             }
  111.         }

  112.         @Override
  113.         public int hashCode() {
  114.             return toString().hashCode();
  115.         }

  116.         /**
  117.          * Process the next replacement to be added to this branch.
  118.          *
  119.          * @param replacement
  120.          *            the next replacement to append
  121.          * @param forceAppend
  122.          *            indicates if the default processing shall be overridden
  123.          */
  124.         public void processNextReplacement(final String replacement, final boolean forceAppend) {
  125.             final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;

  126.             if (append && builder.length() < MAX_LENGTH) {
  127.                 builder.append(replacement);
  128.                 // remove all characters after the maximum length
  129.                 if (builder.length() > MAX_LENGTH) {
  130.                     builder.delete(MAX_LENGTH, builder.length());
  131.                 }
  132.                 cachedString = null;
  133.             }

  134.             lastReplacement = replacement;
  135.         }

  136.         @Override
  137.         public String toString() {
  138.             if (cachedString == null) {
  139.                 cachedString = builder.toString();
  140.             }
  141.             return cachedString;
  142.         }
  143.     }

  144.     /**
  145.      * Inner class for storing rules.
  146.      */
  147.     private static final class Rule {
  148.         private final String pattern;
  149.         private final String[] replacementAtStart;
  150.         private final String[] replacementBeforeVowel;
  151.         private final String[] replacementDefault;

  152.         protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
  153.                 final String replacementDefault) {
  154.             this.pattern = pattern;
  155.             this.replacementAtStart = replacementAtStart.split("\\|");
  156.             this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
  157.             this.replacementDefault = replacementDefault.split("\\|");
  158.         }

  159.         public int getPatternLength() {
  160.             return pattern.length();
  161.         }

  162.         public String[] getReplacements(final String context, final boolean atStart) {
  163.             if (atStart) {
  164.                 return replacementAtStart;
  165.             }

  166.             final int nextIndex = getPatternLength();
  167.             final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
  168.             if (nextCharIsVowel) {
  169.                 return replacementBeforeVowel;
  170.             }

  171.             return replacementDefault;
  172.         }

  173.         private boolean isVowel(final char ch) {
  174.             return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
  175.         }

  176.         public boolean matches(final String context) {
  177.             return context.startsWith(pattern);
  178.         }

  179.         @Override
  180.         public String toString() {
  181.             return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
  182.                     Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
  183.         }
  184.     }

  185.     private static final String COMMENT = "//";
  186.     private static final String DOUBLE_QUOTE = "\"";

  187.     private static final String MULTILINE_COMMENT_END = "*/";

  188.     private static final String MULTILINE_COMMENT_START = "/*";

  189.     /** The resource file containing the replacement and folding rules */
  190.     private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";

  191.     /** The code length of a DM soundex value. */
  192.     private static final int MAX_LENGTH = 6;

  193.     /** Transformation rules indexed by the first character of their pattern. */
  194.     private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();

  195.     /** Folding rules. */
  196.     private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();

  197.     static {
  198.         final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
  199.         if (rulesIS == null) {
  200.             throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
  201.         }

  202.         final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
  203.         try {
  204.             parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
  205.         } finally {
  206.             scanner.close();
  207.         }

  208.         // sort RULES by pattern length in descending order
  209.         for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
  210.             final List<Rule> ruleList = rule.getValue();
  211.             Collections.sort(ruleList, new Comparator<Rule>() {
  212.                 @Override
  213.                 public int compare(final Rule rule1, final Rule rule2) {
  214.                     return rule2.getPatternLength() - rule1.getPatternLength();
  215.                 }
  216.             });
  217.         }
  218.     }

  219.     private static void parseRules(final Scanner scanner, final String location,
  220.             final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
  221.         int currentLine = 0;
  222.         boolean inMultilineComment = false;

  223.         while (scanner.hasNextLine()) {
  224.             currentLine++;
  225.             final String rawLine = scanner.nextLine();
  226.             String line = rawLine;

  227.             if (inMultilineComment) {
  228.                 if (line.endsWith(MULTILINE_COMMENT_END)) {
  229.                     inMultilineComment = false;
  230.                 }
  231.                 continue;
  232.             }

  233.             if (line.startsWith(MULTILINE_COMMENT_START)) {
  234.                 inMultilineComment = true;
  235.             } else {
  236.                 // discard comments
  237.                 final int cmtI = line.indexOf(COMMENT);
  238.                 if (cmtI >= 0) {
  239.                     line = line.substring(0, cmtI);
  240.                 }

  241.                 // trim leading-trailing whitespace
  242.                 line = line.trim();

  243.                 if (line.length() == 0) {
  244.                     continue; // empty lines can be safely skipped
  245.                 }

  246.                 if (line.contains("=")) {
  247.                     // folding
  248.                     final String[] parts = line.split("=");
  249.                     if (parts.length != 2) {
  250.                         throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
  251.                                 " parts: " + rawLine + " in " + location);
  252.                     }
  253.                     final String leftCharacter = parts[0];
  254.                     final String rightCharacter = parts[1];

  255.                     if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
  256.                         throw new IllegalArgumentException("Malformed folding statement - " +
  257.                                 "patterns are not single characters: " + rawLine + " in " + location);
  258.                     }

  259.                     asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
  260.                 } else {
  261.                     // rule
  262.                     final String[] parts = line.split("\\s+");
  263.                     if (parts.length != 4) {
  264.                         throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
  265.                                 " parts: " + rawLine + " in " + location);
  266.                     }
  267.                     try {
  268.                         final String pattern = stripQuotes(parts[0]);
  269.                         final String replacement1 = stripQuotes(parts[1]);
  270.                         final String replacement2 = stripQuotes(parts[2]);
  271.                         final String replacement3 = stripQuotes(parts[3]);

  272.                         final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
  273.                         final char patternKey = r.pattern.charAt(0);
  274.                         List<Rule> rules = ruleMapping.get(patternKey);
  275.                         if (rules == null) {
  276.                             rules = new ArrayList<Rule>();
  277.                             ruleMapping.put(patternKey, rules);
  278.                         }
  279.                         rules.add(r);
  280.                     } catch (final IllegalArgumentException e) {
  281.                         throw new IllegalStateException(
  282.                                 "Problem parsing line '" + currentLine + "' in " + location, e);
  283.                     }
  284.                 }
  285.             }
  286.         }
  287.     }

  288.     private static String stripQuotes(String str) {
  289.         if (str.startsWith(DOUBLE_QUOTE)) {
  290.             str = str.substring(1);
  291.         }

  292.         if (str.endsWith(DOUBLE_QUOTE)) {
  293.             str = str.substring(0, str.length() - 1);
  294.         }

  295.         return str;
  296.     }

  297.     /** Whether to use ASCII folding prior to encoding. */
  298.     private final boolean folding;

  299.     /**
  300.      * Creates a new instance with ASCII-folding enabled.
  301.      */
  302.     public DaitchMokotoffSoundex() {
  303.         this(true);
  304.     }

  305.     /**
  306.      * Creates a new instance.
  307.      * <p>
  308.      * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
  309.      * รจ -&gt; e.
  310.      * </p>
  311.      *
  312.      * @param folding
  313.      *            if ASCII-folding shall be performed before encoding
  314.      */
  315.     public DaitchMokotoffSoundex(final boolean folding) {
  316.         this.folding = folding;
  317.     }

  318.     /**
  319.      * Performs a cleanup of the input string before the actual soundex transformation.
  320.      * <p>
  321.      * Removes all whitespace characters and performs ASCII folding if enabled.
  322.      * </p>
  323.      *
  324.      * @param input
  325.      *            the input string to cleanup
  326.      * @return a cleaned up string
  327.      */
  328.     private String cleanup(final String input) {
  329.         final StringBuilder sb = new StringBuilder();
  330.         for (char ch : input.toCharArray()) {
  331.             if (Character.isWhitespace(ch)) {
  332.                 continue;
  333.             }

  334.             ch = Character.toLowerCase(ch);
  335.             if (folding && FOLDINGS.containsKey(ch)) {
  336.                 ch = FOLDINGS.get(ch);
  337.             }
  338.             sb.append(ch);
  339.         }
  340.         return sb.toString();
  341.     }

  342.     /**
  343.      * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
  344.      * <p>
  345.      * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
  346.      * EncoderException if the supplied object is not of type java.lang.String.
  347.      * </p>
  348.      *
  349.      * @see #soundex(String)
  350.      *
  351.      * @param obj
  352.      *            Object to encode
  353.      * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
  354.      *         supplied.
  355.      * @throws EncoderException
  356.      *             if the parameter supplied is not of type java.lang.String
  357.      * @throws IllegalArgumentException
  358.      *             if a character is not mapped
  359.      */
  360.     @Override
  361.     public Object encode(final Object obj) throws EncoderException {
  362.         if (!(obj instanceof String)) {
  363.             throw new EncoderException(
  364.                     "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
  365.         }
  366.         return encode((String) obj);
  367.     }

  368.     /**
  369.      * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
  370.      *
  371.      * @see #soundex(String)
  372.      *
  373.      * @param source
  374.      *            A String object to encode
  375.      * @return A DM Soundex code corresponding to the String supplied
  376.      * @throws IllegalArgumentException
  377.      *             if a character is not mapped
  378.      */
  379.     @Override
  380.     public String encode(final String source) {
  381.         if (source == null) {
  382.             return null;
  383.         }
  384.         return soundex(source, false)[0];
  385.     }

  386.     /**
  387.      * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
  388.      * <p>
  389.      * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
  390.      * separated by '|'.
  391.      * </p>
  392.      * <p>
  393.      * Example: the name "AUERBACH" is encoded as both
  394.      * </p>
  395.      * <ul>
  396.      * <li>097400</li>
  397.      * <li>097500</li>
  398.      * </ul>
  399.      * <p>
  400.      * Thus the result will be "097400|097500".
  401.      * </p>
  402.      *
  403.      * @param source
  404.      *            A String object to encode
  405.      * @return A string containing a set of DM Soundex codes corresponding to the String supplied
  406.      * @throws IllegalArgumentException
  407.      *             if a character is not mapped
  408.      */
  409.     public String soundex(final String source) {
  410.         final String[] branches = soundex(source, true);
  411.         final StringBuilder sb = new StringBuilder();
  412.         int index = 0;
  413.         for (final String branch : branches) {
  414.             sb.append(branch);
  415.             if (++index < branches.length) {
  416.                 sb.append('|');
  417.             }
  418.         }
  419.         return sb.toString();
  420.     }

  421.     /**
  422.      * Perform the actual DM Soundex algorithm on the input string.
  423.      *
  424.      * @param source
  425.      *            A String object to encode
  426.      * @param branching
  427.      *            If branching shall be performed
  428.      * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
  429.      *         selected branching mode
  430.      */
  431.     private String[] soundex(final String source, final boolean branching) {
  432.         if (source == null) {
  433.             return null;
  434.         }

  435.         final String input = cleanup(source);

  436.         final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
  437.         currentBranches.add(new Branch());

  438.         char lastChar = '\0';
  439.         for (int index = 0; index < input.length(); index++) {
  440.             final char ch = input.charAt(index);

  441.             // ignore whitespace inside a name
  442.             if (Character.isWhitespace(ch)) {
  443.                 continue;
  444.             }

  445.             final String inputContext = input.substring(index);
  446.             final List<Rule> rules = RULES.get(ch);
  447.             if (rules == null) {
  448.                 continue;
  449.             }

  450.             // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
  451.             @SuppressWarnings("unchecked")
  452.             final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;

  453.             for (final Rule rule : rules) {
  454.                 if (rule.matches(inputContext)) {
  455.                     if (branching) {
  456.                         nextBranches.clear();
  457.                     }
  458.                     final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
  459.                     final boolean branchingRequired = replacements.length > 1 && branching;

  460.                     for (final Branch branch : currentBranches) {
  461.                         for (final String nextReplacement : replacements) {
  462.                             // if we have multiple replacements, always create a new branch
  463.                             final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;

  464.                             // special rule: occurrences of mn or nm are treated differently
  465.                             final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');

  466.                             nextBranch.processNextReplacement(nextReplacement, force);

  467.                             if (branching) {
  468.                                 nextBranches.add(nextBranch);
  469.                             } else {
  470.                                 break;
  471.                             }
  472.                         }
  473.                     }

  474.                     if (branching) {
  475.                         currentBranches.clear();
  476.                         currentBranches.addAll(nextBranches);
  477.                     }
  478.                     index += rule.getPatternLength() - 1;
  479.                     break;
  480.                 }
  481.             }

  482.             lastChar = ch;
  483.         }

  484.         final String[] result = new String[currentBranches.size()];
  485.         int index = 0;
  486.         for (final Branch branch : currentBranches) {
  487.             branch.finish();
  488.             result[index++] = branch.toString();
  489.         }

  490.         return result;
  491.     }
  492. }