PhoneticEngine.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language.bm;

  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.EnumMap;
  22. import java.util.HashSet;
  23. import java.util.LinkedHashSet;
  24. import java.util.List;
  25. import java.util.Locale;
  26. import java.util.Map;
  27. import java.util.Objects;
  28. import java.util.Set;
  29. import java.util.TreeMap;
  30. import java.util.stream.Collectors;

  31. import org.apache.commons.codec.language.bm.Languages.LanguageSet;
  32. import org.apache.commons.codec.language.bm.Rule.Phoneme;

  33. /**
  34.  * Converts words into potential phonetic representations.
  35.  * <p>
  36.  * This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
  37.  * into account the likely source language. Next, this phonetic representation is converted into a
  38.  * pan-European 'average' representation, allowing comparison between different versions of essentially
  39.  * the same word from different languages.
  40.  * </p>
  41.  * <p>
  42.  * This class is intentionally immutable and thread-safe.
  43.  * If you wish to alter the settings for a PhoneticEngine, you
  44.  * must make a new one with the updated settings.
  45.  * </p>
  46.  * <p>
  47.  * Ported from phoneticengine.php
  48.  * </p>
  49.  *
  50.  * @since 1.6
  51.  */
  52. public class PhoneticEngine {

  53.     /**
  54.      * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
  55.      * this package, and probably not outside the {@link PhoneticEngine} class.
  56.      *
  57.      * @since 1.6
  58.      */
  59.     static final class PhonemeBuilder {

  60.         /**
  61.          * An empty builder where all phonemes must come from some set of languages. This will contain a single
  62.          * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
  63.          * phoneme from scratch.
  64.          *
  65.          * @param languages the set of languages
  66.          * @return  a new, empty phoneme builder
  67.          */
  68.         public static PhonemeBuilder empty(final Languages.LanguageSet languages) {
  69.             return new PhonemeBuilder(new Rule.Phoneme("", languages));
  70.         }

  71.         private final Set<Rule.Phoneme> phonemes;

  72.         private PhonemeBuilder(final Rule.Phoneme phoneme) {
  73.             this.phonemes = new LinkedHashSet<>();
  74.             this.phonemes.add(phoneme);
  75.         }

  76.         private PhonemeBuilder(final Set<Rule.Phoneme> phonemes) {
  77.             this.phonemes = phonemes;
  78.         }

  79.         /**
  80.          * Creates a new phoneme builder containing all phonemes in this one extended by {@code str}.
  81.          *
  82.          * @param str   the characters to append to the phonemes
  83.          */
  84.         public void append(final CharSequence str) {
  85.             phonemes.forEach(ph -> ph.append(str));
  86.         }

  87.         /**
  88.          * Applies the given phoneme expression to all phonemes in this phoneme builder.
  89.          * <p>
  90.          * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
  91.          * incompatible.
  92.          * </p>
  93.          *
  94.          * @param phonemeExpr   the expression to apply
  95.          * @param maxPhonemes   the maximum number of phonemes to build up
  96.          */
  97.         public void apply(final Rule.PhonemeExpr phonemeExpr, final int maxPhonemes) {
  98.             final Set<Rule.Phoneme> newPhonemes = new LinkedHashSet<>(Math.min(phonemes.size() * phonemeExpr.size(), maxPhonemes));
  99.             EXPR: for (final Rule.Phoneme left : phonemes) {
  100.                 for (final Rule.Phoneme right : phonemeExpr.getPhonemes()) {
  101.                     final LanguageSet languages = left.getLanguages().restrictTo(right.getLanguages());
  102.                     if (!languages.isEmpty()) {
  103.                         final Rule.Phoneme join = new Phoneme(left, right, languages);
  104.                         if (newPhonemes.size() < maxPhonemes) {
  105.                             newPhonemes.add(join);
  106.                             if (newPhonemes.size() >= maxPhonemes) {
  107.                                 break EXPR;
  108.                             }
  109.                         }
  110.                     }
  111.                 }
  112.             }
  113.             phonemes.clear();
  114.             phonemes.addAll(newPhonemes);
  115.         }

  116.         /**
  117.          * Gets underlying phoneme set. Please don't mutate.
  118.          *
  119.          * @return  the phoneme set
  120.          */
  121.         public Set<Rule.Phoneme> getPhonemes() {
  122.             return phonemes;
  123.         }

  124.         /**
  125.          * Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
  126.          * joined with a pipe. This is explicitly provided in place of toString as it is a potentially
  127.          * expensive operation, which should be avoided when debugging.
  128.          *
  129.          * @return  the stringified phoneme set
  130.          */
  131.         public String makeString() {
  132.             return phonemes.stream().map(Rule.Phoneme::getPhonemeText).collect(Collectors.joining("|"));
  133.         }
  134.     }

  135.     /**
  136.      * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
  137.      * After invocation, the values {@code i} and {@code found} are updated. {@code i} points to the
  138.      * index of the next char in {@code input} that must be processed next (the input up to that index having been
  139.      * processed already), and {@code found} indicates if a matching rule was found or not. In the case where a
  140.      * matching rule was found, {@code phonemeBuilder} is replaced with a new builder containing the phonemes
  141.      * updated by the matching rule.
  142.      * <p>
  143.      * Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
  144.      * as it is constructed as needed by the calling methods.
  145.      * </p>
  146.      *
  147.      * @since 1.6
  148.      */
  149.     private static final class RulesApplication {

  150.         private final Map<String, List<Rule>> finalRules;
  151.         private final CharSequence input;
  152.         private final PhonemeBuilder phonemeBuilder;
  153.         private int i;
  154.         private final int maxPhonemes;
  155.         private boolean found;

  156.         RulesApplication(final Map<String, List<Rule>> finalRules, final CharSequence input, final PhonemeBuilder phonemeBuilder, final int i,
  157.                 final int maxPhonemes) {
  158.             Objects.requireNonNull(finalRules, "finalRules");
  159.             this.finalRules = finalRules;
  160.             this.phonemeBuilder = phonemeBuilder;
  161.             this.input = input;
  162.             this.i = i;
  163.             this.maxPhonemes = maxPhonemes;
  164.         }

  165.         public int getI() {
  166.             return i;
  167.         }

  168.         public PhonemeBuilder getPhonemeBuilder() {
  169.             return phonemeBuilder;
  170.         }

  171.         /**
  172.          * Invokes the rules. Loops over the rules list, stopping at the first one that has a matching context
  173.          * and pattern. Then applies this rule to the phoneme builder to produce updated phonemes. If there was no
  174.          * match, {@code i} is advanced one and the character is silently dropped from the phonetic spelling.
  175.          *
  176.          * @return {@code this}
  177.          */
  178.         public RulesApplication invoke() {
  179.             found = false;
  180.             int patternLength = 1;
  181.             final List<Rule> rules = finalRules.get(input.subSequence(i, i + patternLength));
  182.             if (rules != null) {
  183.                 for (final Rule rule : rules) {
  184.                     final String pattern = rule.getPattern();
  185.                     patternLength = pattern.length();
  186.                     if (rule.patternAndContextMatches(input, i)) {
  187.                         phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes);
  188.                         found = true;
  189.                         break;
  190.                     }
  191.                 }
  192.             }

  193.             if (!found) {
  194.                 patternLength = 1;
  195.             }

  196.             i += patternLength;
  197.             return this;
  198.         }

  199.         public boolean isFound() {
  200.             return found;
  201.         }
  202.     }

  203.     private static final int DEFAULT_MAX_PHONEMES = 20;

  204.     private static final Map<NameType, Set<String>> NAME_PREFIXES = new EnumMap<>(NameType.class);

  205.     static {
  206.         NAME_PREFIXES.put(NameType.ASHKENAZI,
  207.                 Collections.unmodifiableSet(
  208.                         new HashSet<>(Arrays.asList("bar", "ben", "da", "de", "van", "von"))));
  209.         NAME_PREFIXES.put(NameType.SEPHARDIC,
  210.                 Collections.unmodifiableSet(
  211.                         new HashSet<>(Arrays.asList("al", "el", "da", "dal", "de", "del", "dela", "de la",
  212.                                                           "della", "des", "di", "do", "dos", "du", "van", "von"))));
  213.         NAME_PREFIXES.put(NameType.GENERIC,
  214.                 Collections.unmodifiableSet(
  215.                         new HashSet<>(Arrays.asList("da", "dal", "de", "del", "dela", "de la", "della",
  216.                                                           "des", "di", "do", "dos", "du", "van", "von"))));
  217.     }

  218.     /**
  219.      * Joins some strings with an internal separator.
  220.      *
  221.      * @param strings   Strings to join
  222.      * @param sep       String to separate them with
  223.      * @return a single String consisting of each element of {@code strings} interleaved by {@code sep}
  224.      */
  225.     private static String join(final List<String> strings, final String sep) {
  226.         return strings.stream().collect(Collectors.joining(sep));
  227.     }

  228.     private final Lang lang;

  229.     private final NameType nameType;

  230.     private final RuleType ruleType;

  231.     private final boolean concat;

  232.     private final int maxPhonemes;

  233.     /**
  234.      * Generates a new, fully-configured phonetic engine.
  235.      *
  236.      * @param nameType
  237.      *            the type of names it will use
  238.      * @param ruleType
  239.      *            the type of rules it will apply
  240.      * @param concatenate
  241.      *            if it will concatenate multiple encodings
  242.      */
  243.     public PhoneticEngine(final NameType nameType, final RuleType ruleType, final boolean concatenate) {
  244.         this(nameType, ruleType, concatenate, DEFAULT_MAX_PHONEMES);
  245.     }

  246.     /**
  247.      * Generates a new, fully-configured phonetic engine.
  248.      *
  249.      * @param nameType
  250.      *            the type of names it will use
  251.      * @param ruleType
  252.      *            the type of rules it will apply
  253.      * @param concatenate
  254.      *            if it will concatenate multiple encodings
  255.      * @param maxPhonemes
  256.      *            the maximum number of phonemes that will be handled
  257.      * @since 1.7
  258.      */
  259.     public PhoneticEngine(final NameType nameType, final RuleType ruleType, final boolean concatenate, final int maxPhonemes) {
  260.         if (ruleType == RuleType.RULES) {
  261.             throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES);
  262.         }
  263.         this.nameType = nameType;
  264.         this.ruleType = ruleType;
  265.         this.concat = concatenate;
  266.         this.lang = Lang.instance(nameType);
  267.         this.maxPhonemes = maxPhonemes;
  268.     }

  269.     /**
  270.      * Applies the final rules to convert from a language-specific phonetic representation to a
  271.      * language-independent representation.
  272.      *
  273.      * @param phonemeBuilder the current phonemes
  274.      * @param finalRules the final rules to apply
  275.      * @return the resulting phonemes
  276.      */
  277.     private PhonemeBuilder applyFinalRules(final PhonemeBuilder phonemeBuilder,
  278.             final Map<String, List<Rule>> finalRules) {
  279.         Objects.requireNonNull(finalRules, "finalRules");
  280.         if (finalRules.isEmpty()) {
  281.             return phonemeBuilder;
  282.         }

  283.         final Map<Rule.Phoneme, Rule.Phoneme> phonemes = new TreeMap<>(Rule.Phoneme.COMPARATOR);

  284.         phonemeBuilder.getPhonemes().forEach(phoneme -> {
  285.             PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
  286.             final String phonemeText = phoneme.getPhonemeText().toString();

  287.             for (int i = 0; i < phonemeText.length();) {
  288.                 final RulesApplication rulesApplication = new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke();
  289.                 final boolean found = rulesApplication.isFound();
  290.                 subBuilder = rulesApplication.getPhonemeBuilder();

  291.                 if (!found) {
  292.                     // not found, appending as-is
  293.                     subBuilder.append(phonemeText.subSequence(i, i + 1));
  294.                 }

  295.                 i = rulesApplication.getI();
  296.             }

  297.             // the phonemes map orders the phonemes only based on their text, but ignores the language set
  298.             // when adding new phonemes, check for equal phonemes and merge their language set, otherwise
  299.             // phonemes with the same text but different language set get lost
  300.             subBuilder.getPhonemes().forEach(newPhoneme -> {
  301.                 if (phonemes.containsKey(newPhoneme)) {
  302.                     final Rule.Phoneme oldPhoneme = phonemes.remove(newPhoneme);
  303.                     final Rule.Phoneme mergedPhoneme = oldPhoneme.mergeWithLanguage(newPhoneme.getLanguages());
  304.                     phonemes.put(mergedPhoneme, mergedPhoneme);
  305.                 } else {
  306.                     phonemes.put(newPhoneme, newPhoneme);
  307.                 }
  308.             });
  309.         });

  310.         return new PhonemeBuilder(phonemes.keySet());
  311.     }

  312.     /**
  313.      * Encodes a string to its phonetic representation.
  314.      *
  315.      * @param input
  316.      *            the String to encode
  317.      * @return the encoding of the input
  318.      */
  319.     public String encode(final String input) {
  320.         final Languages.LanguageSet languageSet = this.lang.guessLanguages(input);
  321.         return encode(input, languageSet);
  322.     }

  323.     /**
  324.      * Encodes an input string into an output phonetic representation, given a set of possible origin languages.
  325.      *
  326.      * @param input
  327.      *            String to phoneticise; a String with dashes or spaces separating each word
  328.      * @param languageSet
  329.      *            set of possible origin languages
  330.      * @return a phonetic representation of the input; a String containing '-'-separated phonetic representations of the
  331.      *         input
  332.      */
  333.     public String encode(String input, final Languages.LanguageSet languageSet) {
  334.         final Map<String, List<Rule>> rules = Rule.getInstanceMap(this.nameType, RuleType.RULES, languageSet);
  335.         // rules common across many (all) languages
  336.         final Map<String, List<Rule>> finalRules1 = Rule.getInstanceMap(this.nameType, this.ruleType, "common");
  337.         // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
  338.         final Map<String, List<Rule>> finalRules2 = Rule.getInstanceMap(this.nameType, this.ruleType, languageSet);

  339.         // tidy the input
  340.         // lower case is a locale-dependent operation
  341.         input = input.toLowerCase(Locale.ENGLISH).replace('-', ' ').trim();

  342.         if (this.nameType == NameType.GENERIC) {
  343.             if (input.startsWith("d'")) { // check for d'
  344.                 final String remainder = input.substring(2);
  345.                 final String combined = "d" + remainder;
  346.                 return "(" + encode(remainder) + ")-(" + encode(combined) + ")";
  347.             }
  348.             for (final String l : NAME_PREFIXES.get(this.nameType)) {
  349.                 // handle generic prefixes
  350.                 if (input.startsWith(l + " ")) {
  351.                     // check for any prefix in the words list
  352.                     final String remainder = input.substring(l.length() + 1); // input without the prefix
  353.                     final String combined = l + remainder; // input with prefix without space
  354.                     return "(" + encode(remainder) + ")-(" + encode(combined) + ")";
  355.                 }
  356.             }
  357.         }

  358.         final List<String> words = Arrays.asList(input.split("\\s+"));
  359.         final List<String> words2 = new ArrayList<>();

  360.         // special-case handling of word prefixes based upon the name type
  361.         switch (this.nameType) {
  362.         case SEPHARDIC:
  363.             words.forEach(aWord -> {
  364.                 final String[] parts = aWord.split("'", -1);
  365.                 words2.add(parts[parts.length - 1]);
  366.             });
  367.             words2.removeAll(NAME_PREFIXES.get(this.nameType));
  368.             break;
  369.         case ASHKENAZI:
  370.             words2.addAll(words);
  371.             words2.removeAll(NAME_PREFIXES.get(this.nameType));
  372.             break;
  373.         case GENERIC:
  374.             words2.addAll(words);
  375.             break;
  376.         default:
  377.             throw new IllegalStateException("Unreachable case: " + this.nameType);
  378.         }

  379.         if (this.concat) {
  380.             // concat mode enabled
  381.             input = join(words2, " ");
  382.         } else if (words2.size() == 1) {
  383.             // not a multi-word name
  384.             input = words.iterator().next();
  385.         } else if (!words2.isEmpty()) {
  386.             // encode each word in a multi-word name separately (normally used for approx matches)
  387.             final StringBuilder result = new StringBuilder();
  388.             words2.forEach(word -> result.append("-").append(encode(word)));
  389.             // return the result without the leading "-"
  390.             return result.substring(1);
  391.         }

  392.         PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet);

  393.         // loop over each char in the input - we will handle the increment manually
  394.         for (int i = 0; i < input.length();) {
  395.             final RulesApplication rulesApplication =
  396.                     new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).invoke();
  397.             i = rulesApplication.getI();
  398.             phonemeBuilder = rulesApplication.getPhonemeBuilder();
  399.         }

  400.         // Apply the general rules
  401.         phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1);
  402.         // Apply the language-specific rules
  403.         phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2);

  404.         return phonemeBuilder.makeString();
  405.     }

  406.     /**
  407.      * Gets the Lang language guessing rules being used.
  408.      *
  409.      * @return the Lang in use
  410.      */
  411.     public Lang getLang() {
  412.         return this.lang;
  413.     }

  414.     /**
  415.      * Gets the maximum number of phonemes the engine will calculate for a given input.
  416.      *
  417.      * @return the maximum number of phonemes
  418.      * @since 1.7
  419.      */
  420.     public int getMaxPhonemes() {
  421.         return this.maxPhonemes;
  422.     }

  423.     /**
  424.      * Gets the NameType being used.
  425.      *
  426.      * @return the NameType in use
  427.      */
  428.     public NameType getNameType() {
  429.         return this.nameType;
  430.     }

  431.     /**
  432.      * Gets the RuleType being used.
  433.      *
  434.      * @return the RuleType in use
  435.      */
  436.     public RuleType getRuleType() {
  437.         return this.ruleType;
  438.     }

  439.     /**
  440.      * Gets if multiple phonetic encodings are concatenated or if just the first one is kept.
  441.      *
  442.      * @return true if multiple phonetic encodings are returned, false if just the first is
  443.      */
  444.     public boolean isConcat() {
  445.         return this.concat;
  446.     }
  447. }