BeiderMorseEncoder.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language.bm;

  18. import org.apache.commons.codec.EncoderException;
  19. import org.apache.commons.codec.StringEncoder;

  20. /**
  21.  * Encodes strings into their Beider-Morse phonetic encoding.
  22.  * <p>
  23.  * Beider-Morse phonetic encodings are optimized for family names. However, they may be useful for a wide range of
  24.  * words.
  25.  * </p>
  26.  * <p>
  27.  * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it may not
  28.  * be thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine}
  29.  * directly.
  30.  * </p>
  31.  * <h2>Encoding overview</h2>
  32.  * <p>
  33.  * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
  34.  * language the word comes from. For example, if it ends in "{@code ault}" then it infers that the word is French.
  35.  * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of
  36.  * letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
  37.  * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly, this
  38.  * language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking into
  39.  * account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
  40.  * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
  41.  * pronounced in several ways in the source language have only one way to represent them in this average phonetic
  42.  * language, so the result is again a set of phonetic spellings.
  43.  * </p>
  44.  * <p>
  45.  * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
  46.  * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
  47.  * Secondly, some names have standard prefixes, for example, "{@code Mac/Mc}" in Scottish (English) names. As
  48.  * sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once
  49.  * with the prefix and once without it. The resulting encoding contains one and then the other result.
  50.  * </p>
  51.  * <h2>Encoding format</h2>
  52.  * <p>
  53.  * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
  54.  * are multiple possible phonetic representations, these are joined with a pipe ({@code |}) character. If multiple
  55.  * hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and
  56.  * these blocks are then joined with hyphens. For example, "{@code d'ortley}" has a possible prefix. The form
  57.  * without prefix encodes to "{@code ortlaj|ortlej}", while the form with prefix encodes to "
  58.  * {@code dortlaj|dortlej}". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
  59.  * </p>
  60.  * <p>
  61.  * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
  62.  * potential phonetic interpretations. For example, "{@code Renault}" encodes to "
  63.  * {@code rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult}". The {@code APPROX} rules will tend to produce larger
  64.  * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
  65.  * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
  66.  * splitting on pipe ({@code |}) and indexing under each of these alternatives.
  67.  * </p>
  68.  * <p>
  69.  * <strong>Note</strong>: this version of the Beider-Morse encoding is equivalent with v3.4 of the reference implementation.
  70.  * </p>
  71.  * <p>
  72.  * This class is Not ThreadSafe.
  73.  * </p>
  74.  *
  75.  * @see <a href="https://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
  76.  * @see <a href="https://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
  77.  * @since 1.6
  78.  */
  79. public class BeiderMorseEncoder implements StringEncoder {

  80.     // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
  81.     // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.

  82.     // a cached object
  83.     private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);

  84.     /**
  85.      * Constructs a new instance.
  86.      */
  87.     public BeiderMorseEncoder() {
  88.         // empty
  89.     }

  90.     @Override
  91.     public Object encode(final Object source) throws EncoderException {
  92.         if (!(source instanceof String)) {
  93.             throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
  94.         }
  95.         return encode((String) source);
  96.     }

  97.     @Override
  98.     public String encode(final String source) throws EncoderException {
  99.         if (source == null) {
  100.             return null;
  101.         }
  102.         return this.engine.encode(source);
  103.     }

  104.     /**
  105.      * Gets the name type currently in operation.
  106.      *
  107.      * @return the NameType currently being used
  108.      */
  109.     public NameType getNameType() {
  110.         return this.engine.getNameType();
  111.     }

  112.     /**
  113.      * Gets the rule type currently in operation.
  114.      *
  115.      * @return the RuleType currently being used
  116.      */
  117.     public RuleType getRuleType() {
  118.         return this.engine.getRuleType();
  119.     }

  120.     /**
  121.      * Discovers if multiple possible encodings are concatenated.
  122.      *
  123.      * @return true if multiple encodings are concatenated, false if just the first one is returned
  124.      */
  125.     public boolean isConcat() {
  126.         return this.engine.isConcat();
  127.     }

  128.     /**
  129.      * Sets how multiple possible phonetic encodings are combined.
  130.      *
  131.      * @param concat
  132.      *            true if multiple encodings are to be combined with a '|', false if just the first one is
  133.      *            to be considered
  134.      */
  135.     public void setConcat(final boolean concat) {
  136.         this.engine = new PhoneticEngine(this.engine.getNameType(),
  137.                                          this.engine.getRuleType(),
  138.                                          concat,
  139.                                          this.engine.getMaxPhonemes());
  140.     }

  141.     /**
  142.      * Sets the number of maximum of phonemes that shall be considered by the engine.
  143.      *
  144.      * @param maxPhonemes
  145.      *            the maximum number of phonemes returned by the engine
  146.      * @since 1.7
  147.      */
  148.     public void setMaxPhonemes(final int maxPhonemes) {
  149.         this.engine = new PhoneticEngine(this.engine.getNameType(),
  150.                                          this.engine.getRuleType(),
  151.                                          this.engine.isConcat(),
  152.                                          maxPhonemes);
  153.     }

  154.     /**
  155.      * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
  156.      * optimized for Ashkenazi or Sephardic Jewish family names.
  157.      *
  158.      * @param nameType
  159.      *            the NameType in use
  160.      */
  161.     public void setNameType(final NameType nameType) {
  162.         this.engine = new PhoneticEngine(nameType,
  163.                                          this.engine.getRuleType(),
  164.                                          this.engine.isConcat(),
  165.                                          this.engine.getMaxPhonemes());
  166.     }

  167.     /**
  168.      * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
  169.      *
  170.      * @param ruleType
  171.      *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
  172.      */
  173.     public void setRuleType(final RuleType ruleType) {
  174.         this.engine = new PhoneticEngine(this.engine.getNameType(),
  175.                                          ruleType,
  176.                                          this.engine.isConcat(),
  177.                                          this.engine.getMaxPhonemes());
  178.     }

  179. }