Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes strings into their Beider-Morse phonetic encoding.
025 * <p>
026 * Beider-Morse phonetic encodings are optimized for family names. However, they may be useful for a wide range of
027 * words.
028 * </p>
029 * <p>
030 * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable,
031 * and may not be thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine}
032 * directly.
033 * </p>
034 * <h2>Encoding overview</h2>
035 * <p>
036 * Beider-Morse phonetic encodings is a multi-step process. Firstly, a table of rules is consulted to guess what
037 * language the word comes from. For example, if it ends in "{@code ault}" then it infers that the word is French.
038 * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of
039 * letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
040 * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly, this
041 * language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking into
042 * account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
043 * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
044 * pronounced in several ways in the source language have only one way to represent them in this average phonetic
045 * language, so the result is again a set of phonetic spellings.
046 * </p>
047 * <p>
048 * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
049 * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
050 * Secondly, some names have standard prefixes, for example, "{@code Mac/Mc}" in Scottish (English) names. As
051 * sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once
052 * with the prefix and once without it. The resulting encoding contains one and then the other result.
053 * </p>
054 * <h2>Encoding format</h2>
055 * <p>
056 * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
057 * are multiple possible phonetic representations, these are joined with a pipe ({@code |}) character. If multiple
058 * hyphenated words where found, or if the word may contain a name prefix, each encoded word is placed in ellipses and
059 * these blocks are then joined with hyphens. For example, "{@code d'ortley}" has a possible prefix. The form
060 * without prefix encodes to "{@code ortlaj|ortlej}", while the form with prefix encodes to "
061 * {@code dortlaj|dortlej}". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
062 * </p>
063 * <p>
064 * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
065 * potential phonetic interpretations. For example, "{@code Renault}" encodes to "
066 * {@code rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult}". The {@code APPROX} rules will tend to produce larger
067 * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
068 * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
069 * splitting on pipe ({@code |}) and indexing under each of these alternatives.
070 * </p>
071 * <p>
072 * <strong>Note</strong>: this version of the Beider-Morse encoding is equivalent with v3.4 of the reference implementation.
073 * </p>
074 * <p>
075 * This class is Not ThreadSafe.
076 * </p>
077 *
078 * @see <a href="https://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
079 * @see <a href="https://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
080 * @since 1.6
081 */
082public class BeiderMorseEncoder implements StringEncoder {
083
084    // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
085    // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
086
087    // a cached object
088    private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
089
090    /**
091     * Constructs a new instance.
092     */
093    public BeiderMorseEncoder() {
094        // empty
095    }
096
097    @Override
098    public Object encode(final Object source) throws EncoderException {
099        if (!(source instanceof String)) {
100            throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
101        }
102        return encode((String) source);
103    }
104
105    @Override
106    public String encode(final String source) throws EncoderException {
107        if (source == null) {
108            return null;
109        }
110        return this.engine.encode(source);
111    }
112
113    /**
114     * Gets the name type currently in operation.
115     *
116     * @return the NameType currently being used.
117     */
118    public NameType getNameType() {
119        return this.engine.getNameType();
120    }
121
122    /**
123     * Gets the rule type currently in operation.
124     *
125     * @return the RuleType currently being used.
126     */
127    public RuleType getRuleType() {
128        return this.engine.getRuleType();
129    }
130
131    /**
132     * Discovers if multiple possible encodings are concatenated.
133     *
134     * @return true if multiple encodings are concatenated, false if just the first one is returned.
135     */
136    public boolean isConcat() {
137        return this.engine.isConcat();
138    }
139
140    /**
141     * Sets how multiple possible phonetic encodings are combined.
142     *
143     * @param concat
144     *            true if multiple encodings are to be combined with a '|', false if just the first one is
145     *            to be considered.
146     */
147    public void setConcat(final boolean concat) {
148        this.engine = new PhoneticEngine(this.engine.getNameType(),
149                                         this.engine.getRuleType(),
150                                         concat,
151                                         this.engine.getMaxPhonemes());
152    }
153
154    /**
155     * Sets the number of maximum of phonemes that shall be considered by the engine.
156     *
157     * @param maxPhonemes
158     *            the maximum number of phonemes returned by the engine.
159     * @since 1.7
160     */
161    public void setMaxPhonemes(final int maxPhonemes) {
162        this.engine = new PhoneticEngine(this.engine.getNameType(),
163                                         this.engine.getRuleType(),
164                                         this.engine.isConcat(),
165                                         maxPhonemes);
166    }
167
168    /**
169     * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
170     * optimized for Ashkenazi or Sephardic Jewish family names.
171     *
172     * @param nameType
173     *            the NameType in use.
174     */
175    public void setNameType(final NameType nameType) {
176        this.engine = new PhoneticEngine(nameType,
177                                         this.engine.getRuleType(),
178                                         this.engine.isConcat(),
179                                         this.engine.getMaxPhonemes());
180    }
181
182    /**
183     * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
184     *
185     * @param ruleType
186     *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches.
187     */
188    public void setRuleType(final RuleType ruleType) {
189        this.engine = new PhoneticEngine(this.engine.getNameType(),
190                                         ruleType,
191                                         this.engine.isConcat(),
192                                         this.engine.getMaxPhonemes());
193    }
194
195}