001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language.bm;
019    
020    import org.apache.commons.codec.EncoderException;
021    import org.apache.commons.codec.StringEncoder;
022    
023    /**
024     * Encodes strings into their Beider-Morse phonetic encoding.
025     * <p>
026     * Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range
027     * of words.
028     * <p>
029     * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it
030     * may not be thread-safe. If you require a guaranteed thread-safe encoding then use
031     * {@link PhoneticEngine} directly.
032     * <p>
033     * <b>Encoding overview</b>
034     * <p>
035     * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
036     * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French.
037     * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some
038     * runs of letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up
039     * into phonemes at different places, so this stage results in a set of possible language-specific phonetic
040     * representations. Lastly, this language-specific phonetic representation is processed by a table of rules that
041     * re-writes it phonetically taking into account systematic pronunciation differences between languages, to move
042     * it towards a pan-indo-european phonetic representation. Again, sometimes there are multiple ways this could be
043     * done and sometimes things that can be pronounced in several ways in the source language have only one way to
044     * represent them in this average phonetic language, so the result is again a set of phonetic spellings.
045     * <p>
046     * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated.
047     * In this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final
048     * encoding. Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English)
049     * names. As sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word
050     * is encoded once with the prefix and once without it. The resulting encoding contains one and then the other result.
051     * <p>
052     * <b>Encoding format</b>
053     * <p>
054     * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where
055     * there are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character.
056     * If multiple hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed
057     * in parentheses and these blocks are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible
058     * prefix. The form without prefix encodes to "<code>ortlaj|ortlej</code>", while the form with prefix encodes to
059     * "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
060     * <p>
061     * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
062     * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
063     * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger
064     * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
065     * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
066     * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
067     *
068     * @since 1.6
069     * @version $Id: BeiderMorseEncoder.html 889935 2013-12-11 05:05:13Z ggregory $
070     */
071    public class BeiderMorseEncoder implements StringEncoder {
072        // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
073        // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
074    
075        // a cached object
076        private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
077    
078        @Override
079        public Object encode(Object source) throws EncoderException {
080            if (!(source instanceof String)) {
081                throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
082            }
083            return encode((String) source);
084        }
085    
086        @Override
087        public String encode(String source) throws EncoderException {
088            if (source == null) {
089                return null;
090            }
091            return this.engine.encode(source);
092        }
093    
094        /**
095         * Gets the name type currently in operation.
096         *
097         * @return the NameType currently being used
098         */
099        public NameType getNameType() {
100            return this.engine.getNameType();
101        }
102    
103        /**
104         * Gets the rule type currently in operation.
105         *
106         * @return the RuleType currently being used
107         */
108        public RuleType getRuleType() {
109            return this.engine.getRuleType();
110        }
111    
112        /**
113         * Discovers if multiple possible encodings are concatenated.
114         *
115         * @return true if multiple encodings are concatenated, false if just the first one is returned
116         */
117        public boolean isConcat() {
118            return this.engine.isConcat();
119        }
120    
121        /**
122         * Sets how multiple possible phonetic encodings are combined.
123         *
124         * @param concat
125         *            true if multiple encodings are to be combined with a '|', false if just the first one is
126         *            to be considered
127         */
128        public void setConcat(boolean concat) {
129            this.engine = new PhoneticEngine(this.engine.getNameType(),
130                                             this.engine.getRuleType(),
131                                             concat,
132                                             this.engine.getMaxPhonemes());
133        }
134    
135        /**
136         * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
137         * optimized for Ashkenazi or Sephardic Jewish family names.
138         *
139         * @param nameType
140         *            the NameType in use
141         */
142        public void setNameType(NameType nameType) {
143            this.engine = new PhoneticEngine(nameType,
144                                             this.engine.getRuleType(),
145                                             this.engine.isConcat(),
146                                             this.engine.getMaxPhonemes());
147        }
148    
149        /**
150         * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
151         *
152         * @param ruleType
153         *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
154         */
155        public void setRuleType(RuleType ruleType) {
156            this.engine = new PhoneticEngine(this.engine.getNameType(),
157                                             ruleType,
158                                             this.engine.isConcat(),
159                                             this.engine.getMaxPhonemes());
160        }
161    
162        /**
163         * Sets the number of maximum of phonemes that shall be considered by the engine.
164         *
165         * @param maxPhonemes
166         *            the maximum number of phonemes returned by the engine
167         * @since 1.7
168         */
169        public void setMaxPhonemes(int maxPhonemes) {
170            this.engine = new PhoneticEngine(this.engine.getNameType(),
171                                             this.engine.getRuleType(),
172                                             this.engine.isConcat(),
173                                             maxPhonemes);
174        }
175    
176    }