1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language.bm;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes strings into their Beider-Morse phonetic encoding.
25 * <p>
26 * Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range
27 * of words.
28 * <p>
29 * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it
30 * may not be thread-safe. If you require a guaranteed thread-safe encoding then use
31 * {@link PhoneticEngine} directly.
32 * <p>
33 * <b>Encoding overview</b>
34 * <p>
35 * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
36 * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French.
37 * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some
38 * runs of letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up
39 * into phonemes at different places, so this stage results in a set of possible language-specific phonetic
40 * representations. Lastly, this language-specific phonetic representation is processed by a table of rules that
41 * re-writes it phonetically taking into account systematic pronunciation differences between languages, to move
42 * it towards a pan-indo-european phonetic representation. Again, sometimes there are multiple ways this could be
43 * done and sometimes things that can be pronounced in several ways in the source language have only one way to
44 * represent them in this average phonetic language, so the result is again a set of phonetic spellings.
45 * <p>
46 * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated.
47 * In this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final
48 * encoding. Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English)
49 * names. As sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word
50 * is encoded once with the prefix and once without it. The resulting encoding contains one and then the other result.
51 * <p>
52 * <b>Encoding format</b>
53 * <p>
54 * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where
55 * there are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character.
56 * If multiple hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed
57 * in parentheses and these blocks are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible
58 * prefix. The form without prefix encodes to "<code>ortlaj|ortlej</code>", while the form with prefix encodes to
59 * "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}".
60 * <p>
61 * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
62 * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
63 * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger
64 * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
65 * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
66 * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
67 *
68 * @since 1.6
69 * @version $Id: BeiderMorseEncoder.html 889935 2013-12-11 05:05:13Z ggregory $
70 */
71 public class BeiderMorseEncoder implements StringEncoder {
72 // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
73 // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
74
75 // a cached object
76 private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
77
78 @Override
79 public Object encode(Object source) throws EncoderException {
80 if (!(source instanceof String)) {
81 throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String");
82 }
83 return encode((String) source);
84 }
85
86 @Override
87 public String encode(String source) throws EncoderException {
88 if (source == null) {
89 return null;
90 }
91 return this.engine.encode(source);
92 }
93
94 /**
95 * Gets the name type currently in operation.
96 *
97 * @return the NameType currently being used
98 */
99 public NameType getNameType() {
100 return this.engine.getNameType();
101 }
102
103 /**
104 * Gets the rule type currently in operation.
105 *
106 * @return the RuleType currently being used
107 */
108 public RuleType getRuleType() {
109 return this.engine.getRuleType();
110 }
111
112 /**
113 * Discovers if multiple possible encodings are concatenated.
114 *
115 * @return true if multiple encodings are concatenated, false if just the first one is returned
116 */
117 public boolean isConcat() {
118 return this.engine.isConcat();
119 }
120
121 /**
122 * Sets how multiple possible phonetic encodings are combined.
123 *
124 * @param concat
125 * true if multiple encodings are to be combined with a '|', false if just the first one is
126 * to be considered
127 */
128 public void setConcat(boolean concat) {
129 this.engine = new PhoneticEngine(this.engine.getNameType(),
130 this.engine.getRuleType(),
131 concat,
132 this.engine.getMaxPhonemes());
133 }
134
135 /**
136 * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
137 * optimized for Ashkenazi or Sephardic Jewish family names.
138 *
139 * @param nameType
140 * the NameType in use
141 */
142 public void setNameType(NameType nameType) {
143 this.engine = new PhoneticEngine(nameType,
144 this.engine.getRuleType(),
145 this.engine.isConcat(),
146 this.engine.getMaxPhonemes());
147 }
148
149 /**
150 * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
151 *
152 * @param ruleType
153 * {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
154 */
155 public void setRuleType(RuleType ruleType) {
156 this.engine = new PhoneticEngine(this.engine.getNameType(),
157 ruleType,
158 this.engine.isConcat(),
159 this.engine.getMaxPhonemes());
160 }
161
162 /**
163 * Sets the number of maximum of phonemes that shall be considered by the engine.
164 *
165 * @param maxPhonemes
166 * the maximum number of phonemes returned by the engine
167 * @since 1.7
168 */
169 public void setMaxPhonemes(int maxPhonemes) {
170 this.engine = new PhoneticEngine(this.engine.getNameType(),
171 this.engine.getRuleType(),
172 this.engine.isConcat(),
173 maxPhonemes);
174 }
175
176 }