View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  
22  import java.util.Arrays;
23  import java.util.HashSet;
24  import java.util.Map;
25  import java.util.TreeMap;
26  import java.util.regex.Pattern;
27  
28  import org.junit.jupiter.api.Test;
29  
30  /**
31   * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
32   */
33  class PhoneticEngineRegressionTest {
34  
35      private static final Pattern COMMA_PATTERN = Pattern.compile(",");
36  
37      /**
38       * This code is similar in style to code found in Solr:
39       * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
40       *
41       * Making a JUnit test out of it to protect Solr from possible future
42       * regressions in Commons-Codec.
43       */
44      private static String encode(final Map<String, String> args, final boolean concat, final String input) {
45          final Languages.LanguageSet languageSet;
46          final PhoneticEngine engine;
47  
48          // PhoneticEngine = NameType + RuleType + concat
49          // we use common-codec's defaults: GENERIC + APPROX + true
50          final String nameTypeArg = args.get("nameType");
51          final NameType nameType = nameTypeArg == null ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
52  
53          final String ruleTypeArg = args.get("ruleType");
54          final RuleType ruleType = ruleTypeArg == null ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
55  
56          engine = new PhoneticEngine(nameType, ruleType, concat);
57  
58          // LanguageSet: defaults to automagic, otherwise a comma-separated list.
59          final String languageSetArg = args.get("languageSet");
60          if (languageSetArg == null || languageSetArg.equals("auto")) {
61              languageSet = null;
62          } else {
63              languageSet = Languages.LanguageSet.from(new HashSet<>(Arrays.asList(COMMA_PATTERN.split(languageSetArg))));
64          }
65  
66          /*
67              org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
68  
69              encoded = (languages == null)
70                  ? engine.encode(termAtt.toString())
71                  : engine.encode(termAtt.toString(), languages);
72  
73              Hence our approach, below:
74          */
75          if (languageSet == null) {
76              return engine.encode(input);
77          }
78          return engine.encode(input, languageSet);
79      }
80  
81      @Test
82      void testCompatibilityWithOriginalVersion() {
83          // see CODEC-187
84          // comparison: https://stevemorse.org/census/soundex.html
85  
86          final Map<String, String> args = new TreeMap<>();
87          args.put("nameType", "GENERIC");
88          args.put("ruleType", "APPROX");
89  
90          assertEquals("Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom",
91              encode(args, true, "abram"));
92          assertEquals("bndzn|bntsn|bnzn|vndzn|vntsn",
93              encode(args, true, "Bendzin"));
94  
95          args.put("nameType", "ASHKENAZI");
96          args.put("ruleType", "APPROX");
97  
98          assertEquals("Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom",
99              encode(args, true, "abram"));
100         assertEquals("YlpYrn|Ylpirn|alpYrn|alpirn|olpYrn|olpirn|xalpirn|xolpirn",
101             encode(args, true, "Halpern"));
102 
103     }
104 
105     @Test
106     void testSolrASHKENAZI() {
107         Map<String, String> args;
108 
109         // concat is true, ruleType is EXACT
110         args = new TreeMap<>();
111         args.put("nameType", "ASHKENAZI");
112         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
113             encode(args, true, "Angelo"));
114         args.put("ruleType", "EXACT");
115         assertEquals("andZelo|angelo|anhelo|anxelo",
116             encode(args, true, "Angelo"));
117         assertEquals("dandZelo|dangelo|danhelo|danxelo",
118             encode(args, true, "D'Angelo"));
119         args.put("languageSet", "italian,greek,spanish");
120         assertEquals("angelo|anxelo",
121             encode(args, true, "Angelo"));
122         assertEquals(encode(args, true, "1234"), "");
123 
124         // concat is false, ruleType is EXACT
125         args = new TreeMap<>();
126         args.put("nameType", "ASHKENAZI");
127         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
128             encode(args, false, "Angelo"));
129         args.put("ruleType", "EXACT");
130         assertEquals("andZelo|angelo|anhelo|anxelo",
131             encode(args, false, "Angelo"));
132         assertEquals("dandZelo|dangelo|danhelo|danxelo",
133             encode(args, false, "D'Angelo"));
134         args.put("languageSet", "italian,greek,spanish");
135         assertEquals("angelo|anxelo",
136             encode(args, false, "Angelo"));
137         assertEquals(encode(args, false, "1234"), "");
138 
139         // concat is true, ruleType is APPROX
140         args = new TreeMap<>();
141         args.put("nameType", "ASHKENAZI");
142         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
143             encode(args, true, "Angelo"));
144         args.put("ruleType", "APPROX");
145         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
146             encode(args, true, "Angelo"));
147         assertEquals("dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo",
148             encode(args, true, "D'Angelo"));
149         args.put("languageSet", "italian,greek,spanish");
150         assertEquals("angilo|anxilo|ongilo|onxilo",
151             encode(args, true, "Angelo"));
152         assertEquals(encode(args, true, "1234"), "");
153 
154         // concat is false, ruleType is APPROX
155         args = new TreeMap<>();
156         args.put("nameType", "ASHKENAZI");
157         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
158             encode(args, false, "Angelo"));
159         args.put("ruleType", "APPROX");
160         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
161             encode(args, false, "Angelo"));
162         assertEquals("dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo",
163             encode(args, false, "D'Angelo"));
164         args.put("languageSet", "italian,greek,spanish");
165         assertEquals("angilo|anxilo|ongilo|onxilo",
166             encode(args, false, "Angelo"));
167         assertEquals(encode(args, false, "1234"), "");
168     }
169 
170     @Test
171     void testSolrGENERIC() {
172         Map<String, String> args;
173 
174         // concat is true, ruleType is EXACT
175         args = new TreeMap<>();
176         args.put("nameType", "GENERIC");
177         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
178             encode(args, true, "Angelo"));
179         args.put("ruleType", "EXACT");
180         assertEquals("anZelo|andZelo|angelo|anhelo|anjelo|anxelo",
181             encode(args, true, "Angelo"));
182         assertEquals("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)",
183             encode(args, true, "D'Angelo"));
184         args.put("languageSet", "italian,greek,spanish");
185         assertEquals("andZelo|angelo|anxelo",
186             encode(args, true, "Angelo"));
187         assertEquals(encode(args, true, "1234"), "");
188 
189         // concat is false, ruleType is EXACT
190         args = new TreeMap<>();
191         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
192             encode(args, false, "Angelo"));
193         args.put("ruleType", "EXACT");
194         assertEquals("anZelo|andZelo|angelo|anhelo|anjelo|anxelo",
195             encode(args, false, "Angelo"));
196         assertEquals("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)",
197             encode(args, false, "D'Angelo"));
198         args.put("languageSet", "italian,greek,spanish");
199         assertEquals("andZelo|angelo|anxelo",
200             encode(args, false, "Angelo"));
201         assertEquals(encode(args, false, "1234"), "");
202 
203         // concat is true, ruleType is APPROX
204         args = new TreeMap<>();
205         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
206             encode(args, true, "Angelo"));
207         args.put("ruleType", "APPROX");
208         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
209             encode(args, true, "Angelo"));
210         assertEquals("(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)",
211             encode(args, true, "D'Angelo"));
212         args.put("languageSet", "italian,greek,spanish");
213         assertEquals("angilo|anxilo|anzilo|ongilo|onxilo|onzilo",
214             encode(args, true, "Angelo"));
215         assertEquals(encode(args, true, "1234"), "");
216 
217         // concat is false, ruleType is APPROX
218         args = new TreeMap<>();
219         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
220             encode(args, false, "Angelo"));
221         args.put("ruleType", "APPROX");
222         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
223             encode(args, false, "Angelo"));
224         assertEquals("(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)",
225             encode(args, false, "D'Angelo"));
226         args.put("languageSet", "italian,greek,spanish");
227         assertEquals("angilo|anxilo|anzilo|ongilo|onxilo|onzilo",
228             encode(args, false, "Angelo"));
229         assertEquals(encode(args, false, "1234"), "");
230     }
231 
232     @Test
233     void testSolrSEPHARDIC() {
234         Map<String, String> args;
235 
236         // concat is true, ruleType is EXACT
237         args = new TreeMap<>();
238         args.put("nameType", "SEPHARDIC");
239         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
240             encode(args, true, "Angelo"));
241         args.put("ruleType", "EXACT");
242         assertEquals("anZelo|andZelo|anxelo",
243             encode(args, true, "Angelo"));
244         assertEquals("anZelo|andZelo|anxelo",
245             encode(args, true, "D'Angelo"));
246         args.put("languageSet", "italian,greek,spanish");
247         assertEquals("andZelo|anxelo",
248             encode(args, true, "Angelo"));
249         assertEquals(encode(args, true, "1234"), "");
250 
251         // concat is false, ruleType is EXACT
252         args = new TreeMap<>();
253         args.put("nameType", "SEPHARDIC");
254         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
255             encode(args, false, "Angelo"));
256         args.put("ruleType", "EXACT");
257         assertEquals("anZelo|andZelo|anxelo",
258             encode(args, false, "Angelo"));
259         assertEquals("danZelo|dandZelo|danxelo",
260             encode(args, false, "D'Angelo"));
261         args.put("languageSet", "italian,greek,spanish");
262         assertEquals("andZelo|anxelo",
263             encode(args, false, "Angelo"));
264         assertEquals(encode(args, false, "1234"), "");
265 
266         // concat is true, ruleType is APPROX
267         args = new TreeMap<>();
268         args.put("nameType", "SEPHARDIC");
269         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
270             encode(args, true, "Angelo"));
271         args.put("ruleType", "APPROX");
272         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
273             encode(args, true, "Angelo"));
274         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
275             encode(args, true, "D'Angelo"));
276         args.put("languageSet", "italian,greek,spanish");
277         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
278             encode(args, true, "Angelo"));
279         assertEquals(encode(args, true, "1234"), "");
280 
281         // concat is false, ruleType is APPROX
282         args = new TreeMap<>();
283         args.put("nameType", "SEPHARDIC");
284         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
285             encode(args, false, "Angelo"));
286         args.put("ruleType", "APPROX");
287         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
288             encode(args, false, "Angelo"));
289         assertEquals("danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu",
290             encode(args, false, "D'Angelo"));
291         args.put("languageSet", "italian,greek,spanish");
292         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
293             encode(args, false, "Angelo"));
294         assertEquals(encode(args, false, "1234"), "");
295     }
296 }