View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  
22  import java.util.Arrays;
23  import java.util.HashSet;
24  import java.util.Map;
25  import java.util.TreeMap;
26  
27  import org.junit.jupiter.api.Test;
28  
29  /**
30   * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
31   */
32  public class PhoneticEngineRegressionTest {
33  
34      /**
35       * This code is similar in style to code found in Solr:
36       * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
37       *
38       * Making a JUnit test out of it to protect Solr from possible future
39       * regressions in Commons-Codec.
40       */
41      private static String encode(final Map<String, String> args, final boolean concat, final String input) {
42          final Languages.LanguageSet languageSet;
43          final PhoneticEngine engine;
44  
45          // PhoneticEngine = NameType + RuleType + concat
46          // we use common-codec's defaults: GENERIC + APPROX + true
47          final String nameTypeArg = args.get("nameType");
48          final NameType nameType = nameTypeArg == null ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
49  
50          final String ruleTypeArg = args.get("ruleType");
51          final RuleType ruleType = ruleTypeArg == null ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
52  
53          engine = new PhoneticEngine(nameType, ruleType, concat);
54  
55          // LanguageSet: defaults to automagic, otherwise a comma-separated list.
56          final String languageSetArg = args.get("languageSet");
57          if (languageSetArg == null || languageSetArg.equals("auto")) {
58              languageSet = null;
59          } else {
60              languageSet = Languages.LanguageSet.from(new HashSet<>(Arrays.asList(languageSetArg.split(","))));
61          }
62  
63          /*
64              org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
65  
66              encoded = (languages == null)
67                  ? engine.encode(termAtt.toString())
68                  : engine.encode(termAtt.toString(), languages);
69  
70              Hence our approach, below:
71          */
72          if (languageSet == null) {
73              return engine.encode(input);
74          }
75          return engine.encode(input, languageSet);
76      }
77  
78      @Test
79      public void testCompatibilityWithOriginalVersion() {
80          // see CODEC-187
81          // comparison: https://stevemorse.org/census/soundex.html
82  
83          final Map<String, String> args = new TreeMap<>();
84          args.put("nameType", "GENERIC");
85          args.put("ruleType", "APPROX");
86  
87          assertEquals("Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom",
88              encode(args, true, "abram"));
89          assertEquals("bndzn|bntsn|bnzn|vndzn|vntsn",
90              encode(args, true, "Bendzin"));
91  
92          args.put("nameType", "ASHKENAZI");
93          args.put("ruleType", "APPROX");
94  
95          assertEquals("Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom",
96              encode(args, true, "abram"));
97          assertEquals("YlpYrn|Ylpirn|alpYrn|alpirn|olpYrn|olpirn|xalpirn|xolpirn",
98              encode(args, true, "Halpern"));
99  
100     }
101 
102     @Test
103     public void testSolrASHKENAZI() {
104         Map<String, String> args;
105 
106         // concat is true, ruleType is EXACT
107         args = new TreeMap<>();
108         args.put("nameType", "ASHKENAZI");
109         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
110             encode(args, true, "Angelo"));
111         args.put("ruleType", "EXACT");
112         assertEquals("andZelo|angelo|anhelo|anxelo",
113             encode(args, true, "Angelo"));
114         assertEquals("dandZelo|dangelo|danhelo|danxelo",
115             encode(args, true, "D'Angelo"));
116         args.put("languageSet", "italian,greek,spanish");
117         assertEquals("angelo|anxelo",
118             encode(args, true, "Angelo"));
119         assertEquals(encode(args, true, "1234"), "");
120 
121         // concat is false, ruleType is EXACT
122         args = new TreeMap<>();
123         args.put("nameType", "ASHKENAZI");
124         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
125             encode(args, false, "Angelo"));
126         args.put("ruleType", "EXACT");
127         assertEquals("andZelo|angelo|anhelo|anxelo",
128             encode(args, false, "Angelo"));
129         assertEquals("dandZelo|dangelo|danhelo|danxelo",
130             encode(args, false, "D'Angelo"));
131         args.put("languageSet", "italian,greek,spanish");
132         assertEquals("angelo|anxelo",
133             encode(args, false, "Angelo"));
134         assertEquals(encode(args, false, "1234"), "");
135 
136         // concat is true, ruleType is APPROX
137         args = new TreeMap<>();
138         args.put("nameType", "ASHKENAZI");
139         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
140             encode(args, true, "Angelo"));
141         args.put("ruleType", "APPROX");
142         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
143             encode(args, true, "Angelo"));
144         assertEquals("dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo",
145             encode(args, true, "D'Angelo"));
146         args.put("languageSet", "italian,greek,spanish");
147         assertEquals("angilo|anxilo|ongilo|onxilo",
148             encode(args, true, "Angelo"));
149         assertEquals(encode(args, true, "1234"), "");
150 
151         // concat is false, ruleType is APPROX
152         args = new TreeMap<>();
153         args.put("nameType", "ASHKENAZI");
154         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
155             encode(args, false, "Angelo"));
156         args.put("ruleType", "APPROX");
157         assertEquals("YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo",
158             encode(args, false, "Angelo"));
159         assertEquals("dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo",
160             encode(args, false, "D'Angelo"));
161         args.put("languageSet", "italian,greek,spanish");
162         assertEquals("angilo|anxilo|ongilo|onxilo",
163             encode(args, false, "Angelo"));
164         assertEquals(encode(args, false, "1234"), "");
165     }
166 
167     @Test
168     public void testSolrGENERIC() {
169         Map<String, String> args;
170 
171         // concat is true, ruleType is EXACT
172         args = new TreeMap<>();
173         args.put("nameType", "GENERIC");
174         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
175             encode(args, true, "Angelo"));
176         args.put("ruleType", "EXACT");
177         assertEquals("anZelo|andZelo|angelo|anhelo|anjelo|anxelo",
178             encode(args, true, "Angelo"));
179         assertEquals("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)",
180             encode(args, true, "D'Angelo"));
181         args.put("languageSet", "italian,greek,spanish");
182         assertEquals("andZelo|angelo|anxelo",
183             encode(args, true, "Angelo"));
184         assertEquals(encode(args, true, "1234"), "");
185 
186         // concat is false, ruleType is EXACT
187         args = new TreeMap<>();
188         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
189             encode(args, false, "Angelo"));
190         args.put("ruleType", "EXACT");
191         assertEquals("anZelo|andZelo|angelo|anhelo|anjelo|anxelo",
192             encode(args, false, "Angelo"));
193         assertEquals("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)",
194             encode(args, false, "D'Angelo"));
195         args.put("languageSet", "italian,greek,spanish");
196         assertEquals("andZelo|angelo|anxelo",
197             encode(args, false, "Angelo"));
198         assertEquals(encode(args, false, "1234"), "");
199 
200         // concat is true, ruleType is APPROX
201         args = new TreeMap<>();
202         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
203             encode(args, true, "Angelo"));
204         args.put("ruleType", "APPROX");
205         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
206             encode(args, true, "Angelo"));
207         assertEquals("(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)",
208             encode(args, true, "D'Angelo"));
209         args.put("languageSet", "italian,greek,spanish");
210         assertEquals("angilo|anxilo|anzilo|ongilo|onxilo|onzilo",
211             encode(args, true, "Angelo"));
212         assertEquals(encode(args, true, "1234"), "");
213 
214         // concat is false, ruleType is APPROX
215         args = new TreeMap<>();
216         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
217             encode(args, false, "Angelo"));
218         args.put("ruleType", "APPROX");
219         assertEquals("YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo",
220             encode(args, false, "Angelo"));
221         assertEquals("(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)",
222             encode(args, false, "D'Angelo"));
223         args.put("languageSet", "italian,greek,spanish");
224         assertEquals("angilo|anxilo|anzilo|ongilo|onxilo|onzilo",
225             encode(args, false, "Angelo"));
226         assertEquals(encode(args, false, "1234"), "");
227     }
228 
229     @Test
230     public void testSolrSEPHARDIC() {
231         Map<String, String> args;
232 
233         // concat is true, ruleType is EXACT
234         args = new TreeMap<>();
235         args.put("nameType", "SEPHARDIC");
236         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
237             encode(args, true, "Angelo"));
238         args.put("ruleType", "EXACT");
239         assertEquals("anZelo|andZelo|anxelo",
240             encode(args, true, "Angelo"));
241         assertEquals("anZelo|andZelo|anxelo",
242             encode(args, true, "D'Angelo"));
243         args.put("languageSet", "italian,greek,spanish");
244         assertEquals("andZelo|anxelo",
245             encode(args, true, "Angelo"));
246         assertEquals(encode(args, true, "1234"), "");
247 
248         // concat is false, ruleType is EXACT
249         args = new TreeMap<>();
250         args.put("nameType", "SEPHARDIC");
251         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
252             encode(args, false, "Angelo"));
253         args.put("ruleType", "EXACT");
254         assertEquals("anZelo|andZelo|anxelo",
255             encode(args, false, "Angelo"));
256         assertEquals("danZelo|dandZelo|danxelo",
257             encode(args, false, "D'Angelo"));
258         args.put("languageSet", "italian,greek,spanish");
259         assertEquals("andZelo|anxelo",
260             encode(args, false, "Angelo"));
261         assertEquals(encode(args, false, "1234"), "");
262 
263         // concat is true, ruleType is APPROX
264         args = new TreeMap<>();
265         args.put("nameType", "SEPHARDIC");
266         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
267             encode(args, true, "Angelo"));
268         args.put("ruleType", "APPROX");
269         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
270             encode(args, true, "Angelo"));
271         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
272             encode(args, true, "D'Angelo"));
273         args.put("languageSet", "italian,greek,spanish");
274         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
275             encode(args, true, "Angelo"));
276         assertEquals(encode(args, true, "1234"), "");
277 
278         // concat is false, ruleType is APPROX
279         args = new TreeMap<>();
280         args.put("nameType", "SEPHARDIC");
281         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
282             encode(args, false, "Angelo"));
283         args.put("ruleType", "APPROX");
284         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
285             encode(args, false, "Angelo"));
286         assertEquals("danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu",
287             encode(args, false, "D'Angelo"));
288         args.put("languageSet", "italian,greek,spanish");
289         assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu",
290             encode(args, false, "Angelo"));
291         assertEquals(encode(args, false, "1234"), "");
292     }
293 }