001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.codec.language.bm;
019
020 import static org.junit.Assert.assertEquals;
021
022 import java.util.Arrays;
023 import java.util.HashSet;
024 import java.util.Map;
025 import java.util.TreeMap;
026
027 import org.junit.Test;
028
029 /**
030 * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
031 *
032 * @since 1.7
033 */
034 public class PhoneticEngineRegressionTest {
035
036 @Test
037 public void testSolrGENERIC() {
038 Map<String, String> args;
039
040 // concat is true, ruleType is EXACT
041 args = new TreeMap<String, String>();
042 args.put("nameType", "GENERIC");
043 assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
044 args.put("ruleType", "EXACT");
045 assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
046 assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
047 args.put("languageSet", "italian,greek,spanish");
048 assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anxelo");
049 assertEquals(encode(args, true, "1234"), "");
050
051 // concat is false, ruleType is EXACT
052 args = new TreeMap<String, String>();
053 assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
054 args.put("ruleType", "EXACT");
055 assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
056 assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
057 args.put("languageSet", "italian,greek,spanish");
058 assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anxelo");
059 assertEquals(encode(args, false, "1234"), "");
060
061 // concat is true, ruleType is APPROX
062 args = new TreeMap<String, String>();
063 assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
064 args.put("ruleType", "APPROX");
065 assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
066 assertEquals(encode(args, true, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
067 args.put("languageSet", "italian,greek,spanish");
068 assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
069 assertEquals(encode(args, true, "1234"), "");
070
071 // concat is false, ruleType is APPROX
072 args = new TreeMap<String, String>();
073 assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
074 args.put("ruleType", "APPROX");
075 assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
076 assertEquals(encode(args, false, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
077 args.put("languageSet", "italian,greek,spanish");
078 assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
079 assertEquals(encode(args, false, "1234"), "");
080 }
081
082 @Test
083 public void testSolrASHKENAZI() {
084 Map<String, String> args;
085
086 // concat is true, ruleType is EXACT
087 args = new TreeMap<String, String>();
088 args.put("nameType", "ASHKENAZI");
089 assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
090 args.put("ruleType", "EXACT");
091 assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
092 assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
093 args.put("languageSet", "italian,greek,spanish");
094 assertEquals(encode(args, true, "Angelo"), "angelo|anxelo");
095 assertEquals(encode(args, true, "1234"), "");
096
097 // concat is false, ruleType is EXACT
098 args = new TreeMap<String, String>();
099 args.put("nameType", "ASHKENAZI");
100 assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
101 args.put("ruleType", "EXACT");
102 assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
103 assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
104 args.put("languageSet", "italian,greek,spanish");
105 assertEquals(encode(args, false, "Angelo"), "angelo|anxelo");
106 assertEquals(encode(args, false, "1234"), "");
107
108 // concat is true, ruleType is APPROX
109 args = new TreeMap<String, String>();
110 args.put("nameType", "ASHKENAZI");
111 assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
112 args.put("ruleType", "APPROX");
113 assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
114 assertEquals(encode(args, true, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
115 args.put("languageSet", "italian,greek,spanish");
116 assertEquals(encode(args, true, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
117 assertEquals(encode(args, true, "1234"), "");
118
119 // concat is false, ruleType is APPROX
120 args = new TreeMap<String, String>();
121 args.put("nameType", "ASHKENAZI");
122 assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
123 args.put("ruleType", "APPROX");
124 assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
125 assertEquals(encode(args, false, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
126 args.put("languageSet", "italian,greek,spanish");
127 assertEquals(encode(args, false, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
128 assertEquals(encode(args, false, "1234"), "");
129 }
130
131 @Test
132 public void testSolrSEPHARDIC() {
133 Map<String, String> args;
134
135 // concat is true, ruleType is EXACT
136 args = new TreeMap<String, String>();
137 args.put("nameType", "SEPHARDIC");
138 assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
139 args.put("ruleType", "EXACT");
140 assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|anxelo");
141 assertEquals(encode(args, true, "D'Angelo"), "anZelo|andZelo|anxelo");
142 args.put("languageSet", "italian,greek,spanish");
143 assertEquals(encode(args, true, "Angelo"), "andZelo|anxelo");
144 assertEquals(encode(args, true, "1234"), "");
145
146 // concat is false, ruleType is EXACT
147 args = new TreeMap<String, String>();
148 args.put("nameType", "SEPHARDIC");
149 assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
150 args.put("ruleType", "EXACT");
151 assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|anxelo");
152 assertEquals(encode(args, false, "D'Angelo"), "danZelo|dandZelo|danxelo");
153 args.put("languageSet", "italian,greek,spanish");
154 assertEquals(encode(args, false, "Angelo"), "andZelo|anxelo");
155 assertEquals(encode(args, false, "1234"), "");
156
157 // concat is true, ruleType is APPROX
158 args = new TreeMap<String, String>();
159 args.put("nameType", "SEPHARDIC");
160 assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
161 args.put("ruleType", "APPROX");
162 assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
163 assertEquals(encode(args, true, "D'Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
164 args.put("languageSet", "italian,greek,spanish");
165 assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
166 assertEquals(encode(args, true, "1234"), "");
167
168 // concat is false, ruleType is APPROX
169 args = new TreeMap<String, String>();
170 args.put("nameType", "SEPHARDIC");
171 assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
172 args.put("ruleType", "APPROX");
173 assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
174 assertEquals(encode(args, false, "D'Angelo"), "danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu");
175 args.put("languageSet", "italian,greek,spanish");
176 assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
177 assertEquals(encode(args, false, "1234"), "");
178 }
179
180 /**
181 * This code is similar in style to code found in Solr:
182 * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
183 *
184 * Making a JUnit test out of it to protect Solr from possible future
185 * regressions in Commons-Codec.
186 */
187 private static String encode(final Map<String, String> args, final boolean concat, final String input) {
188 Languages.LanguageSet languageSet;
189 PhoneticEngine engine;
190
191 // PhoneticEngine = NameType + RuleType + concat
192 // we use common-codec's defaults: GENERIC + APPROX + true
193 final String nameTypeArg = args.get("nameType");
194 final NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
195
196 final String ruleTypeArg = args.get("ruleType");
197 final RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
198
199 engine = new PhoneticEngine(nameType, ruleType, concat);
200
201 // LanguageSet: defaults to automagic, otherwise a comma-separated list.
202 final String languageSetArg = args.get("languageSet");
203 if (languageSetArg == null || languageSetArg.equals("auto")) {
204 languageSet = null;
205 } else {
206 languageSet = Languages.LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
207 }
208
209 /*
210 org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
211
212 encoded = (languages == null)
213 ? engine.encode(termAtt.toString())
214 : engine.encode(termAtt.toString(), languages);
215
216 Hence our approach, below:
217 */
218 if (languageSet == null) {
219 return engine.encode(input);
220 } else {
221 return engine.encode(input, languageSet);
222 }
223 }
224 }