001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language.bm;
019    
020    import static org.junit.Assert.assertEquals;
021    
022    import java.util.Arrays;
023    import java.util.HashSet;
024    import java.util.Map;
025    import java.util.TreeMap;
026    
027    import org.junit.Test;
028    
029    /**
030     * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
031     *
032     * @since 1.7
033     */
034    public class PhoneticEngineRegressionTest {
035    
036        @Test
037        public void testSolrGENERIC() {
038            Map<String, String> args;
039    
040            // concat is true, ruleType is EXACT
041            args = new TreeMap<String, String>();
042            args.put("nameType", "GENERIC");
043            assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
044            args.put("ruleType", "EXACT");
045            assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
046            assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
047            args.put("languageSet", "italian,greek,spanish");
048            assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anxelo");
049            assertEquals(encode(args, true, "1234"), "");
050    
051            // concat is false, ruleType is EXACT
052            args = new TreeMap<String, String>();
053            assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
054            args.put("ruleType", "EXACT");
055            assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
056            assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
057            args.put("languageSet", "italian,greek,spanish");
058            assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anxelo");
059            assertEquals(encode(args, false, "1234"), "");
060    
061            // concat is true, ruleType is APPROX
062            args = new TreeMap<String, String>();
063            assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
064            args.put("ruleType", "APPROX");
065            assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
066            assertEquals(encode(args, true, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
067            args.put("languageSet", "italian,greek,spanish");
068            assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
069            assertEquals(encode(args, true, "1234"), "");
070    
071            // concat is false, ruleType is APPROX
072            args = new TreeMap<String, String>();
073            assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
074            args.put("ruleType", "APPROX");
075            assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
076            assertEquals(encode(args, false, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
077            args.put("languageSet", "italian,greek,spanish");
078            assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
079            assertEquals(encode(args, false, "1234"), "");
080        }
081    
082        @Test
083        public void testSolrASHKENAZI() {
084            Map<String, String> args;
085    
086            // concat is true, ruleType is EXACT
087            args = new TreeMap<String, String>();
088            args.put("nameType", "ASHKENAZI");
089            assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
090            args.put("ruleType", "EXACT");
091            assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
092            assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
093            args.put("languageSet", "italian,greek,spanish");
094            assertEquals(encode(args, true, "Angelo"), "angelo|anxelo");
095            assertEquals(encode(args, true, "1234"), "");
096    
097            // concat is false, ruleType is EXACT
098            args = new TreeMap<String, String>();
099            args.put("nameType", "ASHKENAZI");
100            assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
101            args.put("ruleType", "EXACT");
102            assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
103            assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
104            args.put("languageSet", "italian,greek,spanish");
105            assertEquals(encode(args, false, "Angelo"), "angelo|anxelo");
106            assertEquals(encode(args, false, "1234"), "");
107    
108            // concat is true, ruleType is APPROX
109            args = new TreeMap<String, String>();
110            args.put("nameType", "ASHKENAZI");
111            assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
112            args.put("ruleType", "APPROX");
113            assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
114            assertEquals(encode(args, true, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
115            args.put("languageSet", "italian,greek,spanish");
116            assertEquals(encode(args, true, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
117            assertEquals(encode(args, true, "1234"), "");
118    
119            // concat is false, ruleType is APPROX
120            args = new TreeMap<String, String>();
121            args.put("nameType", "ASHKENAZI");
122            assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
123            args.put("ruleType", "APPROX");
124            assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
125            assertEquals(encode(args, false, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
126            args.put("languageSet", "italian,greek,spanish");
127            assertEquals(encode(args, false, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
128            assertEquals(encode(args, false, "1234"), "");
129        }
130    
131        @Test
132        public void testSolrSEPHARDIC() {
133            Map<String, String> args;
134    
135            // concat is true, ruleType is EXACT
136            args = new TreeMap<String, String>();
137            args.put("nameType", "SEPHARDIC");
138            assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
139            args.put("ruleType", "EXACT");
140            assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|anxelo");
141            assertEquals(encode(args, true, "D'Angelo"), "anZelo|andZelo|anxelo");
142            args.put("languageSet", "italian,greek,spanish");
143            assertEquals(encode(args, true, "Angelo"), "andZelo|anxelo");
144            assertEquals(encode(args, true, "1234"), "");
145    
146            // concat is false, ruleType is EXACT
147            args = new TreeMap<String, String>();
148            args.put("nameType", "SEPHARDIC");
149            assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
150            args.put("ruleType", "EXACT");
151            assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|anxelo");
152            assertEquals(encode(args, false, "D'Angelo"), "danZelo|dandZelo|danxelo");
153            args.put("languageSet", "italian,greek,spanish");
154            assertEquals(encode(args, false, "Angelo"), "andZelo|anxelo");
155            assertEquals(encode(args, false, "1234"), "");
156    
157            // concat is true, ruleType is APPROX
158            args = new TreeMap<String, String>();
159            args.put("nameType", "SEPHARDIC");
160            assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
161            args.put("ruleType", "APPROX");
162            assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
163            assertEquals(encode(args, true, "D'Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
164            args.put("languageSet", "italian,greek,spanish");
165            assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
166            assertEquals(encode(args, true, "1234"), "");
167    
168            // concat is false, ruleType is APPROX
169            args = new TreeMap<String, String>();
170            args.put("nameType", "SEPHARDIC");
171            assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
172            args.put("ruleType", "APPROX");
173            assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
174            assertEquals(encode(args, false, "D'Angelo"), "danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu");
175            args.put("languageSet", "italian,greek,spanish");
176            assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
177            assertEquals(encode(args, false, "1234"), "");
178        }
179    
180        /**
181         * This code is similar in style to code found in Solr:
182         * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
183         *
184         * Making a JUnit test out of it to protect Solr from possible future
185         * regressions in Commons-Codec.
186         */
187        private static String encode(final Map<String, String> args, final boolean concat, final String input) {
188            Languages.LanguageSet languageSet;
189            PhoneticEngine engine;
190    
191            // PhoneticEngine = NameType + RuleType + concat
192            // we use common-codec's defaults: GENERIC + APPROX + true
193            final String nameTypeArg = args.get("nameType");
194            final NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
195    
196            final String ruleTypeArg = args.get("ruleType");
197            final RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
198    
199            engine = new PhoneticEngine(nameType, ruleType, concat);
200    
201            // LanguageSet: defaults to automagic, otherwise a comma-separated list.
202            final String languageSetArg = args.get("languageSet");
203            if (languageSetArg == null || languageSetArg.equals("auto")) {
204                languageSet = null;
205            } else {
206                languageSet = Languages.LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
207            }
208    
209            /*
210                org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
211    
212                encoded = (languages == null)
213                    ? engine.encode(termAtt.toString())
214                    : engine.encode(termAtt.toString(), languages);
215    
216                Hence our approach, below:
217            */
218            if (languageSet == null) {
219                return engine.encode(input);
220            } else {
221                return engine.encode(input, languageSet);
222            }
223        }
224    }