001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language.bm;
019
020import static org.junit.Assert.assertEquals;
021
022import java.util.Arrays;
023import java.util.HashSet;
024import java.util.Map;
025import java.util.TreeMap;
026
027import org.junit.Test;
028
029/**
030 * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
031 *
032 * @since 1.7
033 */
034public class PhoneticEngineRegressionTest {
035
036    @Test
037    public void testSolrGENERIC() {
038        Map<String, String> args;
039
040        // concat is true, ruleType is EXACT
041        args = new TreeMap<String, String>();
042        args.put("nameType", "GENERIC");
043        assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
044        args.put("ruleType", "EXACT");
045        assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
046        assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
047        args.put("languageSet", "italian,greek,spanish");
048        assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anxelo");
049        assertEquals(encode(args, true, "1234"), "");
050
051        // concat is false, ruleType is EXACT
052        args = new TreeMap<String, String>();
053        assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
054        args.put("ruleType", "EXACT");
055        assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
056        assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
057        args.put("languageSet", "italian,greek,spanish");
058        assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anxelo");
059        assertEquals(encode(args, false, "1234"), "");
060
061        // concat is true, ruleType is APPROX
062        args = new TreeMap<String, String>();
063        assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
064        args.put("ruleType", "APPROX");
065        assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
066        assertEquals(encode(args, true, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
067        args.put("languageSet", "italian,greek,spanish");
068        assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
069        assertEquals(encode(args, true, "1234"), "");
070
071        // concat is false, ruleType is APPROX
072        args = new TreeMap<String, String>();
073        assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
074        args.put("ruleType", "APPROX");
075        assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
076        assertEquals(encode(args, false, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
077        args.put("languageSet", "italian,greek,spanish");
078        assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
079        assertEquals(encode(args, false, "1234"), "");
080    }
081
082    @Test
083    public void testSolrASHKENAZI() {
084        Map<String, String> args;
085
086        // concat is true, ruleType is EXACT
087        args = new TreeMap<String, String>();
088        args.put("nameType", "ASHKENAZI");
089        assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
090        args.put("ruleType", "EXACT");
091        assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
092        assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
093        args.put("languageSet", "italian,greek,spanish");
094        assertEquals(encode(args, true, "Angelo"), "angelo|anxelo");
095        assertEquals(encode(args, true, "1234"), "");
096
097        // concat is false, ruleType is EXACT
098        args = new TreeMap<String, String>();
099        args.put("nameType", "ASHKENAZI");
100        assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
101        args.put("ruleType", "EXACT");
102        assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
103        assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
104        args.put("languageSet", "italian,greek,spanish");
105        assertEquals(encode(args, false, "Angelo"), "angelo|anxelo");
106        assertEquals(encode(args, false, "1234"), "");
107
108        // concat is true, ruleType is APPROX
109        args = new TreeMap<String, String>();
110        args.put("nameType", "ASHKENAZI");
111        assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
112        args.put("ruleType", "APPROX");
113        assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
114        assertEquals(encode(args, true, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
115        args.put("languageSet", "italian,greek,spanish");
116        assertEquals(encode(args, true, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
117        assertEquals(encode(args, true, "1234"), "");
118
119        // concat is false, ruleType is APPROX
120        args = new TreeMap<String, String>();
121        args.put("nameType", "ASHKENAZI");
122        assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
123        args.put("ruleType", "APPROX");
124        assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
125        assertEquals(encode(args, false, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
126        args.put("languageSet", "italian,greek,spanish");
127        assertEquals(encode(args, false, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
128        assertEquals(encode(args, false, "1234"), "");
129    }
130
131    @Test
132    public void testSolrSEPHARDIC() {
133        Map<String, String> args;
134
135        // concat is true, ruleType is EXACT
136        args = new TreeMap<String, String>();
137        args.put("nameType", "SEPHARDIC");
138        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
139        args.put("ruleType", "EXACT");
140        assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|anxelo");
141        assertEquals(encode(args, true, "D'Angelo"), "anZelo|andZelo|anxelo");
142        args.put("languageSet", "italian,greek,spanish");
143        assertEquals(encode(args, true, "Angelo"), "andZelo|anxelo");
144        assertEquals(encode(args, true, "1234"), "");
145
146        // concat is false, ruleType is EXACT
147        args = new TreeMap<String, String>();
148        args.put("nameType", "SEPHARDIC");
149        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
150        args.put("ruleType", "EXACT");
151        assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|anxelo");
152        assertEquals(encode(args, false, "D'Angelo"), "danZelo|dandZelo|danxelo");
153        args.put("languageSet", "italian,greek,spanish");
154        assertEquals(encode(args, false, "Angelo"), "andZelo|anxelo");
155        assertEquals(encode(args, false, "1234"), "");
156
157        // concat is true, ruleType is APPROX
158        args = new TreeMap<String, String>();
159        args.put("nameType", "SEPHARDIC");
160        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
161        args.put("ruleType", "APPROX");
162        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
163        assertEquals(encode(args, true, "D'Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
164        args.put("languageSet", "italian,greek,spanish");
165        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
166        assertEquals(encode(args, true, "1234"), "");
167
168        // concat is false, ruleType is APPROX
169        args = new TreeMap<String, String>();
170        args.put("nameType", "SEPHARDIC");
171        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
172        args.put("ruleType", "APPROX");
173        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
174        assertEquals(encode(args, false, "D'Angelo"), "danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu");
175        args.put("languageSet", "italian,greek,spanish");
176        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
177        assertEquals(encode(args, false, "1234"), "");
178    }
179
180    /**
181     * This code is similar in style to code found in Solr:
182     * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
183     *
184     * Making a JUnit test out of it to protect Solr from possible future
185     * regressions in Commons-Codec.
186     */
187    private static String encode(final Map<String, String> args, final boolean concat, final String input) {
188        Languages.LanguageSet languageSet;
189        PhoneticEngine engine;
190
191        // PhoneticEngine = NameType + RuleType + concat
192        // we use common-codec's defaults: GENERIC + APPROX + true
193        final String nameTypeArg = args.get("nameType");
194        final NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
195
196        final String ruleTypeArg = args.get("ruleType");
197        final RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
198
199        engine = new PhoneticEngine(nameType, ruleType, concat);
200
201        // LanguageSet: defaults to automagic, otherwise a comma-separated list.
202        final String languageSetArg = args.get("languageSet");
203        if (languageSetArg == null || languageSetArg.equals("auto")) {
204            languageSet = null;
205        } else {
206            languageSet = Languages.LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
207        }
208
209        /*
210            org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
211
212            encoded = (languages == null)
213                ? engine.encode(termAtt.toString())
214                : engine.encode(termAtt.toString(), languages);
215
216            Hence our approach, below:
217        */
218        if (languageSet == null) {
219            return engine.encode(input);
220        } else {
221            return engine.encode(input, languageSet);
222        }
223    }
224}