/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.TreeMap;

import org.junit.Test;

/**
 * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
 * <p>
 * Each test method walks through four sections (concat true/false crossed with rule type
 * EXACT/APPROX). Within a section the argument map is filled in step by step, so the first
 * assertion of every section runs before {@code ruleType} is set and therefore exercises the
 * {@link RuleType#APPROX} fallback applied by {@link #encode(Map, boolean, String)}.
 * </p>
 *
 * @since 1.7
 */
public class PhoneticEngineRegressionTest {

    /**
     * Regression values for {@link NameType#GENERIC}. Only the first section sets
     * {@code nameType} explicitly; the other sections rely on the GENERIC fallback in
     * {@link #encode(Map, boolean, String)}.
     */
    @Test
    public void testSolrGENERIC() {
        Map<String, String> args;

        // concat is true; ruleType starts unset (APPROX fallback), then EXACT
        args = new TreeMap<String, String>();
        args.put("nameType", "GENERIC");
        assertEquals("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", encode(args, true, "Angelo"));
        args.put("ruleType", "EXACT");
        assertEquals("anZelo|andZelo|angelo|anhelo|anjelo|anxelo", encode(args, true, "Angelo"));
        assertEquals("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)", encode(args, true, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("andZelo|angelo|anxelo", encode(args, true, "Angelo"));
        assertEquals("", encode(args, true, "1234"));

        // concat is false; ruleType starts unset (APPROX fallback), then EXACT
        args = new TreeMap<String, String>();
        assertEquals("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", encode(args, false, "Angelo"));
        args.put("ruleType", "EXACT");
        assertEquals("anZelo|andZelo|angelo|anhelo|anjelo|anxelo", encode(args, false, "Angelo"));
        assertEquals("(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)", encode(args, false, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("andZelo|angelo|anxelo", encode(args, false, "Angelo"));
        assertEquals("", encode(args, false, "1234"));

        // concat is true; ruleType starts unset (APPROX fallback), then explicit APPROX
        args = new TreeMap<String, String>();
        assertEquals("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", encode(args, true, "Angelo"));
        args.put("ruleType", "APPROX");
        assertEquals("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", encode(args, true, "Angelo"));
        assertEquals("(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)", encode(args, true, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("angilo|anxilo|anzilo|ongilo|onxilo|onzilo", encode(args, true, "Angelo"));
        assertEquals("", encode(args, true, "1234"));

        // concat is false; ruleType starts unset (APPROX fallback), then explicit APPROX
        args = new TreeMap<String, String>();
        assertEquals("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", encode(args, false, "Angelo"));
        args.put("ruleType", "APPROX");
        assertEquals("agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo", encode(args, false, "Angelo"));
        assertEquals("(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)", encode(args, false, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("angilo|anxilo|anzilo|ongilo|onxilo|onzilo", encode(args, false, "Angelo"));
        assertEquals("", encode(args, false, "1234"));
    }

    /**
     * Regression values for {@link NameType#ASHKENAZI}; every section sets {@code nameType}
     * explicitly.
     */
    @Test
    public void testSolrASHKENAZI() {
        Map<String, String> args;

        // concat is true; ruleType starts unset (APPROX fallback), then EXACT
        args = new TreeMap<String, String>();
        args.put("nameType", "ASHKENAZI");
        assertEquals("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", encode(args, true, "Angelo"));
        args.put("ruleType", "EXACT");
        assertEquals("andZelo|angelo|anhelo|anxelo", encode(args, true, "Angelo"));
        assertEquals("dandZelo|dangelo|danhelo|danxelo", encode(args, true, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("angelo|anxelo", encode(args, true, "Angelo"));
        assertEquals("", encode(args, true, "1234"));

        // concat is false; ruleType starts unset (APPROX fallback), then EXACT
        args = new TreeMap<String, String>();
        args.put("nameType", "ASHKENAZI");
        assertEquals("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", encode(args, false, "Angelo"));
        args.put("ruleType", "EXACT");
        assertEquals("andZelo|angelo|anhelo|anxelo", encode(args, false, "Angelo"));
        assertEquals("dandZelo|dangelo|danhelo|danxelo", encode(args, false, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("angelo|anxelo", encode(args, false, "Angelo"));
        assertEquals("", encode(args, false, "1234"));

        // concat is true; ruleType starts unset (APPROX fallback), then explicit APPROX
        args = new TreeMap<String, String>();
        args.put("nameType", "ASHKENAZI");
        assertEquals("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", encode(args, true, "Angelo"));
        args.put("ruleType", "APPROX");
        assertEquals("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", encode(args, true, "Angelo"));
        assertEquals("dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO", encode(args, true, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("AnSelO|AngElO|AngzelO|AnkselO", encode(args, true, "Angelo"));
        assertEquals("", encode(args, true, "1234"));

        // concat is false; ruleType starts unset (APPROX fallback), then explicit APPROX
        args = new TreeMap<String, String>();
        args.put("nameType", "ASHKENAZI");
        assertEquals("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", encode(args, false, "Angelo"));
        args.put("ruleType", "APPROX");
        assertEquals("AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO", encode(args, false, "Angelo"));
        assertEquals("dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO", encode(args, false, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("AnSelO|AngElO|AngzelO|AnkselO", encode(args, false, "Angelo"));
        assertEquals("", encode(args, false, "1234"));
    }

    /**
     * Regression values for {@link NameType#SEPHARDIC}; every section sets {@code nameType}
     * explicitly. Note that the expected encodings of "D'Angelo" differ between the
     * {@code concat == true} and {@code concat == false} sections.
     */
    @Test
    public void testSolrSEPHARDIC() {
        Map<String, String> args;

        // concat is true; ruleType starts unset (APPROX fallback), then EXACT
        args = new TreeMap<String, String>();
        args.put("nameType", "SEPHARDIC");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, true, "Angelo"));
        args.put("ruleType", "EXACT");
        assertEquals("anZelo|andZelo|anxelo", encode(args, true, "Angelo"));
        assertEquals("anZelo|andZelo|anxelo", encode(args, true, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("andZelo|anxelo", encode(args, true, "Angelo"));
        assertEquals("", encode(args, true, "1234"));

        // concat is false; ruleType starts unset (APPROX fallback), then EXACT
        args = new TreeMap<String, String>();
        args.put("nameType", "SEPHARDIC");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, false, "Angelo"));
        args.put("ruleType", "EXACT");
        assertEquals("anZelo|andZelo|anxelo", encode(args, false, "Angelo"));
        assertEquals("danZelo|dandZelo|danxelo", encode(args, false, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("andZelo|anxelo", encode(args, false, "Angelo"));
        assertEquals("", encode(args, false, "1234"));

        // concat is true; ruleType starts unset (APPROX fallback), then explicit APPROX
        args = new TreeMap<String, String>();
        args.put("nameType", "SEPHARDIC");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, true, "Angelo"));
        args.put("ruleType", "APPROX");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, true, "Angelo"));
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, true, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, true, "Angelo"));
        assertEquals("", encode(args, true, "1234"));

        // concat is false; ruleType starts unset (APPROX fallback), then explicit APPROX
        args = new TreeMap<String, String>();
        args.put("nameType", "SEPHARDIC");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, false, "Angelo"));
        args.put("ruleType", "APPROX");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, false, "Angelo"));
        assertEquals("danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu", encode(args, false, "D'Angelo"));
        args.put("languageSet", "italian,greek,spanish");
        assertEquals("anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", encode(args, false, "Angelo"));
        assertEquals("", encode(args, false, "1234"));
    }

    /**
     * Encodes {@code input} with a {@link PhoneticEngine} configured from a string-keyed
     * argument map.
     * <p>
     * This code is similar in style to code found in Solr:
     * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
     * </p>
     * <p>
     * Making a JUnit test out of it to protect Solr from possible future
     * regressions in Commons-Codec.
     * </p>
     *
     * @param args
     *            configuration map; recognised keys are {@code "nameType"} (falls back to
     *            {@link NameType#GENERIC} when absent), {@code "ruleType"} (falls back to
     *            {@link RuleType#APPROX} when absent) and {@code "languageSet"} (a comma-separated
     *            list; absent or {@code "auto"} means no explicit language set is passed)
     * @param concat
     *            passed straight through to the {@link PhoneticEngine} constructor
     * @param input
     *            the text to encode
     * @return the value returned by the engine's {@code encode} method
     * @throws IllegalArgumentException
     *             if {@code "nameType"} or {@code "ruleType"} is present but does not name a
     *             constant of the corresponding enum (thrown by {@code valueOf})
     */
    private static String encode(final Map<String, String> args, final boolean concat, final String input) {
        // PhoneticEngine = NameType + RuleType + concat.
        // A missing nameType/ruleType falls back to GENERIC/APPROX; concat is always caller-supplied.
        final String nameTypeArg = args.get("nameType");
        final NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);

        final String ruleTypeArg = args.get("ruleType");
        final RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);

        final PhoneticEngine engine = new PhoneticEngine(nameType, ruleType, concat);

        // LanguageSet: defaults to automagic, otherwise a comma-separated list.
        final String languageSetArg = args.get("languageSet");
        final Languages.LanguageSet languageSet;
        if (languageSetArg == null || languageSetArg.equals("auto")) {
            languageSet = null;
        } else {
            languageSet = Languages.LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
        }

        /*
            org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:

            encoded = (languages == null)
                ? engine.encode(termAtt.toString())
                : engine.encode(termAtt.toString(), languages);

            Hence our approach, below:
        */
        if (languageSet == null) {
            return engine.encode(input);
        }
        return engine.encode(input, languageSet);
    }
}