001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.HashMap; 023import java.util.LinkedHashSet; 024import java.util.List; 025import java.util.Map; 026import java.util.Scanner; 027import java.util.Set; 028import java.util.regex.Pattern; 029 030import org.apache.commons.codec.CharEncoding; 031import org.apache.commons.codec.EncoderException; 032import org.apache.commons.codec.Resources; 033import org.apache.commons.codec.StringEncoder; 034 035/** 036 * Encodes a string into a Daitch-Mokotoff Soundex value. 037 * <p> 038 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater 039 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. 040 * </p> 041 * <p> 042 * The main differences compared to the other Soundex variants are: 043 * </p> 044 * <ul> 045 * <li>coded names are 6 digits long 046 * <li>the initial character of the name is coded 047 * <li>rules to encoded multi-character n-grams 048 * <li>multiple possible encodings for the same name (branching) 049 * </ul> 050 * <p> 051 * This implementation supports branching, depending on the used method: 052 * <ul> 053 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned 054 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' 055 * </ul> 056 * <p> 057 * Note: This implementation has additional branching rules compared to the original description of the algorithm. The 058 * rules can be customized by overriding the default rules contained in the resource file 059 * {@code org/apache/commons/codec/language/dmrules.txt}. 060 * </p> 061 * <p> 062 * This class is thread-safe. 063 * </p> 064 * 065 * @see Soundex 066 * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> 067 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> 068 * @since 1.10 069 */ 070public class DaitchMokotoffSoundex implements StringEncoder { 071 072 /** 073 * Inner class representing a branch during DM Soundex encoding. 074 */ 075 private static final class Branch { 076 private final StringBuilder builder; 077 private String cachedString; 078 private String lastReplacement; 079 080 private Branch() { 081 builder = new StringBuilder(); 082 } 083 084 /** 085 * Creates a new branch, identical to this branch. 086 * 087 * @return a new, identical branch 088 */ 089 private Branch createBranch() { 090 final Branch branch = new Branch(); 091 branch.builder.append(toString()); 092 branch.lastReplacement = this.lastReplacement; 093 return branch; 094 } 095 096 @Override 097 public boolean equals(final Object other) { 098 if (this == other) { 099 return true; 100 } 101 if (!(other instanceof Branch)) { 102 return false; 103 } 104 return toString().equals(((Branch) other).toString()); 105 } 106 107 /** 108 * Finish this branch by appending '0's until the maximum code length has been reached. 109 */ 110 private void finish() { 111 while (builder.length() < MAX_LENGTH) { 112 builder.append('0'); 113 cachedString = null; 114 } 115 } 116 117 @Override 118 public int hashCode() { 119 return toString().hashCode(); 120 } 121 122 /** 123 * Process the next replacement to be added to this branch. 124 * 125 * @param replacement 126 * the next replacement to append. 127 * @param forceAppend 128 * indicates if the default processing shall be overridden. 129 */ 130 private void processNextReplacement(final String replacement, final boolean forceAppend) { 131 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; 132 if (append && builder.length() < MAX_LENGTH) { 133 builder.append(replacement); 134 // remove all characters after the maximum length 135 if (builder.length() > MAX_LENGTH) { 136 builder.delete(MAX_LENGTH, builder.length()); 137 } 138 cachedString = null; 139 } 140 lastReplacement = replacement; 141 } 142 143 @Override 144 public String toString() { 145 if (cachedString == null) { 146 cachedString = builder.toString(); 147 } 148 return cachedString; 149 } 150 } 151 152 /** 153 * Inner class for storing rules. 154 */ 155 private static final class Rule { 156 private static final Pattern PIPE = Pattern.compile("\\|"); 157 private final String pattern; 158 private final String[] replacementAtStart; 159 private final String[] replacementBeforeVowel; 160 private final String[] replacementDefault; 161 162 private Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, 163 final String replacementDefault) { 164 this.pattern = pattern; 165 this.replacementAtStart = PIPE.split(replacementAtStart); 166 this.replacementBeforeVowel = PIPE.split(replacementBeforeVowel); 167 this.replacementDefault = PIPE.split(replacementDefault); 168 } 169 170 private int getPatternLength() { 171 return pattern.length(); 172 } 173 174 private String[] getReplacements(final String context, final boolean atStart) { 175 if (atStart) { 176 return replacementAtStart; 177 } 178 179 final int nextIndex = getPatternLength(); 180 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex)); 181 if (nextCharIsVowel) { 182 return replacementBeforeVowel; 183 } 184 185 return replacementDefault; 186 } 187 188 private boolean isVowel(final char ch) { 189 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; 190 } 191 192 private boolean matches(final String context) { 193 return context.startsWith(pattern); 194 } 195 196 @Override 197 public String toString() { 198 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), 199 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); 200 } 201 } 202 203 /** 204 * The NUL character. 205 */ 206 private static final char NUL = '\0'; 207 208 private static final String COMMENT = "//"; 209 210 private static final String DOUBLE_QUOTE = "\""; 211 212 private static final String MULTILINE_COMMENT_END = "*/"; 213 214 private static final String MULTILINE_COMMENT_START = "/*"; 215 216 /** The resource file containing the replacement and folding rules */ 217 private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt"; 218 219 /** The code length of a DM Soundex value. */ 220 private static final int MAX_LENGTH = 6; 221 222 /** Transformation rules indexed by the first character of their pattern. */ 223 private static final Map<Character, List<Rule>> RULES = new HashMap<>(); 224 225 /** Folding rules. */ 226 private static final Map<Character, Character> FOLDINGS = new HashMap<>(); 227 228 private static final Pattern EQUAL = Pattern.compile("="); 229 230 private static final Pattern SPACES = Pattern.compile("\\s+"); 231 232 static { 233 try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { 234 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); 235 } 236 // sort RULES by pattern length in descending order 237 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength())); 238 } 239 240 private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping, 241 final Map<Character, Character> asciiFoldings) { 242 int currentLine = 0; 243 boolean inMultilineComment = false; 244 while (scanner.hasNextLine()) { 245 currentLine++; 246 final String rawLine = scanner.nextLine(); 247 String line = rawLine; 248 if (inMultilineComment) { 249 if (line.endsWith(MULTILINE_COMMENT_END)) { 250 inMultilineComment = false; 251 } 252 continue; 253 } 254 if (line.startsWith(MULTILINE_COMMENT_START)) { 255 inMultilineComment = true; 256 } else { 257 // discard comments 258 final int cmtI = line.indexOf(COMMENT); 259 if (cmtI >= 0) { 260 line = line.substring(0, cmtI); 261 } 262 // trim leading-trailing whitespace 263 line = line.trim(); 264 if (line.isEmpty()) { 265 continue; // empty lines can be safely skipped 266 } 267 if (line.contains("=")) { 268 // folding 269 final String[] parts = EQUAL.split(line); 270 if (parts.length != 2) { 271 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location); 272 } 273 final String leftCharacter = parts[0]; 274 final String rightCharacter = parts[1]; 275 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { 276 throw new IllegalArgumentException( 277 "Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location); 278 } 279 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); 280 } else { 281 // rule 282 final String[] parts = SPACES.split(line); 283 if (parts.length != 4) { 284 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location); 285 } 286 try { 287 final String pattern = stripQuotes(parts[0]); 288 final String replacement1 = stripQuotes(parts[1]); 289 final String replacement2 = stripQuotes(parts[2]); 290 final String replacement3 = stripQuotes(parts[3]); 291 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); 292 final char patternKey = r.pattern.charAt(0); 293 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>()); 294 rules.add(r); 295 } catch (final IllegalArgumentException e) { 296 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e); 297 } 298 } 299 } 300 } 301 } 302 303 private static String stripQuotes(String str) { 304 if (str.startsWith(DOUBLE_QUOTE)) { 305 str = str.substring(1); 306 } 307 if (str.endsWith(DOUBLE_QUOTE)) { 308 str = str.substring(0, str.length() - 1); 309 } 310 return str; 311 } 312 313 /** Whether to use ASCII folding prior to encoding. */ 314 private final boolean folding; 315 316 /** 317 * Creates a new instance with ASCII-folding enabled. 318 */ 319 public DaitchMokotoffSoundex() { 320 this(true); 321 } 322 323 /** 324 * Creates a new instance. 325 * <p> 326 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example 327 * รจ -> e. 328 * </p> 329 * 330 * @param folding 331 * if ASCII-folding shall be performed before encoding 332 */ 333 public DaitchMokotoffSoundex(final boolean folding) { 334 this.folding = folding; 335 } 336 337 /** 338 * Performs a cleanup of the input string before the actual Soundex transformation. 339 * <p> 340 * Removes all whitespace characters and performs ASCII folding if enabled. 341 * </p> 342 * 343 * @param input 344 * the input string to clean up. 345 * @return a cleaned up string. 346 */ 347 private String cleanup(final String input) { 348 final StringBuilder sb = new StringBuilder(); 349 for (char ch : input.toCharArray()) { 350 if (Character.isWhitespace(ch) || !Character.isLetter(ch)) { 351 continue; 352 } 353 ch = Character.toLowerCase(ch); 354 final Character character = FOLDINGS.get(ch); 355 if (folding && character != null) { 356 ch = character; 357 } 358 sb.append(ch); 359 } 360 return sb.toString(); 361 } 362 363 /** 364 * Encodes an Object using the Daitch-Mokotoff Soundex algorithm without branching. 365 * <p> 366 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 367 * EncoderException if the supplied object is not of type {@link String}. 368 * </p> 369 * 370 * @see #soundex(String) 371 * @param obj 372 * Object to encode. 373 * @return An object (of type {@link String}) containing the DM Soundex code, which corresponds to the String 374 * supplied. 375 * @throws EncoderException 376 * if the parameter supplied is not of type {@link String}. 377 * @throws IllegalArgumentException 378 * if a character is not mapped. 379 */ 380 @Override 381 public Object encode(final Object obj) throws EncoderException { 382 if (!(obj instanceof String)) { 383 throw new EncoderException( 384 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); 385 } 386 return encode((String) obj); 387 } 388 389 /** 390 * Encodes a String using the Daitch-Mokotoff Soundex algorithm without branching. 391 * 392 * @see #soundex(String) 393 * @param source 394 * A String object to encode. 395 * @return A DM Soundex code corresponding to the String supplied. 396 * @throws IllegalArgumentException 397 * if a character is not mapped. 398 */ 399 @Override 400 public String encode(final String source) { 401 if (source == null) { 402 return null; 403 } 404 return soundex(source, false)[0]; 405 } 406 407 /** 408 * Encodes a String using the Daitch-Mokotoff Soundex algorithm with branching. 409 * <p> 410 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, 411 * separated by '|'. 412 * </p> 413 * <p> 414 * Example: the name "AUERBACH" is encoded as both 415 * </p> 416 * <ul> 417 * <li>097400</li> 418 * <li>097500</li> 419 * </ul> 420 * <p> 421 * Thus the result will be "097400|097500". 422 * </p> 423 * 424 * @param source 425 * A String object to encode. 426 * @return A string containing a set of DM Soundex codes corresponding to the String supplied. 427 * @throws IllegalArgumentException 428 * if a character is not mapped. 429 */ 430 public String soundex(final String source) { 431 return String.join("|", soundex(source, true)); 432 } 433 434 /** 435 * Perform the actual DM Soundex algorithm on the input string. 436 * 437 * @param source 438 * A String object to encode. 439 * @param branching 440 * If branching shall be performed. 441 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the 442 * selected branching mode. 443 */ 444 private String[] soundex(final String source, final boolean branching) { 445 if (source == null) { 446 return null; 447 } 448 final String input = cleanup(source); 449 final Set<Branch> currentBranches = new LinkedHashSet<>(); 450 currentBranches.add(new Branch()); 451 char lastChar = NUL; 452 for (int index = 0; index < input.length(); index++) { 453 final char ch = input.charAt(index); 454 final String inputContext = input.substring(index); 455 final List<Rule> rules = RULES.get(ch); 456 if (rules == null) { 457 continue; 458 } 459 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access 460 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList(); 461 for (final Rule rule : rules) { 462 if (rule.matches(inputContext)) { 463 if (branching) { 464 nextBranches.clear(); 465 } 466 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); 467 final boolean branchingRequired = replacements.length > 1 && branching; 468 for (final Branch branch : currentBranches) { 469 for (final String nextReplacement : replacements) { 470 // if we have multiple replacements, always create a new branch 471 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; 472 // special rule: occurrences of mn or nm are treated differently 473 final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm'; 474 nextBranch.processNextReplacement(nextReplacement, force); 475 if (!branching) { 476 break; 477 } 478 nextBranches.add(nextBranch); 479 } 480 } 481 if (branching) { 482 currentBranches.clear(); 483 currentBranches.addAll(nextBranches); 484 } 485 index += rule.getPatternLength() - 1; 486 break; 487 } 488 } 489 lastChar = ch; 490 } 491 final String[] result = new String[currentBranches.size()]; 492 int index = 0; 493 for (final Branch branch : currentBranches) { 494 branch.finish(); 495 result[index++] = branch.toString(); 496 } 497 return result; 498 } 499}