001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.HashMap; 023import java.util.LinkedHashSet; 024import java.util.List; 025import java.util.Map; 026import java.util.Scanner; 027import java.util.Set; 028 029import org.apache.commons.codec.CharEncoding; 030import org.apache.commons.codec.EncoderException; 031import org.apache.commons.codec.Resources; 032import org.apache.commons.codec.StringEncoder; 033 034/** 035 * Encodes a string into a Daitch-Mokotoff Soundex value. 036 * <p> 037 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater 038 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. 039 * </p> 040 * <p> 041 * The main differences compared to the other soundex variants are: 042 * </p> 043 * <ul> 044 * <li>coded names are 6 digits long 045 * <li>the initial character of the name is coded 046 * <li>rules to encoded multi-character n-grams 047 * <li>multiple possible encodings for the same name (branching) 048 * </ul> 049 * <p> 050 * This implementation supports branching, depending on the used method: 051 * <ul> 052 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned 053 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' 054 * </ul> 055 * <p> 056 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The 057 * rules can be customized by overriding the default rules contained in the resource file 058 * {@code org/apache/commons/codec/language/dmrules.txt}. 059 * </p> 060 * <p> 061 * This class is thread-safe. 062 * </p> 063 * 064 * @see Soundex 065 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> 066 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> 067 * 068 * @since 1.10 069 */ 070public class DaitchMokotoffSoundex implements StringEncoder { 071 072 private static final String COMMENT = "//"; 073 074 private static final String DOUBLE_QUOTE = "\""; 075 076 private static final String MULTILINE_COMMENT_END = "*/"; 077 078 private static final String MULTILINE_COMMENT_START = "/*"; 079 080 /** The resource file containing the replacement and folding rules */ 081 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; 082 083 /** The code length of a DM soundex value. */ 084 private static final int MAX_LENGTH = 6; 085 086 /** Transformation rules indexed by the first character of their pattern. */ 087 private static final Map<Character, List<Rule>> RULES = new HashMap<>(); 088 089 /** Folding rules. */ 090 private static final Map<Character, Character> FOLDINGS = new HashMap<>(); 091 092 /** Whether to use ASCII folding prior to encoding. */ 093 private final boolean folding; 094 095 /** 096 * Inner class representing a branch during DM soundex encoding. 097 */ 098 private static final class Branch { 099 private final StringBuilder builder; 100 private String cachedString; 101 private String lastReplacement; 102 103 private Branch() { 104 builder = new StringBuilder(); 105 lastReplacement = null; 106 cachedString = null; 107 } 108 109 /** 110 * Creates a new branch, identical to this branch. 111 * 112 * @return a new, identical branch 113 */ 114 public Branch createBranch() { 115 final Branch branch = new Branch(); 116 branch.builder.append(toString()); 117 branch.lastReplacement = this.lastReplacement; 118 return branch; 119 } 120 121 @Override 122 public boolean equals(final Object other) { 123 if (this == other) { 124 return true; 125 } 126 if (!(other instanceof Branch)) { 127 return false; 128 } 129 130 return toString().equals(((Branch) other).toString()); 131 } 132 133 /** 134 * Finish this branch by appending '0's until the maximum code length has been reached. 135 */ 136 public void finish() { 137 while (builder.length() < MAX_LENGTH) { 138 builder.append('0'); 139 cachedString = null; 140 } 141 } 142 143 @Override 144 public int hashCode() { 145 return toString().hashCode(); 146 } 147 148 /** 149 * Process the next replacement to be added to this branch. 150 * 151 * @param replacement 152 * the next replacement to append 153 * @param forceAppend 154 * indicates if the default processing shall be overridden 155 */ 156 public void processNextReplacement(final String replacement, final boolean forceAppend) { 157 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; 158 159 if (append && builder.length() < MAX_LENGTH) { 160 builder.append(replacement); 161 // remove all characters after the maximum length 162 if (builder.length() > MAX_LENGTH) { 163 builder.delete(MAX_LENGTH, builder.length()); 164 } 165 cachedString = null; 166 } 167 168 lastReplacement = replacement; 169 } 170 171 @Override 172 public String toString() { 173 if (cachedString == null) { 174 cachedString = builder.toString(); 175 } 176 return cachedString; 177 } 178 } 179 180 /** 181 * Inner class for storing rules. 182 */ 183 private static final class Rule { 184 private final String pattern; 185 private final String[] replacementAtStart; 186 private final String[] replacementBeforeVowel; 187 private final String[] replacementDefault; 188 189 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, 190 final String replacementDefault) { 191 this.pattern = pattern; 192 this.replacementAtStart = replacementAtStart.split("\\|"); 193 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); 194 this.replacementDefault = replacementDefault.split("\\|"); 195 } 196 197 public int getPatternLength() { 198 return pattern.length(); 199 } 200 201 public String[] getReplacements(final String context, final boolean atStart) { 202 if (atStart) { 203 return replacementAtStart; 204 } 205 206 final int nextIndex = getPatternLength(); 207 final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex)); 208 if (nextCharIsVowel) { 209 return replacementBeforeVowel; 210 } 211 212 return replacementDefault; 213 } 214 215 private boolean isVowel(final char ch) { 216 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; 217 } 218 219 public boolean matches(final String context) { 220 return context.startsWith(pattern); 221 } 222 223 @Override 224 public String toString() { 225 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), 226 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); 227 } 228 } 229 230 static { 231 try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { 232 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); 233 } 234 235 // sort RULES by pattern length in descending order 236 RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength())); 237 } 238 239 private static void parseRules(final Scanner scanner, final String location, 240 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { 241 int currentLine = 0; 242 boolean inMultilineComment = false; 243 244 while (scanner.hasNextLine()) { 245 currentLine++; 246 final String rawLine = scanner.nextLine(); 247 String line = rawLine; 248 249 if (inMultilineComment) { 250 if (line.endsWith(MULTILINE_COMMENT_END)) { 251 inMultilineComment = false; 252 } 253 continue; 254 } 255 256 if (line.startsWith(MULTILINE_COMMENT_START)) { 257 inMultilineComment = true; 258 } else { 259 // discard comments 260 final int cmtI = line.indexOf(COMMENT); 261 if (cmtI >= 0) { 262 line = line.substring(0, cmtI); 263 } 264 265 // trim leading-trailing whitespace 266 line = line.trim(); 267 268 if (line.isEmpty()) { 269 continue; // empty lines can be safely skipped 270 } 271 272 if (line.contains("=")) { 273 // folding 274 final String[] parts = line.split("="); 275 if (parts.length != 2) { 276 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + 277 " parts: " + rawLine + " in " + location); 278 } 279 final String leftCharacter = parts[0]; 280 final String rightCharacter = parts[1]; 281 282 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { 283 throw new IllegalArgumentException("Malformed folding statement - " + 284 "patterns are not single characters: " + rawLine + " in " + location); 285 } 286 287 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); 288 } else { 289 // rule 290 final String[] parts = line.split("\\s+"); 291 if (parts.length != 4) { 292 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 293 " parts: " + rawLine + " in " + location); 294 } 295 try { 296 final String pattern = stripQuotes(parts[0]); 297 final String replacement1 = stripQuotes(parts[1]); 298 final String replacement2 = stripQuotes(parts[2]); 299 final String replacement3 = stripQuotes(parts[3]); 300 301 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); 302 final char patternKey = r.pattern.charAt(0); 303 final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>()); 304 rules.add(r); 305 } catch (final IllegalArgumentException e) { 306 throw new IllegalStateException( 307 "Problem parsing line '" + currentLine + "' in " + location, e); 308 } 309 } 310 } 311 } 312 } 313 314 private static String stripQuotes(String str) { 315 if (str.startsWith(DOUBLE_QUOTE)) { 316 str = str.substring(1); 317 } 318 319 if (str.endsWith(DOUBLE_QUOTE)) { 320 str = str.substring(0, str.length() - 1); 321 } 322 323 return str; 324 } 325 326 /** 327 * Creates a new instance with ASCII-folding enabled. 328 */ 329 public DaitchMokotoffSoundex() { 330 this(true); 331 } 332 333 /** 334 * Creates a new instance. 335 * <p> 336 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. 337 * รจ -> e. 338 * </p> 339 * 340 * @param folding 341 * if ASCII-folding shall be performed before encoding 342 */ 343 public DaitchMokotoffSoundex(final boolean folding) { 344 this.folding = folding; 345 } 346 347 /** 348 * Performs a cleanup of the input string before the actual soundex transformation. 349 * <p> 350 * Removes all whitespace characters and performs ASCII folding if enabled. 351 * </p> 352 * 353 * @param input 354 * the input string to clean up 355 * @return a cleaned up string 356 */ 357 private String cleanup(final String input) { 358 final StringBuilder sb = new StringBuilder(); 359 for (char ch : input.toCharArray()) { 360 if (Character.isWhitespace(ch)) { 361 continue; 362 } 363 364 ch = Character.toLowerCase(ch); 365 final Character character = FOLDINGS.get(ch); 366 if (folding && character != null) { 367 ch = character; 368 } 369 sb.append(ch); 370 } 371 return sb.toString(); 372 } 373 374 /** 375 * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. 376 * <p> 377 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 378 * EncoderException if the supplied object is not of type java.lang.String. 379 * </p> 380 * 381 * @see #soundex(String) 382 * 383 * @param obj 384 * Object to encode 385 * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String 386 * supplied. 387 * @throws EncoderException 388 * if the parameter supplied is not of type java.lang.String 389 * @throws IllegalArgumentException 390 * if a character is not mapped 391 */ 392 @Override 393 public Object encode(final Object obj) throws EncoderException { 394 if (!(obj instanceof String)) { 395 throw new EncoderException( 396 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); 397 } 398 return encode((String) obj); 399 } 400 401 /** 402 * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. 403 * 404 * @see #soundex(String) 405 * 406 * @param source 407 * A String object to encode 408 * @return A DM Soundex code corresponding to the String supplied 409 * @throws IllegalArgumentException 410 * if a character is not mapped 411 */ 412 @Override 413 public String encode(final String source) { 414 if (source == null) { 415 return null; 416 } 417 return soundex(source, false)[0]; 418 } 419 420 /** 421 * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. 422 * <p> 423 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, 424 * separated by '|'. 425 * </p> 426 * <p> 427 * Example: the name "AUERBACH" is encoded as both 428 * </p> 429 * <ul> 430 * <li>097400</li> 431 * <li>097500</li> 432 * </ul> 433 * <p> 434 * Thus the result will be "097400|097500". 435 * </p> 436 * 437 * @param source 438 * A String object to encode 439 * @return A string containing a set of DM Soundex codes corresponding to the String supplied 440 * @throws IllegalArgumentException 441 * if a character is not mapped 442 */ 443 public String soundex(final String source) { 444 final String[] branches = soundex(source, true); 445 final StringBuilder sb = new StringBuilder(); 446 int index = 0; 447 for (final String branch : branches) { 448 sb.append(branch); 449 if (++index < branches.length) { 450 sb.append('|'); 451 } 452 } 453 return sb.toString(); 454 } 455 456 /** 457 * Perform the actual DM Soundex algorithm on the input string. 458 * 459 * @param source 460 * A String object to encode 461 * @param branching 462 * If branching shall be performed 463 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the 464 * selected branching mode 465 */ 466 private String[] soundex(final String source, final boolean branching) { 467 if (source == null) { 468 return null; 469 } 470 471 final String input = cleanup(source); 472 473 final Set<Branch> currentBranches = new LinkedHashSet<>(); 474 currentBranches.add(new Branch()); 475 476 char lastChar = '\0'; 477 for (int index = 0; index < input.length(); index++) { 478 final char ch = input.charAt(index); 479 480 // ignore whitespace inside a name 481 if (Character.isWhitespace(ch)) { 482 continue; 483 } 484 485 final String inputContext = input.substring(index); 486 final List<Rule> rules = RULES.get(ch); 487 if (rules == null) { 488 continue; 489 } 490 491 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access 492 final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList(); 493 494 for (final Rule rule : rules) { 495 if (rule.matches(inputContext)) { 496 if (branching) { 497 nextBranches.clear(); 498 } 499 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); 500 final boolean branchingRequired = replacements.length > 1 && branching; 501 502 for (final Branch branch : currentBranches) { 503 for (final String nextReplacement : replacements) { 504 // if we have multiple replacements, always create a new branch 505 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; 506 507 // special rule: occurrences of mn or nm are treated differently 508 final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); 509 510 nextBranch.processNextReplacement(nextReplacement, force); 511 512 if (!branching) { 513 break; 514 } 515 nextBranches.add(nextBranch); 516 } 517 } 518 519 if (branching) { 520 currentBranches.clear(); 521 currentBranches.addAll(nextBranches); 522 } 523 index += rule.getPatternLength() - 1; 524 break; 525 } 526 } 527 528 lastChar = ch; 529 } 530 531 final String[] result = new String[currentBranches.size()]; 532 int index = 0; 533 for (final Branch branch : currentBranches) { 534 branch.finish(); 535 result[index++] = branch.toString(); 536 } 537 538 return result; 539 } 540}