001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collections; 022import java.util.Comparator; 023import java.util.HashMap; 024import java.util.LinkedHashSet; 025import java.util.List; 026import java.util.Map; 027import java.util.Scanner; 028import java.util.Set; 029 030import org.apache.commons.codec.CharEncoding; 031import org.apache.commons.codec.EncoderException; 032import org.apache.commons.codec.Resources; 033import org.apache.commons.codec.StringEncoder; 034 035/** 036 * Encodes a string into a Daitch-Mokotoff Soundex value. 037 * <p> 038 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater 039 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. 040 * </p> 041 * <p> 042 * The main differences compared to the other soundex variants are: 043 * </p> 044 * <ul> 045 * <li>coded names are 6 digits long 046 * <li>the initial character of the name is coded 047 * <li>rules to encoded multi-character n-grams 048 * <li>multiple possible encodings for the same name (branching) 049 * </ul> 050 * <p> 051 * This implementation supports branching, depending on the used method: 052 * <ul> 053 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned 054 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' 055 * </ul> 056 * <p> 057 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The 058 * rules can be customized by overriding the default rules contained in the resource file 059 * {@code org/apache/commons/codec/language/dmrules.txt}. 060 * </p> 061 * <p> 062 * This class is thread-safe. 063 * </p> 064 * 065 * @see Soundex 066 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> 067 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> 068 * 069 * @since 1.10 070 */ 071public class DaitchMokotoffSoundex implements StringEncoder { 072 073 /** 074 * Inner class representing a branch during DM soundex encoding. 075 */ 076 private static final class Branch { 077 private final StringBuilder builder; 078 private String cachedString; 079 private String lastReplacement; 080 081 private Branch() { 082 builder = new StringBuilder(); 083 lastReplacement = null; 084 cachedString = null; 085 } 086 087 /** 088 * Creates a new branch, identical to this branch. 089 * 090 * @return a new, identical branch 091 */ 092 public Branch createBranch() { 093 final Branch branch = new Branch(); 094 branch.builder.append(toString()); 095 branch.lastReplacement = this.lastReplacement; 096 return branch; 097 } 098 099 @Override 100 public boolean equals(final Object other) { 101 if (this == other) { 102 return true; 103 } 104 if (!(other instanceof Branch)) { 105 return false; 106 } 107 108 return toString().equals(((Branch) other).toString()); 109 } 110 111 /** 112 * Finish this branch by appending '0's until the maximum code length has been reached. 113 */ 114 public void finish() { 115 while (builder.length() < MAX_LENGTH) { 116 builder.append('0'); 117 cachedString = null; 118 } 119 } 120 121 @Override 122 public int hashCode() { 123 return toString().hashCode(); 124 } 125 126 /** 127 * Process the next replacement to be added to this branch. 128 * 129 * @param replacement 130 * the next replacement to append 131 * @param forceAppend 132 * indicates if the default processing shall be overridden 133 */ 134 public void processNextReplacement(final String replacement, final boolean forceAppend) { 135 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; 136 137 if (append && builder.length() < MAX_LENGTH) { 138 builder.append(replacement); 139 // remove all characters after the maximum length 140 if (builder.length() > MAX_LENGTH) { 141 builder.delete(MAX_LENGTH, builder.length()); 142 } 143 cachedString = null; 144 } 145 146 lastReplacement = replacement; 147 } 148 149 @Override 150 public String toString() { 151 if (cachedString == null) { 152 cachedString = builder.toString(); 153 } 154 return cachedString; 155 } 156 } 157 158 /** 159 * Inner class for storing rules. 160 */ 161 private static final class Rule { 162 private final String pattern; 163 private final String[] replacementAtStart; 164 private final String[] replacementBeforeVowel; 165 private final String[] replacementDefault; 166 167 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, 168 final String replacementDefault) { 169 this.pattern = pattern; 170 this.replacementAtStart = replacementAtStart.split("\\|"); 171 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); 172 this.replacementDefault = replacementDefault.split("\\|"); 173 } 174 175 public int getPatternLength() { 176 return pattern.length(); 177 } 178 179 public String[] getReplacements(final String context, final boolean atStart) { 180 if (atStart) { 181 return replacementAtStart; 182 } 183 184 final int nextIndex = getPatternLength(); 185 final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; 186 if (nextCharIsVowel) { 187 return replacementBeforeVowel; 188 } 189 190 return replacementDefault; 191 } 192 193 private boolean isVowel(final char ch) { 194 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; 195 } 196 197 public boolean matches(final String context) { 198 return context.startsWith(pattern); 199 } 200 201 @Override 202 public String toString() { 203 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), 204 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); 205 } 206 } 207 208 private static final String COMMENT = "//"; 209 private static final String DOUBLE_QUOTE = "\""; 210 211 private static final String MULTILINE_COMMENT_END = "*/"; 212 213 private static final String MULTILINE_COMMENT_START = "/*"; 214 215 /** The resource file containing the replacement and folding rules */ 216 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; 217 218 /** The code length of a DM soundex value. */ 219 private static final int MAX_LENGTH = 6; 220 221 /** Transformation rules indexed by the first character of their pattern. */ 222 private static final Map<Character, List<Rule>> RULES = new HashMap<>(); 223 224 /** Folding rules. */ 225 private static final Map<Character, Character> FOLDINGS = new HashMap<>(); 226 227 static { 228 try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) { 229 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); 230 } 231 232 // sort RULES by pattern length in descending order 233 for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { 234 final List<Rule> ruleList = rule.getValue(); 235 Collections.sort(ruleList, new Comparator<Rule>() { 236 @Override 237 public int compare(final Rule rule1, final Rule rule2) { 238 return rule2.getPatternLength() - rule1.getPatternLength(); 239 } 240 }); 241 } 242 } 243 244 private static void parseRules(final Scanner scanner, final String location, 245 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { 246 int currentLine = 0; 247 boolean inMultilineComment = false; 248 249 while (scanner.hasNextLine()) { 250 currentLine++; 251 final String rawLine = scanner.nextLine(); 252 String line = rawLine; 253 254 if (inMultilineComment) { 255 if (line.endsWith(MULTILINE_COMMENT_END)) { 256 inMultilineComment = false; 257 } 258 continue; 259 } 260 261 if (line.startsWith(MULTILINE_COMMENT_START)) { 262 inMultilineComment = true; 263 } else { 264 // discard comments 265 final int cmtI = line.indexOf(COMMENT); 266 if (cmtI >= 0) { 267 line = line.substring(0, cmtI); 268 } 269 270 // trim leading-trailing whitespace 271 line = line.trim(); 272 273 if (line.length() == 0) { 274 continue; // empty lines can be safely skipped 275 } 276 277 if (line.contains("=")) { 278 // folding 279 final String[] parts = line.split("="); 280 if (parts.length != 2) { 281 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + 282 " parts: " + rawLine + " in " + location); 283 } 284 final String leftCharacter = parts[0]; 285 final String rightCharacter = parts[1]; 286 287 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { 288 throw new IllegalArgumentException("Malformed folding statement - " + 289 "patterns are not single characters: " + rawLine + " in " + location); 290 } 291 292 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); 293 } else { 294 // rule 295 final String[] parts = line.split("\\s+"); 296 if (parts.length != 4) { 297 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 298 " parts: " + rawLine + " in " + location); 299 } 300 try { 301 final String pattern = stripQuotes(parts[0]); 302 final String replacement1 = stripQuotes(parts[1]); 303 final String replacement2 = stripQuotes(parts[2]); 304 final String replacement3 = stripQuotes(parts[3]); 305 306 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); 307 final char patternKey = r.pattern.charAt(0); 308 List<Rule> rules = ruleMapping.get(patternKey); 309 if (rules == null) { 310 rules = new ArrayList<>(); 311 ruleMapping.put(patternKey, rules); 312 } 313 rules.add(r); 314 } catch (final IllegalArgumentException e) { 315 throw new IllegalStateException( 316 "Problem parsing line '" + currentLine + "' in " + location, e); 317 } 318 } 319 } 320 } 321 } 322 323 private static String stripQuotes(String str) { 324 if (str.startsWith(DOUBLE_QUOTE)) { 325 str = str.substring(1); 326 } 327 328 if (str.endsWith(DOUBLE_QUOTE)) { 329 str = str.substring(0, str.length() - 1); 330 } 331 332 return str; 333 } 334 335 /** Whether to use ASCII folding prior to encoding. */ 336 private final boolean folding; 337 338 /** 339 * Creates a new instance with ASCII-folding enabled. 340 */ 341 public DaitchMokotoffSoundex() { 342 this(true); 343 } 344 345 /** 346 * Creates a new instance. 347 * <p> 348 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. 349 * รจ -> e. 350 * </p> 351 * 352 * @param folding 353 * if ASCII-folding shall be performed before encoding 354 */ 355 public DaitchMokotoffSoundex(final boolean folding) { 356 this.folding = folding; 357 } 358 359 /** 360 * Performs a cleanup of the input string before the actual soundex transformation. 361 * <p> 362 * Removes all whitespace characters and performs ASCII folding if enabled. 363 * </p> 364 * 365 * @param input 366 * the input string to cleanup 367 * @return a cleaned up string 368 */ 369 private String cleanup(final String input) { 370 final StringBuilder sb = new StringBuilder(); 371 for (char ch : input.toCharArray()) { 372 if (Character.isWhitespace(ch)) { 373 continue; 374 } 375 376 ch = Character.toLowerCase(ch); 377 if (folding && FOLDINGS.containsKey(ch)) { 378 ch = FOLDINGS.get(ch); 379 } 380 sb.append(ch); 381 } 382 return sb.toString(); 383 } 384 385 /** 386 * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. 387 * <p> 388 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 389 * EncoderException if the supplied object is not of type java.lang.String. 390 * </p> 391 * 392 * @see #soundex(String) 393 * 394 * @param obj 395 * Object to encode 396 * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String 397 * supplied. 398 * @throws EncoderException 399 * if the parameter supplied is not of type java.lang.String 400 * @throws IllegalArgumentException 401 * if a character is not mapped 402 */ 403 @Override 404 public Object encode(final Object obj) throws EncoderException { 405 if (!(obj instanceof String)) { 406 throw new EncoderException( 407 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); 408 } 409 return encode((String) obj); 410 } 411 412 /** 413 * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. 414 * 415 * @see #soundex(String) 416 * 417 * @param source 418 * A String object to encode 419 * @return A DM Soundex code corresponding to the String supplied 420 * @throws IllegalArgumentException 421 * if a character is not mapped 422 */ 423 @Override 424 public String encode(final String source) { 425 if (source == null) { 426 return null; 427 } 428 return soundex(source, false)[0]; 429 } 430 431 /** 432 * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. 433 * <p> 434 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, 435 * separated by '|'. 436 * </p> 437 * <p> 438 * Example: the name "AUERBACH" is encoded as both 439 * </p> 440 * <ul> 441 * <li>097400</li> 442 * <li>097500</li> 443 * </ul> 444 * <p> 445 * Thus the result will be "097400|097500". 446 * </p> 447 * 448 * @param source 449 * A String object to encode 450 * @return A string containing a set of DM Soundex codes corresponding to the String supplied 451 * @throws IllegalArgumentException 452 * if a character is not mapped 453 */ 454 public String soundex(final String source) { 455 final String[] branches = soundex(source, true); 456 final StringBuilder sb = new StringBuilder(); 457 int index = 0; 458 for (final String branch : branches) { 459 sb.append(branch); 460 if (++index < branches.length) { 461 sb.append('|'); 462 } 463 } 464 return sb.toString(); 465 } 466 467 /** 468 * Perform the actual DM Soundex algorithm on the input string. 469 * 470 * @param source 471 * A String object to encode 472 * @param branching 473 * If branching shall be performed 474 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the 475 * selected branching mode 476 */ 477 private String[] soundex(final String source, final boolean branching) { 478 if (source == null) { 479 return null; 480 } 481 482 final String input = cleanup(source); 483 484 final Set<Branch> currentBranches = new LinkedHashSet<>(); 485 currentBranches.add(new Branch()); 486 487 char lastChar = '\0'; 488 for (int index = 0; index < input.length(); index++) { 489 final char ch = input.charAt(index); 490 491 // ignore whitespace inside a name 492 if (Character.isWhitespace(ch)) { 493 continue; 494 } 495 496 final String inputContext = input.substring(index); 497 final List<Rule> rules = RULES.get(ch); 498 if (rules == null) { 499 continue; 500 } 501 502 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access 503 final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.<Branch>emptyList(); 504 505 for (final Rule rule : rules) { 506 if (rule.matches(inputContext)) { 507 if (branching) { 508 nextBranches.clear(); 509 } 510 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); 511 final boolean branchingRequired = replacements.length > 1 && branching; 512 513 for (final Branch branch : currentBranches) { 514 for (final String nextReplacement : replacements) { 515 // if we have multiple replacements, always create a new branch 516 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; 517 518 // special rule: occurrences of mn or nm are treated differently 519 final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); 520 521 nextBranch.processNextReplacement(nextReplacement, force); 522 523 if (branching) { 524 nextBranches.add(nextBranch); 525 } else { 526 break; 527 } 528 } 529 } 530 531 if (branching) { 532 currentBranches.clear(); 533 currentBranches.addAll(nextBranches); 534 } 535 index += rule.getPatternLength() - 1; 536 break; 537 } 538 } 539 540 lastChar = ch; 541 } 542 543 final String[] result = new String[currentBranches.size()]; 544 int index = 0; 545 for (final Branch branch : currentBranches) { 546 branch.finish(); 547 result[index++] = branch.toString(); 548 } 549 550 return result; 551 } 552}