001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.io.InputStream; 020import java.util.ArrayList; 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Comparator; 024import java.util.HashMap; 025import java.util.LinkedHashSet; 026import java.util.List; 027import java.util.Map; 028import java.util.Scanner; 029import java.util.Set; 030 031import org.apache.commons.codec.CharEncoding; 032import org.apache.commons.codec.EncoderException; 033import org.apache.commons.codec.StringEncoder; 034 035/** 036 * Encodes a string into a Daitch-Mokotoff Soundex value. 037 * <p> 038 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater 039 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. 040 * </p> 041 * <p> 042 * The main differences compared to the other soundex variants are: 043 * </p> 044 * <ul> 045 * <li>coded names are 6 digits long 046 * <li>the initial character of the name is coded 047 * <li>rules to encoded multi-character n-grams 048 * <li>multiple possible encodings for the same name (branching) 049 * </ul> 050 * <p> 051 * This implementation supports branching, depending on the used method: 052 * <ul> 053 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned 054 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' 055 * </ul> 056 * <p> 057 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The 058 * rules can be customized by overriding the default rules contained in the resource file 059 * {@code org/apache/commons/codec/language/dmrules.txt}. 060 * </p> 061 * <p> 062 * This class is thread-safe. 063 * </p> 064 * 065 * @see Soundex 066 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> 067 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> 068 * 069 * @version $Id: DaitchMokotoffSoundex.java 1760691 2016-09-14 12:14:26Z jochen $ 070 * @since 1.10 071 */ 072public class DaitchMokotoffSoundex implements StringEncoder { 073 074 /** 075 * Inner class representing a branch during DM soundex encoding. 076 */ 077 private static final class Branch { 078 private final StringBuilder builder; 079 private String cachedString; 080 private String lastReplacement; 081 082 private Branch() { 083 builder = new StringBuilder(); 084 lastReplacement = null; 085 cachedString = null; 086 } 087 088 /** 089 * Creates a new branch, identical to this branch. 090 * 091 * @return a new, identical branch 092 */ 093 public Branch createBranch() { 094 final Branch branch = new Branch(); 095 branch.builder.append(toString()); 096 branch.lastReplacement = this.lastReplacement; 097 return branch; 098 } 099 100 @Override 101 public boolean equals(final Object other) { 102 if (this == other) { 103 return true; 104 } 105 if (!(other instanceof Branch)) { 106 return false; 107 } 108 109 return toString().equals(((Branch) other).toString()); 110 } 111 112 /** 113 * Finish this branch by appending '0's until the maximum code length has been reached. 114 */ 115 public void finish() { 116 while (builder.length() < MAX_LENGTH) { 117 builder.append('0'); 118 cachedString = null; 119 } 120 } 121 122 @Override 123 public int hashCode() { 124 return toString().hashCode(); 125 } 126 127 /** 128 * Process the next replacement to be added to this branch. 129 * 130 * @param replacement 131 * the next replacement to append 132 * @param forceAppend 133 * indicates if the default processing shall be overridden 134 */ 135 public void processNextReplacement(final String replacement, final boolean forceAppend) { 136 final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; 137 138 if (append && builder.length() < MAX_LENGTH) { 139 builder.append(replacement); 140 // remove all characters after the maximum length 141 if (builder.length() > MAX_LENGTH) { 142 builder.delete(MAX_LENGTH, builder.length()); 143 } 144 cachedString = null; 145 } 146 147 lastReplacement = replacement; 148 } 149 150 @Override 151 public String toString() { 152 if (cachedString == null) { 153 cachedString = builder.toString(); 154 } 155 return cachedString; 156 } 157 } 158 159 /** 160 * Inner class for storing rules. 161 */ 162 private static final class Rule { 163 private final String pattern; 164 private final String[] replacementAtStart; 165 private final String[] replacementBeforeVowel; 166 private final String[] replacementDefault; 167 168 protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, 169 final String replacementDefault) { 170 this.pattern = pattern; 171 this.replacementAtStart = replacementAtStart.split("\\|"); 172 this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); 173 this.replacementDefault = replacementDefault.split("\\|"); 174 } 175 176 public int getPatternLength() { 177 return pattern.length(); 178 } 179 180 public String[] getReplacements(final String context, final boolean atStart) { 181 if (atStart) { 182 return replacementAtStart; 183 } 184 185 final int nextIndex = getPatternLength(); 186 final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; 187 if (nextCharIsVowel) { 188 return replacementBeforeVowel; 189 } 190 191 return replacementDefault; 192 } 193 194 private boolean isVowel(final char ch) { 195 return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; 196 } 197 198 public boolean matches(final String context) { 199 return context.startsWith(pattern); 200 } 201 202 @Override 203 public String toString() { 204 return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), 205 Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); 206 } 207 } 208 209 private static final String COMMENT = "//"; 210 private static final String DOUBLE_QUOTE = "\""; 211 212 private static final String MULTILINE_COMMENT_END = "*/"; 213 214 private static final String MULTILINE_COMMENT_START = "/*"; 215 216 /** The resource file containing the replacement and folding rules */ 217 private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt"; 218 219 /** The code length of a DM soundex value. */ 220 private static final int MAX_LENGTH = 6; 221 222 /** Transformation rules indexed by the first character of their pattern. */ 223 private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>(); 224 225 /** Folding rules. */ 226 private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>(); 227 228 static { 229 final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE); 230 if (rulesIS == null) { 231 throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE); 232 } 233 234 final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8); 235 try { 236 parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); 237 } finally { 238 scanner.close(); 239 } 240 241 // sort RULES by pattern length in descending order 242 for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { 243 final List<Rule> ruleList = rule.getValue(); 244 Collections.sort(ruleList, new Comparator<Rule>() { 245 @Override 246 public int compare(final Rule rule1, final Rule rule2) { 247 return rule2.getPatternLength() - rule1.getPatternLength(); 248 } 249 }); 250 } 251 } 252 253 private static void parseRules(final Scanner scanner, final String location, 254 final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { 255 int currentLine = 0; 256 boolean inMultilineComment = false; 257 258 while (scanner.hasNextLine()) { 259 currentLine++; 260 final String rawLine = scanner.nextLine(); 261 String line = rawLine; 262 263 if (inMultilineComment) { 264 if (line.endsWith(MULTILINE_COMMENT_END)) { 265 inMultilineComment = false; 266 } 267 continue; 268 } 269 270 if (line.startsWith(MULTILINE_COMMENT_START)) { 271 inMultilineComment = true; 272 } else { 273 // discard comments 274 final int cmtI = line.indexOf(COMMENT); 275 if (cmtI >= 0) { 276 line = line.substring(0, cmtI); 277 } 278 279 // trim leading-trailing whitespace 280 line = line.trim(); 281 282 if (line.length() == 0) { 283 continue; // empty lines can be safely skipped 284 } 285 286 if (line.contains("=")) { 287 // folding 288 final String[] parts = line.split("="); 289 if (parts.length != 2) { 290 throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + 291 " parts: " + rawLine + " in " + location); 292 } 293 final String leftCharacter = parts[0]; 294 final String rightCharacter = parts[1]; 295 296 if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { 297 throw new IllegalArgumentException("Malformed folding statement - " + 298 "patterns are not single characters: " + rawLine + " in " + location); 299 } 300 301 asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); 302 } else { 303 // rule 304 final String[] parts = line.split("\\s+"); 305 if (parts.length != 4) { 306 throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + 307 " parts: " + rawLine + " in " + location); 308 } 309 try { 310 final String pattern = stripQuotes(parts[0]); 311 final String replacement1 = stripQuotes(parts[1]); 312 final String replacement2 = stripQuotes(parts[2]); 313 final String replacement3 = stripQuotes(parts[3]); 314 315 final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); 316 final char patternKey = r.pattern.charAt(0); 317 List<Rule> rules = ruleMapping.get(patternKey); 318 if (rules == null) { 319 rules = new ArrayList<Rule>(); 320 ruleMapping.put(patternKey, rules); 321 } 322 rules.add(r); 323 } catch (final IllegalArgumentException e) { 324 throw new IllegalStateException( 325 "Problem parsing line '" + currentLine + "' in " + location, e); 326 } 327 } 328 } 329 } 330 } 331 332 private static String stripQuotes(String str) { 333 if (str.startsWith(DOUBLE_QUOTE)) { 334 str = str.substring(1); 335 } 336 337 if (str.endsWith(DOUBLE_QUOTE)) { 338 str = str.substring(0, str.length() - 1); 339 } 340 341 return str; 342 } 343 344 /** Whether to use ASCII folding prior to encoding. */ 345 private final boolean folding; 346 347 /** 348 * Creates a new instance with ASCII-folding enabled. 349 */ 350 public DaitchMokotoffSoundex() { 351 this(true); 352 } 353 354 /** 355 * Creates a new instance. 356 * <p> 357 * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. 358 * รจ -> e. 359 * </p> 360 * 361 * @param folding 362 * if ASCII-folding shall be performed before encoding 363 */ 364 public DaitchMokotoffSoundex(final boolean folding) { 365 this.folding = folding; 366 } 367 368 /** 369 * Performs a cleanup of the input string before the actual soundex transformation. 370 * <p> 371 * Removes all whitespace characters and performs ASCII folding if enabled. 372 * </p> 373 * 374 * @param input 375 * the input string to cleanup 376 * @return a cleaned up string 377 */ 378 private String cleanup(final String input) { 379 final StringBuilder sb = new StringBuilder(); 380 for (char ch : input.toCharArray()) { 381 if (Character.isWhitespace(ch)) { 382 continue; 383 } 384 385 ch = Character.toLowerCase(ch); 386 if (folding && FOLDINGS.containsKey(ch)) { 387 ch = FOLDINGS.get(ch); 388 } 389 sb.append(ch); 390 } 391 return sb.toString(); 392 } 393 394 /** 395 * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. 396 * <p> 397 * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 398 * EncoderException if the supplied object is not of type java.lang.String. 399 * </p> 400 * 401 * @see #soundex(String) 402 * 403 * @param obj 404 * Object to encode 405 * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String 406 * supplied. 407 * @throws EncoderException 408 * if the parameter supplied is not of type java.lang.String 409 * @throws IllegalArgumentException 410 * if a character is not mapped 411 */ 412 @Override 413 public Object encode(final Object obj) throws EncoderException { 414 if (!(obj instanceof String)) { 415 throw new EncoderException( 416 "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); 417 } 418 return encode((String) obj); 419 } 420 421 /** 422 * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. 423 * 424 * @see #soundex(String) 425 * 426 * @param source 427 * A String object to encode 428 * @return A DM Soundex code corresponding to the String supplied 429 * @throws IllegalArgumentException 430 * if a character is not mapped 431 */ 432 @Override 433 public String encode(final String source) { 434 if (source == null) { 435 return null; 436 } 437 return soundex(source, false)[0]; 438 } 439 440 /** 441 * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. 442 * <p> 443 * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, 444 * separated by '|'. 445 * </p> 446 * <p> 447 * Example: the name "AUERBACH" is encoded as both 448 * </p> 449 * <ul> 450 * <li>097400</li> 451 * <li>097500</li> 452 * </ul> 453 * <p> 454 * Thus the result will be "097400|097500". 455 * </p> 456 * 457 * @param source 458 * A String object to encode 459 * @return A string containing a set of DM Soundex codes corresponding to the String supplied 460 * @throws IllegalArgumentException 461 * if a character is not mapped 462 */ 463 public String soundex(final String source) { 464 final String[] branches = soundex(source, true); 465 final StringBuilder sb = new StringBuilder(); 466 int index = 0; 467 for (final String branch : branches) { 468 sb.append(branch); 469 if (++index < branches.length) { 470 sb.append('|'); 471 } 472 } 473 return sb.toString(); 474 } 475 476 /** 477 * Perform the actual DM Soundex algorithm on the input string. 478 * 479 * @param source 480 * A String object to encode 481 * @param branching 482 * If branching shall be performed 483 * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the 484 * selected branching mode 485 */ 486 private String[] soundex(final String source, final boolean branching) { 487 if (source == null) { 488 return null; 489 } 490 491 final String input = cleanup(source); 492 493 final Set<Branch> currentBranches = new LinkedHashSet<Branch>(); 494 currentBranches.add(new Branch()); 495 496 char lastChar = '\0'; 497 for (int index = 0; index < input.length(); index++) { 498 final char ch = input.charAt(index); 499 500 // ignore whitespace inside a name 501 if (Character.isWhitespace(ch)) { 502 continue; 503 } 504 505 final String inputContext = input.substring(index); 506 final List<Rule> rules = RULES.get(ch); 507 if (rules == null) { 508 continue; 509 } 510 511 // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access 512 @SuppressWarnings("unchecked") 513 final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST; 514 515 for (final Rule rule : rules) { 516 if (rule.matches(inputContext)) { 517 if (branching) { 518 nextBranches.clear(); 519 } 520 final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); 521 final boolean branchingRequired = replacements.length > 1 && branching; 522 523 for (final Branch branch : currentBranches) { 524 for (final String nextReplacement : replacements) { 525 // if we have multiple replacements, always create a new branch 526 final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; 527 528 // special rule: occurrences of mn or nm are treated differently 529 final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); 530 531 nextBranch.processNextReplacement(nextReplacement, force); 532 533 if (branching) { 534 nextBranches.add(nextBranch); 535 } else { 536 break; 537 } 538 } 539 } 540 541 if (branching) { 542 currentBranches.clear(); 543 currentBranches.addAll(nextBranches); 544 } 545 index += rule.getPatternLength() - 1; 546 break; 547 } 548 } 549 550 lastChar = ch; 551 } 552 553 final String[] result = new String[currentBranches.size()]; 554 int index = 0; 555 for (final Branch branch : currentBranches) { 556 branch.finish(); 557 result[index++] = branch.toString(); 558 } 559 560 return result; 561 } 562}