001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.io.InputStream;
020import java.util.ArrayList;
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Comparator;
024import java.util.HashMap;
025import java.util.LinkedHashSet;
026import java.util.List;
027import java.util.Map;
028import java.util.Scanner;
029import java.util.Set;
030
031import org.apache.commons.codec.CharEncoding;
032import org.apache.commons.codec.EncoderException;
033import org.apache.commons.codec.StringEncoder;
034
035/**
036 * Encodes a string into a Daitch-Mokotoff Soundex value.
037 * <p>
038 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
039 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
040 * </p>
041 * <p>
042 * The main differences compared to the other soundex variants are:
043 * </p>
044 * <ul>
045 * <li>coded names are 6 digits long
046 * <li>the initial character of the name is coded
047 * <li>rules to encoded multi-character n-grams
048 * <li>multiple possible encodings for the same name (branching)
049 * </ul>
050 * <p>
051 * This implementation supports branching, depending on the used method:
052 * <ul>
053 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
054 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
055 * </ul>
056 * <p>
057 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
058 * rules can be customized by overriding the default rules contained in the resource file
059 * {@code org/apache/commons/codec/language/dmrules.txt}.
060 * </p>
061 * <p>
062 * This class is thread-safe.
063 * </p>
064 *
065 * @see Soundex
066 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
067 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
068 *
069 * @version $Id: DaitchMokotoffSoundex.java 1760691 2016-09-14 12:14:26Z jochen $
070 * @since 1.10
071 */
072public class DaitchMokotoffSoundex implements StringEncoder {
073
074    /**
075     * Inner class representing a branch during DM soundex encoding.
076     */
077    private static final class Branch {
078        private final StringBuilder builder;
079        private String cachedString;
080        private String lastReplacement;
081
082        private Branch() {
083            builder = new StringBuilder();
084            lastReplacement = null;
085            cachedString = null;
086        }
087
088        /**
089         * Creates a new branch, identical to this branch.
090         *
091         * @return a new, identical branch
092         */
093        public Branch createBranch() {
094            final Branch branch = new Branch();
095            branch.builder.append(toString());
096            branch.lastReplacement = this.lastReplacement;
097            return branch;
098        }
099
100        @Override
101        public boolean equals(final Object other) {
102            if (this == other) {
103                return true;
104            }
105            if (!(other instanceof Branch)) {
106                return false;
107            }
108
109            return toString().equals(((Branch) other).toString());
110        }
111
112        /**
113         * Finish this branch by appending '0's until the maximum code length has been reached.
114         */
115        public void finish() {
116            while (builder.length() < MAX_LENGTH) {
117                builder.append('0');
118                cachedString = null;
119            }
120        }
121
122        @Override
123        public int hashCode() {
124            return toString().hashCode();
125        }
126
127        /**
128         * Process the next replacement to be added to this branch.
129         *
130         * @param replacement
131         *            the next replacement to append
132         * @param forceAppend
133         *            indicates if the default processing shall be overridden
134         */
135        public void processNextReplacement(final String replacement, final boolean forceAppend) {
136            final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
137
138            if (append && builder.length() < MAX_LENGTH) {
139                builder.append(replacement);
140                // remove all characters after the maximum length
141                if (builder.length() > MAX_LENGTH) {
142                    builder.delete(MAX_LENGTH, builder.length());
143                }
144                cachedString = null;
145            }
146
147            lastReplacement = replacement;
148        }
149
150        @Override
151        public String toString() {
152            if (cachedString == null) {
153                cachedString = builder.toString();
154            }
155            return cachedString;
156        }
157    }
158
159    /**
160     * Inner class for storing rules.
161     */
162    private static final class Rule {
163        private final String pattern;
164        private final String[] replacementAtStart;
165        private final String[] replacementBeforeVowel;
166        private final String[] replacementDefault;
167
168        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
169                final String replacementDefault) {
170            this.pattern = pattern;
171            this.replacementAtStart = replacementAtStart.split("\\|");
172            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
173            this.replacementDefault = replacementDefault.split("\\|");
174        }
175
176        public int getPatternLength() {
177            return pattern.length();
178        }
179
180        public String[] getReplacements(final String context, final boolean atStart) {
181            if (atStart) {
182                return replacementAtStart;
183            }
184
185            final int nextIndex = getPatternLength();
186            final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
187            if (nextCharIsVowel) {
188                return replacementBeforeVowel;
189            }
190
191            return replacementDefault;
192        }
193
194        private boolean isVowel(final char ch) {
195            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
196        }
197
198        public boolean matches(final String context) {
199            return context.startsWith(pattern);
200        }
201
202        @Override
203        public String toString() {
204            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
205                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
206        }
207    }
208
209    private static final String COMMENT = "//";
210    private static final String DOUBLE_QUOTE = "\"";
211
212    private static final String MULTILINE_COMMENT_END = "*/";
213
214    private static final String MULTILINE_COMMENT_START = "/*";
215
216    /** The resource file containing the replacement and folding rules */
217    private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
218
219    /** The code length of a DM soundex value. */
220    private static final int MAX_LENGTH = 6;
221
222    /** Transformation rules indexed by the first character of their pattern. */
223    private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
224
225    /** Folding rules. */
226    private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
227
228    static {
229        final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
230        if (rulesIS == null) {
231            throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
232        }
233
234        final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
235        try {
236            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
237        } finally {
238            scanner.close();
239        }
240
241        // sort RULES by pattern length in descending order
242        for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
243            final List<Rule> ruleList = rule.getValue();
244            Collections.sort(ruleList, new Comparator<Rule>() {
245                @Override
246                public int compare(final Rule rule1, final Rule rule2) {
247                    return rule2.getPatternLength() - rule1.getPatternLength();
248                }
249            });
250        }
251    }
252
253    private static void parseRules(final Scanner scanner, final String location,
254            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
255        int currentLine = 0;
256        boolean inMultilineComment = false;
257
258        while (scanner.hasNextLine()) {
259            currentLine++;
260            final String rawLine = scanner.nextLine();
261            String line = rawLine;
262
263            if (inMultilineComment) {
264                if (line.endsWith(MULTILINE_COMMENT_END)) {
265                    inMultilineComment = false;
266                }
267                continue;
268            }
269
270            if (line.startsWith(MULTILINE_COMMENT_START)) {
271                inMultilineComment = true;
272            } else {
273                // discard comments
274                final int cmtI = line.indexOf(COMMENT);
275                if (cmtI >= 0) {
276                    line = line.substring(0, cmtI);
277                }
278
279                // trim leading-trailing whitespace
280                line = line.trim();
281
282                if (line.length() == 0) {
283                    continue; // empty lines can be safely skipped
284                }
285
286                if (line.contains("=")) {
287                    // folding
288                    final String[] parts = line.split("=");
289                    if (parts.length != 2) {
290                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
291                                " parts: " + rawLine + " in " + location);
292                    }
293                    final String leftCharacter = parts[0];
294                    final String rightCharacter = parts[1];
295
296                    if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
297                        throw new IllegalArgumentException("Malformed folding statement - " +
298                                "patterns are not single characters: " + rawLine + " in " + location);
299                    }
300
301                    asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
302                } else {
303                    // rule
304                    final String[] parts = line.split("\\s+");
305                    if (parts.length != 4) {
306                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
307                                " parts: " + rawLine + " in " + location);
308                    }
309                    try {
310                        final String pattern = stripQuotes(parts[0]);
311                        final String replacement1 = stripQuotes(parts[1]);
312                        final String replacement2 = stripQuotes(parts[2]);
313                        final String replacement3 = stripQuotes(parts[3]);
314
315                        final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
316                        final char patternKey = r.pattern.charAt(0);
317                        List<Rule> rules = ruleMapping.get(patternKey);
318                        if (rules == null) {
319                            rules = new ArrayList<Rule>();
320                            ruleMapping.put(patternKey, rules);
321                        }
322                        rules.add(r);
323                    } catch (final IllegalArgumentException e) {
324                        throw new IllegalStateException(
325                                "Problem parsing line '" + currentLine + "' in " + location, e);
326                    }
327                }
328            }
329        }
330    }
331
332    private static String stripQuotes(String str) {
333        if (str.startsWith(DOUBLE_QUOTE)) {
334            str = str.substring(1);
335        }
336
337        if (str.endsWith(DOUBLE_QUOTE)) {
338            str = str.substring(0, str.length() - 1);
339        }
340
341        return str;
342    }
343
344    /** Whether to use ASCII folding prior to encoding. */
345    private final boolean folding;
346
347    /**
348     * Creates a new instance with ASCII-folding enabled.
349     */
350    public DaitchMokotoffSoundex() {
351        this(true);
352    }
353
354    /**
355     * Creates a new instance.
356     * <p>
357     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
358     * รจ -&gt; e.
359     * </p>
360     *
361     * @param folding
362     *            if ASCII-folding shall be performed before encoding
363     */
364    public DaitchMokotoffSoundex(final boolean folding) {
365        this.folding = folding;
366    }
367
368    /**
369     * Performs a cleanup of the input string before the actual soundex transformation.
370     * <p>
371     * Removes all whitespace characters and performs ASCII folding if enabled.
372     * </p>
373     *
374     * @param input
375     *            the input string to cleanup
376     * @return a cleaned up string
377     */
378    private String cleanup(final String input) {
379        final StringBuilder sb = new StringBuilder();
380        for (char ch : input.toCharArray()) {
381            if (Character.isWhitespace(ch)) {
382                continue;
383            }
384
385            ch = Character.toLowerCase(ch);
386            if (folding && FOLDINGS.containsKey(ch)) {
387                ch = FOLDINGS.get(ch);
388            }
389            sb.append(ch);
390        }
391        return sb.toString();
392    }
393
394    /**
395     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
396     * <p>
397     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
398     * EncoderException if the supplied object is not of type java.lang.String.
399     * </p>
400     *
401     * @see #soundex(String)
402     *
403     * @param obj
404     *            Object to encode
405     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
406     *         supplied.
407     * @throws EncoderException
408     *             if the parameter supplied is not of type java.lang.String
409     * @throws IllegalArgumentException
410     *             if a character is not mapped
411     */
412    @Override
413    public Object encode(final Object obj) throws EncoderException {
414        if (!(obj instanceof String)) {
415            throw new EncoderException(
416                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
417        }
418        return encode((String) obj);
419    }
420
421    /**
422     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
423     *
424     * @see #soundex(String)
425     *
426     * @param source
427     *            A String object to encode
428     * @return A DM Soundex code corresponding to the String supplied
429     * @throws IllegalArgumentException
430     *             if a character is not mapped
431     */
432    @Override
433    public String encode(final String source) {
434        if (source == null) {
435            return null;
436        }
437        return soundex(source, false)[0];
438    }
439
440    /**
441     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
442     * <p>
443     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
444     * separated by '|'.
445     * </p>
446     * <p>
447     * Example: the name "AUERBACH" is encoded as both
448     * </p>
449     * <ul>
450     * <li>097400</li>
451     * <li>097500</li>
452     * </ul>
453     * <p>
454     * Thus the result will be "097400|097500".
455     * </p>
456     *
457     * @param source
458     *            A String object to encode
459     * @return A string containing a set of DM Soundex codes corresponding to the String supplied
460     * @throws IllegalArgumentException
461     *             if a character is not mapped
462     */
463    public String soundex(final String source) {
464        final String[] branches = soundex(source, true);
465        final StringBuilder sb = new StringBuilder();
466        int index = 0;
467        for (final String branch : branches) {
468            sb.append(branch);
469            if (++index < branches.length) {
470                sb.append('|');
471            }
472        }
473        return sb.toString();
474    }
475
476    /**
477     * Perform the actual DM Soundex algorithm on the input string.
478     *
479     * @param source
480     *            A String object to encode
481     * @param branching
482     *            If branching shall be performed
483     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
484     *         selected branching mode
485     */
486    private String[] soundex(final String source, final boolean branching) {
487        if (source == null) {
488            return null;
489        }
490
491        final String input = cleanup(source);
492
493        final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
494        currentBranches.add(new Branch());
495
496        char lastChar = '\0';
497        for (int index = 0; index < input.length(); index++) {
498            final char ch = input.charAt(index);
499
500            // ignore whitespace inside a name
501            if (Character.isWhitespace(ch)) {
502                continue;
503            }
504
505            final String inputContext = input.substring(index);
506            final List<Rule> rules = RULES.get(ch);
507            if (rules == null) {
508                continue;
509            }
510
511            // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
512            @SuppressWarnings("unchecked")
513            final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
514
515            for (final Rule rule : rules) {
516                if (rule.matches(inputContext)) {
517                    if (branching) {
518                        nextBranches.clear();
519                    }
520                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
521                    final boolean branchingRequired = replacements.length > 1 && branching;
522
523                    for (final Branch branch : currentBranches) {
524                        for (final String nextReplacement : replacements) {
525                            // if we have multiple replacements, always create a new branch
526                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
527
528                            // special rule: occurrences of mn or nm are treated differently
529                            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
530
531                            nextBranch.processNextReplacement(nextReplacement, force);
532
533                            if (branching) {
534                                nextBranches.add(nextBranch);
535                            } else {
536                                break;
537                            }
538                        }
539                    }
540
541                    if (branching) {
542                        currentBranches.clear();
543                        currentBranches.addAll(nextBranches);
544                    }
545                    index += rule.getPatternLength() - 1;
546                    break;
547                }
548            }
549
550            lastChar = ch;
551        }
552
553        final String[] result = new String[currentBranches.size()];
554        int index = 0;
555        for (final Branch branch : currentBranches) {
556            branch.finish();
557            result[index++] = branch.toString();
558        }
559
560        return result;
561    }
562}