001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.HashMap;
023import java.util.LinkedHashSet;
024import java.util.List;
025import java.util.Map;
026import java.util.Scanner;
027import java.util.Set;
028
029import org.apache.commons.codec.CharEncoding;
030import org.apache.commons.codec.EncoderException;
031import org.apache.commons.codec.Resources;
032import org.apache.commons.codec.StringEncoder;
033
034/**
035 * Encodes a string into a Daitch-Mokotoff Soundex value.
036 * <p>
037 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
038 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
039 * </p>
040 * <p>
041 * The main differences compared to the other soundex variants are:
042 * </p>
043 * <ul>
044 * <li>coded names are 6 digits long
045 * <li>the initial character of the name is coded
046 * <li>rules to encoded multi-character n-grams
047 * <li>multiple possible encodings for the same name (branching)
048 * </ul>
049 * <p>
050 * This implementation supports branching, depending on the used method:
051 * <ul>
052 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
053 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
054 * </ul>
055 * <p>
056 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
057 * rules can be customized by overriding the default rules contained in the resource file
058 * {@code org/apache/commons/codec/language/dmrules.txt}.
059 * </p>
060 * <p>
061 * This class is thread-safe.
062 * </p>
063 *
064 * @see Soundex
065 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
066 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
067 *
068 * @since 1.10
069 */
070public class DaitchMokotoffSoundex implements StringEncoder {
071
072    private static final String COMMENT = "//";
073
074    private static final String DOUBLE_QUOTE = "\"";
075
076    private static final String MULTILINE_COMMENT_END = "*/";
077
078    private static final String MULTILINE_COMMENT_START = "/*";
079
080    /** The resource file containing the replacement and folding rules */
081    private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
082
083    /** The code length of a DM soundex value. */
084    private static final int MAX_LENGTH = 6;
085
086    /** Transformation rules indexed by the first character of their pattern. */
087    private static final Map<Character, List<Rule>> RULES = new HashMap<>();
088
089    /** Folding rules. */
090    private static final Map<Character, Character> FOLDINGS = new HashMap<>();
091
092    /** Whether to use ASCII folding prior to encoding. */
093    private final boolean folding;
094
095    /**
096     * Inner class representing a branch during DM soundex encoding.
097     */
098    private static final class Branch {
099        private final StringBuilder builder;
100        private String cachedString;
101        private String lastReplacement;
102
103        private Branch() {
104            builder = new StringBuilder();
105            lastReplacement = null;
106            cachedString = null;
107        }
108
109        /**
110         * Creates a new branch, identical to this branch.
111         *
112         * @return a new, identical branch
113         */
114        public Branch createBranch() {
115            final Branch branch = new Branch();
116            branch.builder.append(toString());
117            branch.lastReplacement = this.lastReplacement;
118            return branch;
119        }
120
121        @Override
122        public boolean equals(final Object other) {
123            if (this == other) {
124                return true;
125            }
126            if (!(other instanceof Branch)) {
127                return false;
128            }
129
130            return toString().equals(((Branch) other).toString());
131        }
132
133        /**
134         * Finish this branch by appending '0's until the maximum code length has been reached.
135         */
136        public void finish() {
137            while (builder.length() < MAX_LENGTH) {
138                builder.append('0');
139                cachedString = null;
140            }
141        }
142
143        @Override
144        public int hashCode() {
145            return toString().hashCode();
146        }
147
148        /**
149         * Process the next replacement to be added to this branch.
150         *
151         * @param replacement
152         *            the next replacement to append
153         * @param forceAppend
154         *            indicates if the default processing shall be overridden
155         */
156        public void processNextReplacement(final String replacement, final boolean forceAppend) {
157            final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
158
159            if (append && builder.length() < MAX_LENGTH) {
160                builder.append(replacement);
161                // remove all characters after the maximum length
162                if (builder.length() > MAX_LENGTH) {
163                    builder.delete(MAX_LENGTH, builder.length());
164                }
165                cachedString = null;
166            }
167
168            lastReplacement = replacement;
169        }
170
171        @Override
172        public String toString() {
173            if (cachedString == null) {
174                cachedString = builder.toString();
175            }
176            return cachedString;
177        }
178    }
179
180    /**
181     * Inner class for storing rules.
182     */
183    private static final class Rule {
184        private final String pattern;
185        private final String[] replacementAtStart;
186        private final String[] replacementBeforeVowel;
187        private final String[] replacementDefault;
188
189        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
190                final String replacementDefault) {
191            this.pattern = pattern;
192            this.replacementAtStart = replacementAtStart.split("\\|");
193            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
194            this.replacementDefault = replacementDefault.split("\\|");
195        }
196
197        public int getPatternLength() {
198            return pattern.length();
199        }
200
201        public String[] getReplacements(final String context, final boolean atStart) {
202            if (atStart) {
203                return replacementAtStart;
204            }
205
206            final int nextIndex = getPatternLength();
207            final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
208            if (nextCharIsVowel) {
209                return replacementBeforeVowel;
210            }
211
212            return replacementDefault;
213        }
214
215        private boolean isVowel(final char ch) {
216            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
217        }
218
219        public boolean matches(final String context) {
220            return context.startsWith(pattern);
221        }
222
223        @Override
224        public String toString() {
225            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
226                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
227        }
228    }
229
230    static {
231        try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
232            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
233        }
234
235        // sort RULES by pattern length in descending order
236        RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
237    }
238
239    private static void parseRules(final Scanner scanner, final String location,
240            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
241        int currentLine = 0;
242        boolean inMultilineComment = false;
243
244        while (scanner.hasNextLine()) {
245            currentLine++;
246            final String rawLine = scanner.nextLine();
247            String line = rawLine;
248
249            if (inMultilineComment) {
250                if (line.endsWith(MULTILINE_COMMENT_END)) {
251                    inMultilineComment = false;
252                }
253                continue;
254            }
255
256            if (line.startsWith(MULTILINE_COMMENT_START)) {
257                inMultilineComment = true;
258            } else {
259                // discard comments
260                final int cmtI = line.indexOf(COMMENT);
261                if (cmtI >= 0) {
262                    line = line.substring(0, cmtI);
263                }
264
265                // trim leading-trailing whitespace
266                line = line.trim();
267
268                if (line.isEmpty()) {
269                    continue; // empty lines can be safely skipped
270                }
271
272                if (line.contains("=")) {
273                    // folding
274                    final String[] parts = line.split("=");
275                    if (parts.length != 2) {
276                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
277                                " parts: " + rawLine + " in " + location);
278                    }
279                    final String leftCharacter = parts[0];
280                    final String rightCharacter = parts[1];
281
282                    if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
283                        throw new IllegalArgumentException("Malformed folding statement - " +
284                                "patterns are not single characters: " + rawLine + " in " + location);
285                    }
286
287                    asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
288                } else {
289                    // rule
290                    final String[] parts = line.split("\\s+");
291                    if (parts.length != 4) {
292                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
293                                " parts: " + rawLine + " in " + location);
294                    }
295                    try {
296                        final String pattern = stripQuotes(parts[0]);
297                        final String replacement1 = stripQuotes(parts[1]);
298                        final String replacement2 = stripQuotes(parts[2]);
299                        final String replacement3 = stripQuotes(parts[3]);
300
301                        final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
302                        final char patternKey = r.pattern.charAt(0);
303                        final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
304                        rules.add(r);
305                    } catch (final IllegalArgumentException e) {
306                        throw new IllegalStateException(
307                                "Problem parsing line '" + currentLine + "' in " + location, e);
308                    }
309                }
310            }
311        }
312    }
313
314    private static String stripQuotes(String str) {
315        if (str.startsWith(DOUBLE_QUOTE)) {
316            str = str.substring(1);
317        }
318
319        if (str.endsWith(DOUBLE_QUOTE)) {
320            str = str.substring(0, str.length() - 1);
321        }
322
323        return str;
324    }
325
326    /**
327     * Creates a new instance with ASCII-folding enabled.
328     */
329    public DaitchMokotoffSoundex() {
330        this(true);
331    }
332
333    /**
334     * Creates a new instance.
335     * <p>
336     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
337     * รจ -&gt; e.
338     * </p>
339     *
340     * @param folding
341     *            if ASCII-folding shall be performed before encoding
342     */
343    public DaitchMokotoffSoundex(final boolean folding) {
344        this.folding = folding;
345    }
346
347    /**
348     * Performs a cleanup of the input string before the actual soundex transformation.
349     * <p>
350     * Removes all whitespace characters and performs ASCII folding if enabled.
351     * </p>
352     *
353     * @param input
354     *            the input string to clean up
355     * @return a cleaned up string
356     */
357    private String cleanup(final String input) {
358        final StringBuilder sb = new StringBuilder();
359        for (char ch : input.toCharArray()) {
360            if (Character.isWhitespace(ch)) {
361                continue;
362            }
363
364            ch = Character.toLowerCase(ch);
365            final Character character = FOLDINGS.get(ch);
366            if (folding && character != null) {
367                ch = character;
368            }
369            sb.append(ch);
370        }
371        return sb.toString();
372    }
373
374    /**
375     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
376     * <p>
377     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
378     * EncoderException if the supplied object is not of type java.lang.String.
379     * </p>
380     *
381     * @see #soundex(String)
382     *
383     * @param obj
384     *            Object to encode
385     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
386     *         supplied.
387     * @throws EncoderException
388     *             if the parameter supplied is not of type java.lang.String
389     * @throws IllegalArgumentException
390     *             if a character is not mapped
391     */
392    @Override
393    public Object encode(final Object obj) throws EncoderException {
394        if (!(obj instanceof String)) {
395            throw new EncoderException(
396                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
397        }
398        return encode((String) obj);
399    }
400
401    /**
402     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
403     *
404     * @see #soundex(String)
405     *
406     * @param source
407     *            A String object to encode
408     * @return A DM Soundex code corresponding to the String supplied
409     * @throws IllegalArgumentException
410     *             if a character is not mapped
411     */
412    @Override
413    public String encode(final String source) {
414        if (source == null) {
415            return null;
416        }
417        return soundex(source, false)[0];
418    }
419
420    /**
421     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
422     * <p>
423     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
424     * separated by '|'.
425     * </p>
426     * <p>
427     * Example: the name "AUERBACH" is encoded as both
428     * </p>
429     * <ul>
430     * <li>097400</li>
431     * <li>097500</li>
432     * </ul>
433     * <p>
434     * Thus the result will be "097400|097500".
435     * </p>
436     *
437     * @param source
438     *            A String object to encode
439     * @return A string containing a set of DM Soundex codes corresponding to the String supplied
440     * @throws IllegalArgumentException
441     *             if a character is not mapped
442     */
443    public String soundex(final String source) {
444        final String[] branches = soundex(source, true);
445        final StringBuilder sb = new StringBuilder();
446        int index = 0;
447        for (final String branch : branches) {
448            sb.append(branch);
449            if (++index < branches.length) {
450                sb.append('|');
451            }
452        }
453        return sb.toString();
454    }
455
456    /**
457     * Perform the actual DM Soundex algorithm on the input string.
458     *
459     * @param source
460     *            A String object to encode
461     * @param branching
462     *            If branching shall be performed
463     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
464     *         selected branching mode
465     */
466    private String[] soundex(final String source, final boolean branching) {
467        if (source == null) {
468            return null;
469        }
470
471        final String input = cleanup(source);
472
473        final Set<Branch> currentBranches = new LinkedHashSet<>();
474        currentBranches.add(new Branch());
475
476        char lastChar = '\0';
477        for (int index = 0; index < input.length(); index++) {
478            final char ch = input.charAt(index);
479
480            // ignore whitespace inside a name
481            if (Character.isWhitespace(ch)) {
482                continue;
483            }
484
485            final String inputContext = input.substring(index);
486            final List<Rule> rules = RULES.get(ch);
487            if (rules == null) {
488                continue;
489            }
490
491            // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
492            final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
493
494            for (final Rule rule : rules) {
495                if (rule.matches(inputContext)) {
496                    if (branching) {
497                        nextBranches.clear();
498                    }
499                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
500                    final boolean branchingRequired = replacements.length > 1 && branching;
501
502                    for (final Branch branch : currentBranches) {
503                        for (final String nextReplacement : replacements) {
504                            // if we have multiple replacements, always create a new branch
505                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
506
507                            // special rule: occurrences of mn or nm are treated differently
508                            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
509
510                            nextBranch.processNextReplacement(nextReplacement, force);
511
512                            if (!branching) {
513                                break;
514                            }
515                            nextBranches.add(nextBranch);
516                        }
517                    }
518
519                    if (branching) {
520                        currentBranches.clear();
521                        currentBranches.addAll(nextBranches);
522                    }
523                    index += rule.getPatternLength() - 1;
524                    break;
525                }
526            }
527
528            lastChar = ch;
529        }
530
531        final String[] result = new String[currentBranches.size()];
532        int index = 0;
533        for (final Branch branch : currentBranches) {
534            branch.finish();
535            result[index++] = branch.toString();
536        }
537
538        return result;
539    }
540}