001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.Comparator;
023import java.util.HashMap;
024import java.util.LinkedHashSet;
025import java.util.List;
026import java.util.Map;
027import java.util.Scanner;
028import java.util.Set;
029
030import org.apache.commons.codec.CharEncoding;
031import org.apache.commons.codec.EncoderException;
032import org.apache.commons.codec.Resources;
033import org.apache.commons.codec.StringEncoder;
034
035/**
036 * Encodes a string into a Daitch-Mokotoff Soundex value.
037 * <p>
038 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
039 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
040 * </p>
041 * <p>
042 * The main differences compared to the other soundex variants are:
043 * </p>
044 * <ul>
045 * <li>coded names are 6 digits long
046 * <li>the initial character of the name is coded
047 * <li>rules to encoded multi-character n-grams
048 * <li>multiple possible encodings for the same name (branching)
049 * </ul>
050 * <p>
051 * This implementation supports branching, depending on the used method:
052 * <ul>
053 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
054 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
055 * </ul>
056 * <p>
057 * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
058 * rules can be customized by overriding the default rules contained in the resource file
059 * {@code org/apache/commons/codec/language/dmrules.txt}.
060 * </p>
061 * <p>
062 * This class is thread-safe.
063 * </p>
064 *
065 * @see Soundex
066 * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
067 * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
068 *
069 * @since 1.10
070 */
071public class DaitchMokotoffSoundex implements StringEncoder {
072
073    /**
074     * Inner class representing a branch during DM soundex encoding.
075     */
076    private static final class Branch {
077        private final StringBuilder builder;
078        private String cachedString;
079        private String lastReplacement;
080
081        private Branch() {
082            builder = new StringBuilder();
083            lastReplacement = null;
084            cachedString = null;
085        }
086
087        /**
088         * Creates a new branch, identical to this branch.
089         *
090         * @return a new, identical branch
091         */
092        public Branch createBranch() {
093            final Branch branch = new Branch();
094            branch.builder.append(toString());
095            branch.lastReplacement = this.lastReplacement;
096            return branch;
097        }
098
099        @Override
100        public boolean equals(final Object other) {
101            if (this == other) {
102                return true;
103            }
104            if (!(other instanceof Branch)) {
105                return false;
106            }
107
108            return toString().equals(((Branch) other).toString());
109        }
110
111        /**
112         * Finish this branch by appending '0's until the maximum code length has been reached.
113         */
114        public void finish() {
115            while (builder.length() < MAX_LENGTH) {
116                builder.append('0');
117                cachedString = null;
118            }
119        }
120
121        @Override
122        public int hashCode() {
123            return toString().hashCode();
124        }
125
126        /**
127         * Process the next replacement to be added to this branch.
128         *
129         * @param replacement
130         *            the next replacement to append
131         * @param forceAppend
132         *            indicates if the default processing shall be overridden
133         */
134        public void processNextReplacement(final String replacement, final boolean forceAppend) {
135            final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
136
137            if (append && builder.length() < MAX_LENGTH) {
138                builder.append(replacement);
139                // remove all characters after the maximum length
140                if (builder.length() > MAX_LENGTH) {
141                    builder.delete(MAX_LENGTH, builder.length());
142                }
143                cachedString = null;
144            }
145
146            lastReplacement = replacement;
147        }
148
149        @Override
150        public String toString() {
151            if (cachedString == null) {
152                cachedString = builder.toString();
153            }
154            return cachedString;
155        }
156    }
157
158    /**
159     * Inner class for storing rules.
160     */
161    private static final class Rule {
162        private final String pattern;
163        private final String[] replacementAtStart;
164        private final String[] replacementBeforeVowel;
165        private final String[] replacementDefault;
166
167        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
168                final String replacementDefault) {
169            this.pattern = pattern;
170            this.replacementAtStart = replacementAtStart.split("\\|");
171            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
172            this.replacementDefault = replacementDefault.split("\\|");
173        }
174
175        public int getPatternLength() {
176            return pattern.length();
177        }
178
179        public String[] getReplacements(final String context, final boolean atStart) {
180            if (atStart) {
181                return replacementAtStart;
182            }
183
184            final int nextIndex = getPatternLength();
185            final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false;
186            if (nextCharIsVowel) {
187                return replacementBeforeVowel;
188            }
189
190            return replacementDefault;
191        }
192
193        private boolean isVowel(final char ch) {
194            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
195        }
196
197        public boolean matches(final String context) {
198            return context.startsWith(pattern);
199        }
200
201        @Override
202        public String toString() {
203            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
204                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
205        }
206    }
207
208    private static final String COMMENT = "//";
209    private static final String DOUBLE_QUOTE = "\"";
210
211    private static final String MULTILINE_COMMENT_END = "*/";
212
213    private static final String MULTILINE_COMMENT_START = "/*";
214
215    /** The resource file containing the replacement and folding rules */
216    private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
217
218    /** The code length of a DM soundex value. */
219    private static final int MAX_LENGTH = 6;
220
221    /** Transformation rules indexed by the first character of their pattern. */
222    private static final Map<Character, List<Rule>> RULES = new HashMap<>();
223
224    /** Folding rules. */
225    private static final Map<Character, Character> FOLDINGS = new HashMap<>();
226
227    static {
228        try (final Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
229            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
230        }
231
232        // sort RULES by pattern length in descending order
233        for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) {
234            final List<Rule> ruleList = rule.getValue();
235            Collections.sort(ruleList, new Comparator<Rule>() {
236                @Override
237                public int compare(final Rule rule1, final Rule rule2) {
238                    return rule2.getPatternLength() - rule1.getPatternLength();
239                }
240            });
241        }
242    }
243
244    private static void parseRules(final Scanner scanner, final String location,
245            final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
246        int currentLine = 0;
247        boolean inMultilineComment = false;
248
249        while (scanner.hasNextLine()) {
250            currentLine++;
251            final String rawLine = scanner.nextLine();
252            String line = rawLine;
253
254            if (inMultilineComment) {
255                if (line.endsWith(MULTILINE_COMMENT_END)) {
256                    inMultilineComment = false;
257                }
258                continue;
259            }
260
261            if (line.startsWith(MULTILINE_COMMENT_START)) {
262                inMultilineComment = true;
263            } else {
264                // discard comments
265                final int cmtI = line.indexOf(COMMENT);
266                if (cmtI >= 0) {
267                    line = line.substring(0, cmtI);
268                }
269
270                // trim leading-trailing whitespace
271                line = line.trim();
272
273                if (line.length() == 0) {
274                    continue; // empty lines can be safely skipped
275                }
276
277                if (line.contains("=")) {
278                    // folding
279                    final String[] parts = line.split("=");
280                    if (parts.length != 2) {
281                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
282                                " parts: " + rawLine + " in " + location);
283                    }
284                    final String leftCharacter = parts[0];
285                    final String rightCharacter = parts[1];
286
287                    if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
288                        throw new IllegalArgumentException("Malformed folding statement - " +
289                                "patterns are not single characters: " + rawLine + " in " + location);
290                    }
291
292                    asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
293                } else {
294                    // rule
295                    final String[] parts = line.split("\\s+");
296                    if (parts.length != 4) {
297                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
298                                " parts: " + rawLine + " in " + location);
299                    }
300                    try {
301                        final String pattern = stripQuotes(parts[0]);
302                        final String replacement1 = stripQuotes(parts[1]);
303                        final String replacement2 = stripQuotes(parts[2]);
304                        final String replacement3 = stripQuotes(parts[3]);
305
306                        final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
307                        final char patternKey = r.pattern.charAt(0);
308                        List<Rule> rules = ruleMapping.get(patternKey);
309                        if (rules == null) {
310                            rules = new ArrayList<>();
311                            ruleMapping.put(patternKey, rules);
312                        }
313                        rules.add(r);
314                    } catch (final IllegalArgumentException e) {
315                        throw new IllegalStateException(
316                                "Problem parsing line '" + currentLine + "' in " + location, e);
317                    }
318                }
319            }
320        }
321    }
322
323    private static String stripQuotes(String str) {
324        if (str.startsWith(DOUBLE_QUOTE)) {
325            str = str.substring(1);
326        }
327
328        if (str.endsWith(DOUBLE_QUOTE)) {
329            str = str.substring(0, str.length() - 1);
330        }
331
332        return str;
333    }
334
335    /** Whether to use ASCII folding prior to encoding. */
336    private final boolean folding;
337
338    /**
339     * Creates a new instance with ASCII-folding enabled.
340     */
341    public DaitchMokotoffSoundex() {
342        this(true);
343    }
344
345    /**
346     * Creates a new instance.
347     * <p>
348     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
349     * รจ -&gt; e.
350     * </p>
351     *
352     * @param folding
353     *            if ASCII-folding shall be performed before encoding
354     */
355    public DaitchMokotoffSoundex(final boolean folding) {
356        this.folding = folding;
357    }
358
359    /**
360     * Performs a cleanup of the input string before the actual soundex transformation.
361     * <p>
362     * Removes all whitespace characters and performs ASCII folding if enabled.
363     * </p>
364     *
365     * @param input
366     *            the input string to cleanup
367     * @return a cleaned up string
368     */
369    private String cleanup(final String input) {
370        final StringBuilder sb = new StringBuilder();
371        for (char ch : input.toCharArray()) {
372            if (Character.isWhitespace(ch)) {
373                continue;
374            }
375
376            ch = Character.toLowerCase(ch);
377            if (folding && FOLDINGS.containsKey(ch)) {
378                ch = FOLDINGS.get(ch);
379            }
380            sb.append(ch);
381        }
382        return sb.toString();
383    }
384
385    /**
386     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
387     * <p>
388     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
389     * EncoderException if the supplied object is not of type java.lang.String.
390     * </p>
391     *
392     * @see #soundex(String)
393     *
394     * @param obj
395     *            Object to encode
396     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
397     *         supplied.
398     * @throws EncoderException
399     *             if the parameter supplied is not of type java.lang.String
400     * @throws IllegalArgumentException
401     *             if a character is not mapped
402     */
403    @Override
404    public Object encode(final Object obj) throws EncoderException {
405        if (!(obj instanceof String)) {
406            throw new EncoderException(
407                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
408        }
409        return encode((String) obj);
410    }
411
412    /**
413     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
414     *
415     * @see #soundex(String)
416     *
417     * @param source
418     *            A String object to encode
419     * @return A DM Soundex code corresponding to the String supplied
420     * @throws IllegalArgumentException
421     *             if a character is not mapped
422     */
423    @Override
424    public String encode(final String source) {
425        if (source == null) {
426            return null;
427        }
428        return soundex(source, false)[0];
429    }
430
431    /**
432     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
433     * <p>
434     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
435     * separated by '|'.
436     * </p>
437     * <p>
438     * Example: the name "AUERBACH" is encoded as both
439     * </p>
440     * <ul>
441     * <li>097400</li>
442     * <li>097500</li>
443     * </ul>
444     * <p>
445     * Thus the result will be "097400|097500".
446     * </p>
447     *
448     * @param source
449     *            A String object to encode
450     * @return A string containing a set of DM Soundex codes corresponding to the String supplied
451     * @throws IllegalArgumentException
452     *             if a character is not mapped
453     */
454    public String soundex(final String source) {
455        final String[] branches = soundex(source, true);
456        final StringBuilder sb = new StringBuilder();
457        int index = 0;
458        for (final String branch : branches) {
459            sb.append(branch);
460            if (++index < branches.length) {
461                sb.append('|');
462            }
463        }
464        return sb.toString();
465    }
466
467    /**
468     * Perform the actual DM Soundex algorithm on the input string.
469     *
470     * @param source
471     *            A String object to encode
472     * @param branching
473     *            If branching shall be performed
474     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
475     *         selected branching mode
476     */
477    private String[] soundex(final String source, final boolean branching) {
478        if (source == null) {
479            return null;
480        }
481
482        final String input = cleanup(source);
483
484        final Set<Branch> currentBranches = new LinkedHashSet<>();
485        currentBranches.add(new Branch());
486
487        char lastChar = '\0';
488        for (int index = 0; index < input.length(); index++) {
489            final char ch = input.charAt(index);
490
491            // ignore whitespace inside a name
492            if (Character.isWhitespace(ch)) {
493                continue;
494            }
495
496            final String inputContext = input.substring(index);
497            final List<Rule> rules = RULES.get(ch);
498            if (rules == null) {
499                continue;
500            }
501
502            // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
503            final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.<Branch>emptyList();
504
505            for (final Rule rule : rules) {
506                if (rule.matches(inputContext)) {
507                    if (branching) {
508                        nextBranches.clear();
509                    }
510                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
511                    final boolean branchingRequired = replacements.length > 1 && branching;
512
513                    for (final Branch branch : currentBranches) {
514                        for (final String nextReplacement : replacements) {
515                            // if we have multiple replacements, always create a new branch
516                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
517
518                            // special rule: occurrences of mn or nm are treated differently
519                            final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
520
521                            nextBranch.processNextReplacement(nextReplacement, force);
522
523                            if (branching) {
524                                nextBranches.add(nextBranch);
525                            } else {
526                                break;
527                            }
528                        }
529                    }
530
531                    if (branching) {
532                        currentBranches.clear();
533                        currentBranches.addAll(nextBranches);
534                    }
535                    index += rule.getPatternLength() - 1;
536                    break;
537                }
538            }
539
540            lastChar = ch;
541        }
542
543        final String[] result = new String[currentBranches.size()];
544        int index = 0;
545        for (final Branch branch : currentBranches) {
546            branch.finish();
547            result[index++] = branch.toString();
548        }
549
550        return result;
551    }
552}