Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.codec.language;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.HashMap;
023import java.util.LinkedHashSet;
024import java.util.List;
025import java.util.Map;
026import java.util.Scanner;
027import java.util.Set;
028import java.util.regex.Pattern;
029
030import org.apache.commons.codec.CharEncoding;
031import org.apache.commons.codec.EncoderException;
032import org.apache.commons.codec.Resources;
033import org.apache.commons.codec.StringEncoder;
034
035/**
036 * Encodes a string into a Daitch-Mokotoff Soundex value.
037 * <p>
038 * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater
039 * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.
040 * </p>
041 * <p>
042 * The main differences compared to the other Soundex variants are:
043 * </p>
044 * <ul>
045 * <li>coded names are 6 digits long
046 * <li>the initial character of the name is coded
047 * <li>rules to encoded multi-character n-grams
048 * <li>multiple possible encodings for the same name (branching)
049 * </ul>
050 * <p>
051 * This implementation supports branching, depending on the used method:
052 * <ul>
053 * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
054 * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
055 * </ul>
056 * <p>
057 * Note: This implementation has additional branching rules compared to the original description of the algorithm. The
058 * rules can be customized by overriding the default rules contained in the resource file
059 * {@code org/apache/commons/codec/language/dmrules.txt}.
060 * </p>
061 * <p>
062 * This class is thread-safe.
063 * </p>
064 *
065 * @see Soundex
066 * @see <a href="https://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
067 * @see <a href="https://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
068 * @since 1.10
069 */
070public class DaitchMokotoffSoundex implements StringEncoder {
071
072    /**
073     * Inner class representing a branch during DM Soundex encoding.
074     */
075    private static final class Branch {
076        private final StringBuilder builder;
077        private String cachedString;
078        private String lastReplacement;
079
080        private Branch() {
081            builder = new StringBuilder();
082        }
083
084        /**
085         * Creates a new branch, identical to this branch.
086         *
087         * @return a new, identical branch
088         */
089        private Branch createBranch() {
090            final Branch branch = new Branch();
091            branch.builder.append(toString());
092            branch.lastReplacement = this.lastReplacement;
093            return branch;
094        }
095
096        @Override
097        public boolean equals(final Object other) {
098            if (this == other) {
099                return true;
100            }
101            if (!(other instanceof Branch)) {
102                return false;
103            }
104            return toString().equals(((Branch) other).toString());
105        }
106
107        /**
108         * Finish this branch by appending '0's until the maximum code length has been reached.
109         */
110        private void finish() {
111            while (builder.length() < MAX_LENGTH) {
112                builder.append('0');
113                cachedString = null;
114            }
115        }
116
117        @Override
118        public int hashCode() {
119            return toString().hashCode();
120        }
121
122        /**
123         * Process the next replacement to be added to this branch.
124         *
125         * @param replacement
126         *            the next replacement to append.
127         * @param forceAppend
128         *            indicates if the default processing shall be overridden.
129         */
130        private void processNextReplacement(final String replacement, final boolean forceAppend) {
131            final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
132            if (append && builder.length() < MAX_LENGTH) {
133                builder.append(replacement);
134                // remove all characters after the maximum length
135                if (builder.length() > MAX_LENGTH) {
136                    builder.delete(MAX_LENGTH, builder.length());
137                }
138                cachedString = null;
139            }
140            lastReplacement = replacement;
141        }
142
143        @Override
144        public String toString() {
145            if (cachedString == null) {
146                cachedString = builder.toString();
147            }
148            return cachedString;
149        }
150    }
151
152    /**
153     * Inner class for storing rules.
154     */
155    private static final class Rule {
156        private static final Pattern PIPE = Pattern.compile("\\|");
157        private final String pattern;
158        private final String[] replacementAtStart;
159        private final String[] replacementBeforeVowel;
160        private final String[] replacementDefault;
161
162        private Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
163                final String replacementDefault) {
164            this.pattern = pattern;
165            this.replacementAtStart = PIPE.split(replacementAtStart);
166            this.replacementBeforeVowel = PIPE.split(replacementBeforeVowel);
167            this.replacementDefault = PIPE.split(replacementDefault);
168        }
169
170        private int getPatternLength() {
171            return pattern.length();
172        }
173
174        private String[] getReplacements(final String context, final boolean atStart) {
175            if (atStart) {
176                return replacementAtStart;
177            }
178
179            final int nextIndex = getPatternLength();
180            final boolean nextCharIsVowel = nextIndex < context.length() && isVowel(context.charAt(nextIndex));
181            if (nextCharIsVowel) {
182                return replacementBeforeVowel;
183            }
184
185            return replacementDefault;
186        }
187
188        private boolean isVowel(final char ch) {
189            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
190        }
191
192        private boolean matches(final String context) {
193            return context.startsWith(pattern);
194        }
195
196        @Override
197        public String toString() {
198            return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart),
199                    Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault));
200        }
201    }
202
203    /**
204     * The NUL character.
205     */
206    private static final char NUL = '\0';
207
208    private static final String COMMENT = "//";
209
210    private static final String DOUBLE_QUOTE = "\"";
211
212    private static final String MULTILINE_COMMENT_END = "*/";
213
214    private static final String MULTILINE_COMMENT_START = "/*";
215
216    /** The resource file containing the replacement and folding rules */
217    private static final String RESOURCE_FILE = "/org/apache/commons/codec/language/dmrules.txt";
218
219    /** The code length of a DM Soundex value. */
220    private static final int MAX_LENGTH = 6;
221
222    /** Transformation rules indexed by the first character of their pattern. */
223    private static final Map<Character, List<Rule>> RULES = new HashMap<>();
224
225    /** Folding rules. */
226    private static final Map<Character, Character> FOLDINGS = new HashMap<>();
227
228    private static final Pattern EQUAL = Pattern.compile("=");
229
230    private static final Pattern SPACES = Pattern.compile("\\s+");
231
232    static {
233        try (Scanner scanner = new Scanner(Resources.getInputStream(RESOURCE_FILE), CharEncoding.UTF_8)) {
234            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
235        }
236        // sort RULES by pattern length in descending order
237        RULES.forEach((k, v) -> v.sort((rule1, rule2) -> rule2.getPatternLength() - rule1.getPatternLength()));
238    }
239
240    private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping,
241            final Map<Character, Character> asciiFoldings) {
242        int currentLine = 0;
243        boolean inMultilineComment = false;
244        while (scanner.hasNextLine()) {
245            currentLine++;
246            final String rawLine = scanner.nextLine();
247            String line = rawLine;
248            if (inMultilineComment) {
249                if (line.endsWith(MULTILINE_COMMENT_END)) {
250                    inMultilineComment = false;
251                }
252                continue;
253            }
254            if (line.startsWith(MULTILINE_COMMENT_START)) {
255                inMultilineComment = true;
256            } else {
257                // discard comments
258                final int cmtI = line.indexOf(COMMENT);
259                if (cmtI >= 0) {
260                    line = line.substring(0, cmtI);
261                }
262                // trim leading-trailing whitespace
263                line = line.trim();
264                if (line.isEmpty()) {
265                    continue; // empty lines can be safely skipped
266                }
267                if (line.contains("=")) {
268                    // folding
269                    final String[] parts = EQUAL.split(line);
270                    if (parts.length != 2) {
271                        throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location);
272                    }
273                    final String leftCharacter = parts[0];
274                    final String rightCharacter = parts[1];
275                    if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
276                        throw new IllegalArgumentException(
277                                "Malformed folding statement - patterns are not single characters: " + rawLine + " in " + location);
278                    }
279                    asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
280                } else {
281                    // rule
282                    final String[] parts = SPACES.split(line);
283                    if (parts.length != 4) {
284                        throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location);
285                    }
286                    try {
287                        final String pattern = stripQuotes(parts[0]);
288                        final String replacement1 = stripQuotes(parts[1]);
289                        final String replacement2 = stripQuotes(parts[2]);
290                        final String replacement3 = stripQuotes(parts[3]);
291                        final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
292                        final char patternKey = r.pattern.charAt(0);
293                        final List<Rule> rules = ruleMapping.computeIfAbsent(patternKey, k -> new ArrayList<>());
294                        rules.add(r);
295                    } catch (final IllegalArgumentException e) {
296                        throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + location, e);
297                    }
298                }
299            }
300        }
301    }
302
303    private static String stripQuotes(String str) {
304        if (str.startsWith(DOUBLE_QUOTE)) {
305            str = str.substring(1);
306        }
307        if (str.endsWith(DOUBLE_QUOTE)) {
308            str = str.substring(0, str.length() - 1);
309        }
310        return str;
311    }
312
313    /** Whether to use ASCII folding prior to encoding. */
314    private final boolean folding;
315
316    /**
317     * Creates a new instance with ASCII-folding enabled.
318     */
319    public DaitchMokotoffSoundex() {
320        this(true);
321    }
322
323    /**
324     * Creates a new instance.
325     * <p>
326     * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, for example
327     * è -&gt; e.
328     * </p>
329     *
330     * @param folding
331     *            if ASCII-folding shall be performed before encoding
332     */
333    public DaitchMokotoffSoundex(final boolean folding) {
334        this.folding = folding;
335    }
336
337    /**
338     * Performs a cleanup of the input string before the actual Soundex transformation.
339     * <p>
340     * Removes all whitespace characters and performs ASCII folding if enabled.
341     * </p>
342     *
343     * @param input
344     *            the input string to clean up.
345     * @return a cleaned up string.
346     */
347    private String cleanup(final String input) {
348        final StringBuilder sb = new StringBuilder();
349        for (char ch : input.toCharArray()) {
350            if (Character.isWhitespace(ch) || !Character.isLetter(ch)) {
351                continue;
352            }
353            ch = Character.toLowerCase(ch);
354            final Character character = FOLDINGS.get(ch);
355            if (folding && character != null) {
356                ch = character;
357            }
358            sb.append(ch);
359        }
360        return sb.toString();
361    }
362
363    /**
364     * Encodes an Object using the Daitch-Mokotoff Soundex algorithm without branching.
365     * <p>
366     * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
367     * EncoderException if the supplied object is not of type {@link String}.
368     * </p>
369     *
370     * @see #soundex(String)
371     * @param obj
372     *            Object to encode.
373     * @return An object (of type {@link String}) containing the DM Soundex code, which corresponds to the String
374     *         supplied.
375     * @throws EncoderException
376     *             if the parameter supplied is not of type {@link String}.
377     * @throws IllegalArgumentException
378     *             if a character is not mapped.
379     */
380    @Override
381    public Object encode(final Object obj) throws EncoderException {
382        if (!(obj instanceof String)) {
383            throw new EncoderException(
384                    "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
385        }
386        return encode((String) obj);
387    }
388
389    /**
390     * Encodes a String using the Daitch-Mokotoff Soundex algorithm without branching.
391     *
392     * @see #soundex(String)
393     * @param source
394     *            A String object to encode.
395     * @return A DM Soundex code corresponding to the String supplied.
396     * @throws IllegalArgumentException
397     *             if a character is not mapped.
398     */
399    @Override
400    public String encode(final String source) {
401        if (source == null) {
402            return null;
403        }
404        return soundex(source, false)[0];
405    }
406
407    /**
408     * Encodes a String using the Daitch-Mokotoff Soundex algorithm with branching.
409     * <p>
410     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
411     * separated by '|'.
412     * </p>
413     * <p>
414     * Example: the name "AUERBACH" is encoded as both
415     * </p>
416     * <ul>
417     * <li>097400</li>
418     * <li>097500</li>
419     * </ul>
420     * <p>
421     * Thus the result will be "097400|097500".
422     * </p>
423     *
424     * @param source
425     *            A String object to encode.
426     * @return A string containing a set of DM Soundex codes corresponding to the String supplied.
427     * @throws IllegalArgumentException
428     *             if a character is not mapped.
429     */
430    public String soundex(final String source) {
431        return String.join("|", soundex(source, true));
432    }
433
434    /**
435     * Perform the actual DM Soundex algorithm on the input string.
436     *
437     * @param source
438     *            A String object to encode.
439     * @param branching
440     *            If branching shall be performed.
441     * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
442     *         selected branching mode.
443     */
444    private String[] soundex(final String source, final boolean branching) {
445        if (source == null) {
446            return null;
447        }
448        final String input = cleanup(source);
449        final Set<Branch> currentBranches = new LinkedHashSet<>();
450        currentBranches.add(new Branch());
451        char lastChar = NUL;
452        for (int index = 0; index < input.length(); index++) {
453            final char ch = input.charAt(index);
454            final String inputContext = input.substring(index);
455            final List<Rule> rules = RULES.get(ch);
456            if (rules == null) {
457                continue;
458            }
459            // Use an EMPTY_LIST to avoid false positive warnings regarding potential null pointer access.
460            final List<Branch> nextBranches = branching ? new ArrayList<>() : Collections.emptyList();
461            for (final Rule rule : rules) {
462                if (rule.matches(inputContext)) {
463                    if (branching) {
464                        nextBranches.clear();
465                    }
466                    final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
467                    final boolean branchingRequired = replacements.length > 1 && branching;
468                    for (final Branch branch : currentBranches) {
469                        for (final String nextReplacement : replacements) {
470                            // if we have multiple replacements, always create a new branch
471                            final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
472                            // special rule: occurrences of mn or nm are treated differently
473                            final boolean force = lastChar == 'm' && ch == 'n' || lastChar == 'n' && ch == 'm';
474                            nextBranch.processNextReplacement(nextReplacement, force);
475                            if (!branching) {
476                                break;
477                            }
478                            nextBranches.add(nextBranch);
479                        }
480                    }
481                    if (branching) {
482                        currentBranches.clear();
483                        currentBranches.addAll(nextBranches);
484                    }
485                    index += rule.getPatternLength() - 1;
486                    break;
487                }
488            }
489            lastChar = ch;
490        }
491        final String[] result = new String[currentBranches.size()];
492        int index = 0;
493        for (final Branch branch : currentBranches) {
494            branch.finish();
495            result[index++] = branch.toString();
496        }
497        return result;
498    }
499}