Nysiis.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.commons.codec.language;
- import java.util.regex.Pattern;
- import org.apache.commons.codec.EncoderException;
- import org.apache.commons.codec.StringEncoder;
- /**
- * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
- * general purpose scheme to find word with similar phonemes.
- * <p>
- * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
- * </p>
- * <p>
- * Algorithm description:
- * </p>
- * <pre>
- * 1. Transcode first characters of name
- * 1a. MAC -> MCC
- * 1b. KN -> NN
- * 1c. K -> C
- * 1d. PH -> FF
- * 1e. PF -> FF
- * 1f. SCH -> SSS
- * 2. Transcode last characters of name
- * 2a. EE, IE -> Y
- * 2b. DT,RT,RD,NT,ND -> D
- * 3. First character of key = first character of name
- * 4. Transcode remaining characters by following these rules, incrementing by one character each time
- * 4a. EV -> AF else A,E,I,O,U -> A
- * 4b. Q -> G
- * 4c. Z -> S
- * 4d. M -> N
- * 4e. KN -> N else K -> C
- * 4f. SCH -> SSS
- * 4g. PH -> FF
- * 4h. H -> If previous or next is non-vowel, previous
- * 4i. W -> If previous is vowel, previous
- * 4j. Add current to key if current != last key character
- * 5. If last character is S, remove it
- * 6. If last characters are AY, replace with Y
- * 7. If last character is A, remove it
- * 8. Collapse all strings of repeated characters
- * 9. Add original first character of name as first character of key
- * </pre>
- * <p>
- * This class is immutable and thread-safe.
- * </p>
- *
- * @see <a href="https://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
- * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
- * @see Soundex
- * @since 1.7
- */
- public class Nysiis implements StringEncoder {
- private static final char[] CHARS_A = { 'A' };
- private static final char[] CHARS_AF = { 'A', 'F' };
- private static final char[] CHARS_C = { 'C' };
- private static final char[] CHARS_FF = { 'F', 'F' };
- private static final char[] CHARS_G = { 'G' };
- private static final char[] CHARS_N = { 'N' };
- private static final char[] CHARS_NN = { 'N', 'N' };
- private static final char[] CHARS_S = { 'S' };
- private static final char[] CHARS_SSS = { 'S', 'S', 'S' };
- private static final Pattern PAT_MAC = Pattern.compile("^MAC");
- private static final Pattern PAT_KN = Pattern.compile("^KN");
- private static final Pattern PAT_K = Pattern.compile("^K");
- private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
- private static final Pattern PAT_SCH = Pattern.compile("^SCH");
- private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
- private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
- private static final char SPACE = ' ';
- private static final int TRUE_LENGTH = 6;
- /**
- * Tests if the given character is a vowel.
- *
- * @param c
- * the character to test
- * @return {@code true} if the character is a vowel, {@code false} otherwise
- */
- private static boolean isVowel(final char c) {
- return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
- }
- /**
- * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
- * a time: [i-1, i, i+1, i+2].
- *
- * @param prev
- * the previous character
- * @param curr
- * the current character
- * @param next
- * the next character
- * @param aNext
- * the after next character
- * @return a transcoded array of characters, starting from the current position
- */
- private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
- // 1. EV -> AF
- if (curr == 'E' && next == 'V') {
- return CHARS_AF;
- }
- // A, E, I, O, U -> A
- if (isVowel(curr)) {
- return CHARS_A;
- }
- // 2. Q -> G, Z -> S, M -> N
- // 3. KN -> NN else K -> C
- switch (curr) {
- case 'Q':
- return CHARS_G;
- case 'Z':
- return CHARS_S;
- case 'M':
- return CHARS_N;
- case 'K':
- if (next == 'N') {
- return CHARS_NN;
- }
- return CHARS_C;
- default:
- break;
- }
- // 4. SCH -> SSS
- if (curr == 'S' && next == 'C' && aNext == 'H') {
- return CHARS_SSS;
- }
- // PH -> FF
- if (curr == 'P' && next == 'H') {
- return CHARS_FF;
- }
- // 5. H -> If previous or next is a non vowel, previous.
- if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
- return new char[] { prev };
- }
- // 6. W -> If previous is vowel, previous.
- if (curr == 'W' && isVowel(prev)) {
- return new char[] { prev };
- }
- return new char[] { curr };
- }
- /** Indicates the strict mode. */
- private final boolean strict;
- /**
- * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
- * i.e. encoded strings have a maximum length of 6.
- */
- public Nysiis() {
- this(true);
- }
- /**
- * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
- *
- * <ul>
- * <li>{@code true}: encoded strings have a maximum length of 6</li>
- * <li>{@code false}: encoded strings may have arbitrary length</li>
- * </ul>
- *
- * @param strict
- * the strict mode
- */
- public Nysiis(final boolean strict) {
- this.strict = strict;
- }
- /**
- * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
- * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
- * {@link String}.
- *
- * @param obj
- * Object to encode
- * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
- * @throws EncoderException
- * if the parameter supplied is not of a {@link String}
- * @throws IllegalArgumentException
- * if a character is not mapped
- */
- @Override
- public Object encode(final Object obj) throws EncoderException {
- if (!(obj instanceof String)) {
- throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
- }
- return nysiis((String) obj);
- }
- /**
- * Encodes a String using the NYSIIS algorithm.
- *
- * @param str
- * A String object to encode
- * @return A Nysiis code corresponding to the String supplied
- * @throws IllegalArgumentException
- * if a character is not mapped
- */
- @Override
- public String encode(final String str) {
- return nysiis(str);
- }
- /**
- * Indicates the strict mode for this {@link Nysiis} encoder.
- *
- * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
- */
- public boolean isStrict() {
- return this.strict;
- }
- /**
- * Retrieves the NYSIIS code for a given String object.
- *
- * @param str
- * String to encode using the NYSIIS algorithm
- * @return A NYSIIS code for the String supplied
- */
- public String nysiis(String str) {
- if (str == null) {
- return null;
- }
- // Use the same clean rules as Soundex
- str = SoundexUtils.clean(str);
- if (str.isEmpty()) {
- return str;
- }
- // Translate first characters of name:
- // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
- str = PAT_MAC.matcher(str).replaceFirst("MCC");
- str = PAT_KN.matcher(str).replaceFirst("NN");
- str = PAT_K.matcher(str).replaceFirst("C");
- str = PAT_PH_PF.matcher(str).replaceFirst("FF");
- str = PAT_SCH.matcher(str).replaceFirst("SSS");
- // Translate last characters of name:
- // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
- str = PAT_EE_IE.matcher(str).replaceFirst("Y");
- str = PAT_DT_ETC.matcher(str).replaceFirst("D");
- // First character of key = first character of name.
- final StringBuilder key = new StringBuilder(str.length());
- key.append(str.charAt(0));
- // Transcode remaining characters, incrementing by one character each time
- final char[] chars = str.toCharArray();
- final int len = chars.length;
- for (int i = 1; i < len; i++) {
- final char next = i < len - 1 ? chars[i + 1] : SPACE;
- final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
- final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
- System.arraycopy(transcoded, 0, chars, i, transcoded.length);
- // only append the current char to the key if it is different from the last one
- if (chars[i] != chars[i - 1]) {
- key.append(chars[i]);
- }
- }
- if (key.length() > 1) {
- char lastChar = key.charAt(key.length() - 1);
- // If last character is S, remove it.
- if (lastChar == 'S') {
- key.deleteCharAt(key.length() - 1);
- lastChar = key.charAt(key.length() - 1);
- }
- if (key.length() > 2) {
- final char last2Char = key.charAt(key.length() - 2);
- // If last characters are AY, replace with Y.
- if (last2Char == 'A' && lastChar == 'Y') {
- key.deleteCharAt(key.length() - 2);
- }
- }
- // If last character is A, remove it.
- if (lastChar == 'A') {
- key.deleteCharAt(key.length() - 1);
- }
- }
- final String string = key.toString();
- return isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
- }
- }