Nysiis.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language;

  18. import java.util.regex.Pattern;

  19. import org.apache.commons.codec.EncoderException;
  20. import org.apache.commons.codec.StringEncoder;

  21. /**
  22.  * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
  23.  * general purpose scheme to find word with similar phonemes.
  24.  * <p>
  25.  * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
  26.  * <p>
  27.  * Algorithm description:
  28.  * <pre>
  29.  * 1. Transcode first characters of name
  30.  *   1a. MAC -&gt;   MCC
  31.  *   1b. KN  -&gt;   NN
  32.  *   1c. K   -&gt;   C
  33.  *   1d. PH  -&gt;   FF
  34.  *   1e. PF  -&gt;   FF
  35.  *   1f. SCH -&gt;   SSS
  36.  * 2. Transcode last characters of name
  37.  *   2a. EE, IE          -&gt;   Y
  38.  *   2b. DT,RT,RD,NT,ND  -&gt;   D
  39.  * 3. First character of key = first character of name
  40.  * 4. Transcode remaining characters by following these rules, incrementing by one character each time
  41.  *   4a. EV  -&gt;   AF  else A,E,I,O,U -&gt; A
  42.  *   4b. Q   -&gt;   G
  43.  *   4c. Z   -&gt;   S
  44.  *   4d. M   -&gt;   N
  45.  *   4e. KN  -&gt;   N   else K -&gt; C
  46.  *   4f. SCH -&gt;   SSS
  47.  *   4g. PH  -&gt;   FF
  48.  *   4h. H   -&gt;   If previous or next is nonvowel, previous
  49.  *   4i. W   -&gt;   If previous is vowel, previous
  50.  *   4j. Add current to key if current != last key character
  51.  * 5. If last character is S, remove it
  52.  * 6. If last characters are AY, replace with Y
  53.  * 7. If last character is A, remove it
  54.  * 8. Collapse all strings of repeated characters
  55.  * 9. Add original first character of name as first character of key
  56.  * </pre>
  57.  * <p>
  58.  * This class is immutable and thread-safe.
  59.  *
  60.  * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
  61.  * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
  62.  * @see Soundex
  63.  * @since 1.7
  64.  * @version $Id: Nysiis.java 1725161 2016-01-18 01:08:56Z ggregory $
  65.  */
  66. public class Nysiis implements StringEncoder {

  67.     private static final char[] CHARS_A   = new char[] { 'A' };
  68.     private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
  69.     private static final char[] CHARS_C   = new char[] { 'C' };
  70.     private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
  71.     private static final char[] CHARS_G   = new char[] { 'G' };
  72.     private static final char[] CHARS_N   = new char[] { 'N' };
  73.     private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
  74.     private static final char[] CHARS_S   = new char[] { 'S' };
  75.     private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };

  76.     private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
  77.     private static final Pattern PAT_KN     = Pattern.compile("^KN");
  78.     private static final Pattern PAT_K      = Pattern.compile("^K");
  79.     private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
  80.     private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
  81.     private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
  82.     private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");

  83.     private static final char SPACE = ' ';
  84.     private static final int TRUE_LENGTH = 6;

  85.     /**
  86.      * Tests if the given character is a vowel.
  87.      *
  88.      * @param c
  89.      *            the character to test
  90.      * @return <code>true</code> if the character is a vowel, <code>false</code> otherwise
  91.      */
  92.     private static boolean isVowel(final char c) {
  93.         return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
  94.     }

  95.     /**
  96.      * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
  97.      * a time: [i-1, i, i+1, i+2].
  98.      *
  99.      * @param prev
  100.      *            the previous character
  101.      * @param curr
  102.      *            the current character
  103.      * @param next
  104.      *            the next character
  105.      * @param aNext
  106.      *            the after next character
  107.      * @return a transcoded array of characters, starting from the current position
  108.      */
  109.     private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
  110.         // 1. EV -> AF
  111.         if (curr == 'E' && next == 'V') {
  112.             return CHARS_AF;
  113.         }

  114.         // A, E, I, O, U -> A
  115.         if (isVowel(curr)) {
  116.             return CHARS_A;
  117.         }

  118.         // 2. Q -> G, Z -> S, M -> N
  119.         if (curr == 'Q') {
  120.             return CHARS_G;
  121.         } else if (curr == 'Z') {
  122.             return CHARS_S;
  123.         } else if (curr == 'M') {
  124.             return CHARS_N;
  125.         }

  126.         // 3. KN -> NN else K -> C
  127.         if (curr == 'K') {
  128.             if (next == 'N') {
  129.                 return CHARS_NN;
  130.             }
  131.             return CHARS_C;
  132.         }

  133.         // 4. SCH -> SSS
  134.         if (curr == 'S' && next == 'C' && aNext == 'H') {
  135.             return CHARS_SSS;
  136.         }

  137.         // PH -> FF
  138.         if (curr == 'P' && next == 'H') {
  139.             return CHARS_FF;
  140.         }

  141.         // 5. H -> If previous or next is a non vowel, previous.
  142.         if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
  143.             return new char[] { prev };
  144.         }

  145.         // 6. W -> If previous is vowel, previous.
  146.         if (curr == 'W' && isVowel(prev)) {
  147.             return new char[] { prev };
  148.         }

  149.         return new char[] { curr };
  150.     }

  151.     /** Indicates the strict mode. */
  152.     private final boolean strict;

  153.     /**
  154.      * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
  155.      * i.e. encoded strings have a maximum length of 6.
  156.      */
  157.     public Nysiis() {
  158.         this(true);
  159.     }

  160.     /**
  161.      * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
  162.      *
  163.      * <ul>
  164.      *  <li><code>true</code>: encoded strings have a maximum length of 6</li>
  165.      *  <li><code>false</code>: encoded strings may have arbitrary length</li>
  166.      * </ul>
  167.      *
  168.      * @param strict
  169.      *            the strict mode
  170.      */
  171.     public Nysiis(final boolean strict) {
  172.         this.strict = strict;
  173.     }

  174.     /**
  175.      * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
  176.      * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
  177.      * {@link String}.
  178.      *
  179.      * @param obj
  180.      *            Object to encode
  181.      * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
  182.      * @throws EncoderException
  183.      *            if the parameter supplied is not of a {@link String}
  184.      * @throws IllegalArgumentException
  185.      *            if a character is not mapped
  186.      */
  187.     @Override
  188.     public Object encode(final Object obj) throws EncoderException {
  189.         if (!(obj instanceof String)) {
  190.             throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
  191.         }
  192.         return this.nysiis((String) obj);
  193.     }

  194.     /**
  195.      * Encodes a String using the NYSIIS algorithm.
  196.      *
  197.      * @param str
  198.      *            A String object to encode
  199.      * @return A Nysiis code corresponding to the String supplied
  200.      * @throws IllegalArgumentException
  201.      *            if a character is not mapped
  202.      */
  203.     @Override
  204.     public String encode(final String str) {
  205.         return this.nysiis(str);
  206.     }

  207.     /**
  208.      * Indicates the strict mode for this {@link Nysiis} encoder.
  209.      *
  210.      * @return <code>true</code> if the encoder is configured for strict mode, <code>false</code> otherwise
  211.      */
  212.     public boolean isStrict() {
  213.         return this.strict;
  214.     }

  215.     /**
  216.      * Retrieves the NYSIIS code for a given String object.
  217.      *
  218.      * @param str
  219.      *            String to encode using the NYSIIS algorithm
  220.      * @return A NYSIIS code for the String supplied
  221.      */
  222.     public String nysiis(String str) {
  223.         if (str == null) {
  224.             return null;
  225.         }

  226.         // Use the same clean rules as Soundex
  227.         str = SoundexUtils.clean(str);

  228.         if (str.length() == 0) {
  229.             return str;
  230.         }

  231.         // Translate first characters of name:
  232.         // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
  233.         str = PAT_MAC.matcher(str).replaceFirst("MCC");
  234.         str = PAT_KN.matcher(str).replaceFirst("NN");
  235.         str = PAT_K.matcher(str).replaceFirst("C");
  236.         str = PAT_PH_PF.matcher(str).replaceFirst("FF");
  237.         str = PAT_SCH.matcher(str).replaceFirst("SSS");

  238.         // Translate last characters of name:
  239.         // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
  240.         str = PAT_EE_IE.matcher(str).replaceFirst("Y");
  241.         str = PAT_DT_ETC.matcher(str).replaceFirst("D");

  242.         // First character of key = first character of name.
  243.         final StringBuilder key = new StringBuilder(str.length());
  244.         key.append(str.charAt(0));

  245.         // Transcode remaining characters, incrementing by one character each time
  246.         final char[] chars = str.toCharArray();
  247.         final int len = chars.length;

  248.         for (int i = 1; i < len; i++) {
  249.             final char next = i < len - 1 ? chars[i + 1] : SPACE;
  250.             final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
  251.             final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
  252.             System.arraycopy(transcoded, 0, chars, i, transcoded.length);

  253.             // only append the current char to the key if it is different from the last one
  254.             if (chars[i] != chars[i - 1]) {
  255.                 key.append(chars[i]);
  256.             }
  257.         }

  258.         if (key.length() > 1) {
  259.             char lastChar = key.charAt(key.length() - 1);

  260.             // If last character is S, remove it.
  261.             if (lastChar == 'S') {
  262.                 key.deleteCharAt(key.length() - 1);
  263.                 lastChar = key.charAt(key.length() - 1);
  264.             }

  265.             if (key.length() > 2) {
  266.                 final char last2Char = key.charAt(key.length() - 2);
  267.                 // If last characters are AY, replace with Y.
  268.                 if (last2Char == 'A' && lastChar == 'Y') {
  269.                     key.deleteCharAt(key.length() - 2);
  270.                 }
  271.             }

  272.             // If last character is A, remove it.
  273.             if (lastChar == 'A') {
  274.                 key.deleteCharAt(key.length() - 1);
  275.             }
  276.         }

  277.         final String string = key.toString();
  278.         return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
  279.     }

  280. }