Soundex.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language;

  18. import org.apache.commons.codec.EncoderException;
  19. import org.apache.commons.codec.StringEncoder;

  20. /**
  21.  * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
  22.  * general purpose scheme to find word with similar phonemes.
  23.  *
  24.  * <p>This class is thread-safe.
  25.  * Although not strictly immutable, the mutable fields are not actually used.</p>
  26.  */
  27. public class Soundex implements StringEncoder {

  28.     /**
  29.      * The marker character used to indicate a silent (ignored) character.
  30.      * These are ignored except when they appear as the first character.
  31.      * <p>
  32.      * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
  33.      * because changing it might break existing code. Mappings that don't contain
  34.      * a silent marker code are treated as though H and W are silent.
  35.      * </p>
  36.      * <p>
  37.      * To override this, use the {@link #Soundex(String, boolean)} constructor.
  38.      * </p>
  39.      *
  40.      * @since 1.11
  41.      */
  42.     public static final char SILENT_MARKER = '-';

  43.     /**
  44.      * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
  45.      * means do not encode, but treat as a separator when it occurs between consonants with the same code.
  46.      * <p>
  47.      * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
  48.      * up the value for the constant values page.)
  49.      * </p>
  50.      * <p>
  51.      * <strong>Note that letters H and W are treated specially.</strong>
  52.      * They are ignored (after the first letter) and don't act as separators
  53.      * between consonants with the same code.
  54.      * </p>
  55.      */
  56.     public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";

  57.     /**
  58.      * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
  59.      * means do not encode.
  60.      *
  61.      * @see Soundex#Soundex(char[])
  62.      */
  63.     private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();

  64.     /**
  65.      * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
  66.      * This treats H and W as silent letters.
  67.      * Apart from when they appear as the first letter, they are ignored.
  68.      * They don't act as separators between duplicate codes.
  69.      *
  70.      * @see #US_ENGLISH_MAPPING_STRING
  71.      */
  72.     public static final Soundex US_ENGLISH = new Soundex();

  73.     /**
  74.      * An instance of Soundex using the Simplified Soundex mapping, as described here:
  75.      * http://west-penwith.org.uk/misc/soundex.htm
  76.      * <p>
  77.      * This treats H and W the same as vowels (AEIOUY).
  78.      * Such letters aren't encoded (after the first), but they do
  79.      * act as separators when dropping duplicate codes.
  80.      * The mapping is otherwise the same as for {@link #US_ENGLISH}
  81.      * </p>
  82.      *
  83.      * @since 1.11
  84.      */
  85.     public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);

  86.     /**
  87.      * An instance of Soundex using the mapping as per the Genealogy site:
  88.      * http://www.genealogy.com/articles/research/00000060.html
  89.      * <p>
  90.      * This treats vowels (AEIOUY), H and W as silent letters.
  91.      * Such letters are ignored (after the first) and do not
  92.      * act as separators when dropping duplicate codes.
  93.      * </p>
  94.      * <p>
  95.      * The codes for consonants are otherwise the same as for
  96.      * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
  97.      * </p>
  98.      *
  99.      * @since 1.11
  100.      */
  101.     public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
  102.     //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ

  103.     /**
  104.      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
  105.      *
  106.      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  107.      */
  108.     @Deprecated
  109.     private int maxLength = 4;

  110.     /**
  111.      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
  112.      * letter is mapped. This implementation contains a default map for US_ENGLISH
  113.      */
  114.     private final char[] soundexMapping;

  115.     /**
  116.      * Should H and W be treated specially?
  117.      * <p>
  118.      * In versions of the code prior to 1.11,
  119.      * the code always treated H and W as silent (ignored) letters.
  120.      * If this field is false, H and W are no longer special-cased.
  121.      * </p>
  122.      */
  123.     private final boolean specialCaseHW;

  124.     /**
  125.      * Creates an instance using US_ENGLISH_MAPPING
  126.      *
  127.      * @see Soundex#Soundex(char[])
  128.      * @see Soundex#US_ENGLISH_MAPPING_STRING
  129.      */
  130.     public Soundex() {
  131.         this.soundexMapping = US_ENGLISH_MAPPING;
  132.         this.specialCaseHW = true;
  133.     }

  134.     /**
  135.      * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
  136.      * mapping for a non-Western character set.
  137.      * <p>
  138.      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
  139.      * letter is mapped. This implementation contains a default map for US_ENGLISH
  140.      * </p>
  141.      * <p>
  142.      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
  143.      * </p>
  144.      *
  145.      * @param mapping
  146.      *                  Mapping array to use when finding the corresponding code for a given character
  147.      */
  148.     public Soundex(final char[] mapping) {
  149.         this.soundexMapping = mapping.clone();
  150.         this.specialCaseHW = !hasMarker(this.soundexMapping);
  151.     }

  152.     /**
  153.      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
  154.      * and/or possibly provide an internationalized mapping for a non-Western character set.
  155.      * <p>
  156.      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
  157.      * </p>
  158.      *
  159.      * @param mapping
  160.      *            Mapping string to use when finding the corresponding code for a given character
  161.      * @since 1.4
  162.      */
  163.     public Soundex(final String mapping) {
  164.         this.soundexMapping = mapping.toCharArray();
  165.         this.specialCaseHW = !hasMarker(this.soundexMapping);
  166.     }

  167.     /**
  168.      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
  169.      * and/or possibly provide an internationalized mapping for a non-Western character set.
  170.      *
  171.      * @param mapping
  172.      *            Mapping string to use when finding the corresponding code for a given character
  173.      * @param specialCaseHW if true, then
  174.      * @since 1.11
  175.      */
  176.     public Soundex(final String mapping, final boolean specialCaseHW) {
  177.         this.soundexMapping = mapping.toCharArray();
  178.         this.specialCaseHW = specialCaseHW;
  179.     }

  180.     /**
  181.      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
  182.      * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
  183.      * identical values.
  184.      *
  185.      * @param s1
  186.      *                  A String that will be encoded and compared.
  187.      * @param s2
  188.      *                  A String that will be encoded and compared.
  189.      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
  190.      * @see SoundexUtils#difference(StringEncoder,String,String)
  191.      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
  192.      *          T-SQL DIFFERENCE</a>
  193.      *
  194.      * @throws EncoderException
  195.      *                  if an error occurs encoding one of the strings
  196.      * @since 1.3
  197.      */
  198.     public int difference(final String s1, final String s2) throws EncoderException {
  199.         return SoundexUtils.difference(this, s1, s2);
  200.     }

  201.     /**
  202.      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
  203.      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type {@link String}.
  204.      *
  205.      * @param obj
  206.      *                  Object to encode
  207.      * @return An object (or type {@link String}) containing the soundex code which corresponds to the String
  208.      *             supplied.
  209.      * @throws EncoderException
  210.      *                  if the parameter supplied is not of type {@link String}
  211.      * @throws IllegalArgumentException
  212.      *                  if a character is not mapped
  213.      */
  214.     @Override
  215.     public Object encode(final Object obj) throws EncoderException {
  216.         if (!(obj instanceof String)) {
  217.             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
  218.         }
  219.         return soundex((String) obj);
  220.     }

  221.     /**
  222.      * Encodes a String using the soundex algorithm.
  223.      *
  224.      * @param str
  225.      *                  A String object to encode
  226.      * @return A Soundex code corresponding to the String supplied
  227.      * @throws IllegalArgumentException
  228.      *                  if a character is not mapped
  229.      */
  230.     @Override
  231.     public String encode(final String str) {
  232.         return soundex(str);
  233.     }

  234.     /**
  235.      * Returns the maxLength. Standard Soundex
  236.      *
  237.      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  238.      * @return int
  239.      */
  240.     @Deprecated
  241.     public int getMaxLength() {
  242.         return this.maxLength;
  243.     }

  244.     private boolean hasMarker(final char[] mapping) {
  245.         for (final char ch : mapping) {
  246.             if (ch == SILENT_MARKER) {
  247.                 return true;
  248.             }
  249.         }
  250.         return false;
  251.     }

  252.     /**
  253.      * Maps the given upper-case character to its Soundex code.
  254.      *
  255.      * @param ch
  256.      *                  An upper-case character.
  257.      * @return A Soundex code.
  258.      * @throws IllegalArgumentException
  259.      *                  Thrown if {@code ch} is not mapped.
  260.      */
  261.     private char map(final char ch) {
  262.         final int index = ch - 'A';
  263.         if (index < 0 || index >= this.soundexMapping.length) {
  264.             throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
  265.         }
  266.         return this.soundexMapping[index];
  267.     }

  268.     /**
  269.      * Sets the maxLength.
  270.      *
  271.      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  272.      * @param maxLength
  273.      *                  The maxLength to set
  274.      */
  275.     @Deprecated
  276.     public void setMaxLength(final int maxLength) {
  277.         this.maxLength = maxLength;
  278.     }

  279.     /**
  280.      * Retrieves the Soundex code for a given String object.
  281.      *
  282.      * @param str
  283.      *                  String to encode using the Soundex algorithm
  284.      * @return A soundex code for the String supplied
  285.      * @throws IllegalArgumentException
  286.      *                  if a character is not mapped
  287.      */
  288.     public String soundex(String str) {
  289.         if (str == null) {
  290.             return null;
  291.         }
  292.         str = SoundexUtils.clean(str);
  293.         if (str.isEmpty()) {
  294.             return str;
  295.         }
  296.         final char[] out = { '0', '0', '0', '0' };
  297.         int count = 0;
  298.         final char first = str.charAt(0);
  299.         out[count++] = first;
  300.         char lastDigit = map(first); // previous digit
  301.         for (int i = 1; i < str.length() && count < out.length; i++) {
  302.             final char ch = str.charAt(i);
  303.             if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
  304.                 continue;
  305.             }
  306.             final char digit = map(ch);
  307.             if (digit == SILENT_MARKER) {
  308.                 continue;
  309.             }
  310.             if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
  311.                 out[count++] = digit;
  312.             }
  313.             lastDigit = digit;
  314.         }
  315.         return new String(out);
  316.     }

  317. }