Soundex.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language;

  18. import org.apache.commons.codec.EncoderException;
  19. import org.apache.commons.codec.StringEncoder;

  20. /**
  21.  * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
  22.  * general purpose scheme to find word with similar phonemes.
  23.  *
  24.  * This class is thread-safe.
  25.  * Although not strictly immutable, the {@link #maxLength} field is not actually used.
  26.  *
  27.  * @version $Id: Soundex.java 1811347 2017-10-06 15:21:18Z ggregory $
  28.  */
  29. public class Soundex implements StringEncoder {

  30.     /**
  31.      * The marker character used to indicate a silent (ignored) character.
  32.      * These are ignored except when they appear as the first character.
  33.      * <p>
  34.      * Note: the {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
  35.      * because changing it might break existing code. Mappings that don't contain
  36.      * a silent marker code are treated as though H and W are silent.
  37.      * <p>
  38.      * To override this, use the {@link #Soundex(String, boolean)} constructor.
  39.      * @since 1.11
  40.      */
  41.     public static final char SILENT_MARKER = '-';

  42.     /**
  43.      * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
  44.      * means do not encode, but treat as a separator when it occurs between consonants with the same code.
  45.      * <p>
  46.      * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
  47.      * up the value for the constant values page.)
  48.      * <p>
  49.      * <b>Note that letters H and W are treated specially.</b>
  50.      * They are ignored (after the first letter) and don't act as separators
  51.      * between consonants with the same code.
  52.      * @see #US_ENGLISH_MAPPING
  53.      */
  54.     //                                                      ABCDEFGHIJKLMNOPQRSTUVWXYZ
  55.     public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";

  56.     /**
  57.      * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position
  58.      * means do not encode.
  59.      *
  60.      * @see Soundex#Soundex(char[])
  61.      */
  62.     private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();

  63.     /**
  64.      * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
  65.      * This treats H and W as silent letters.
  66.      * Apart from when they appear as the first letter, they are ignored.
  67.      * They don't act as separators between duplicate codes.
  68.      *
  69.      * @see #US_ENGLISH_MAPPING
  70.      * @see #US_ENGLISH_MAPPING_STRING
  71.      */
  72.     public static final Soundex US_ENGLISH = new Soundex();

  73.     /**
  74.      * An instance of Soundex using the Simplified Soundex mapping, as described here:
  75.      * http://west-penwith.org.uk/misc/soundex.htm
  76.      * <p>
  77.      * This treats H and W the same as vowels (AEIOUY).
  78.      * Such letters aren't encoded (after the first), but they do
  79.      * act as separators when dropping duplicate codes.
  80.      * The mapping is otherwise the same as for {@link #US_ENGLISH}
  81.      * <p>
  82.      * @since 1.11
  83.      */
  84.     public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);

  85.     /**
  86.      * An instance of Soundex using the mapping as per the Genealogy site:
  87.      * http://www.genealogy.com/articles/research/00000060.html
  88.      * <p>
  89.      * This treats vowels (AEIOUY), H and W as silent letters.
  90.      * Such letters are ignored (after the first) and do not
  91.      * act as separators when dropping duplicate codes.
  92.      * <p>
  93.      * The codes for consonants are otherwise the same as for
  94.      * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
  95.      *
  96.      * @since 1.11
  97.      */
  98.     public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
  99.     //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ

  100.     /**
  101.      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
  102.      *
  103.      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  104.      */
  105.     @Deprecated
  106.     private int maxLength = 4;

  107.     /**
  108.      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
  109.      * letter is mapped. This implementation contains a default map for US_ENGLISH
  110.      */
  111.     private final char[] soundexMapping;

  112.     /**
  113.      * Should H and W be treated specially?
  114.      * <p>
  115.      * In versions of the code prior to 1.11,
  116.      * the code always treated H and W as silent (ignored) letters.
  117.      * If this field is false, H and W are no longer special-cased.
  118.      */
  119.     private final boolean specialCaseHW;

  120.     /**
  121.      * Creates an instance using US_ENGLISH_MAPPING
  122.      *
  123.      * @see Soundex#Soundex(char[])
  124.      * @see Soundex#US_ENGLISH_MAPPING
  125.      */
  126.     public Soundex() {
  127.         this.soundexMapping = US_ENGLISH_MAPPING;
  128.         this.specialCaseHW = true;
  129.     }

  130.     /**
  131.      * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized
  132.      * mapping for a non-Western character set.
  133.      *
  134.      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
  135.      * letter is mapped. This implementation contains a default map for US_ENGLISH
  136.      * <p>
  137.      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
  138.      *
  139.      * @param mapping
  140.      *                  Mapping array to use when finding the corresponding code for a given character
  141.      */
  142.     public Soundex(final char[] mapping) {
  143.         this.soundexMapping = new char[mapping.length];
  144.         System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
  145.         this.specialCaseHW = !hasMarker(this.soundexMapping);
  146.     }

  147.     private boolean hasMarker(final char[] mapping) {
  148.         for(final char ch : mapping) {
  149.             if (ch == SILENT_MARKER) {
  150.                 return true;
  151.             }
  152.         }
  153.         return false;
  154.     }

  155.     /**
  156.      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
  157.      * and/or possibly provide an internationalized mapping for a non-Western character set.
  158.      * <p>
  159.      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment
  160.      *
  161.      * @param mapping
  162.      *            Mapping string to use when finding the corresponding code for a given character
  163.      * @since 1.4
  164.      */
  165.     public Soundex(final String mapping) {
  166.         this.soundexMapping = mapping.toCharArray();
  167.         this.specialCaseHW = !hasMarker(this.soundexMapping);
  168.     }

  169.     /**
  170.      * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping,
  171.      * and/or possibly provide an internationalized mapping for a non-Western character set.
  172.      *
  173.      * @param mapping
  174.      *            Mapping string to use when finding the corresponding code for a given character
  175.      * @param specialCaseHW if true, then
  176.      * @since 1.11
  177.      */
  178.     public Soundex(final String mapping, final boolean specialCaseHW) {
  179.         this.soundexMapping = mapping.toCharArray();
  180.         this.specialCaseHW = specialCaseHW;
  181.     }

  182.     /**
  183.      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
  184.      * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
  185.      * identical values.
  186.      *
  187.      * @param s1
  188.      *                  A String that will be encoded and compared.
  189.      * @param s2
  190.      *                  A String that will be encoded and compared.
  191.      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
  192.      *
  193.      * @see SoundexUtils#difference(StringEncoder,String,String)
  194.      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
  195.      *          T-SQL DIFFERENCE </a>
  196.      *
  197.      * @throws EncoderException
  198.      *                  if an error occurs encoding one of the strings
  199.      * @since 1.3
  200.      */
  201.     public int difference(final String s1, final String s2) throws EncoderException {
  202.         return SoundexUtils.difference(this, s1, s2);
  203.     }

  204.     /**
  205.      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
  206.      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
  207.      *
  208.      * @param obj
  209.      *                  Object to encode
  210.      * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String
  211.      *             supplied.
  212.      * @throws EncoderException
  213.      *                  if the parameter supplied is not of type java.lang.String
  214.      * @throws IllegalArgumentException
  215.      *                  if a character is not mapped
  216.      */
  217.     @Override
  218.     public Object encode(final Object obj) throws EncoderException {
  219.         if (!(obj instanceof String)) {
  220.             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
  221.         }
  222.         return soundex((String) obj);
  223.     }

  224.     /**
  225.      * Encodes a String using the soundex algorithm.
  226.      *
  227.      * @param str
  228.      *                  A String object to encode
  229.      * @return A Soundex code corresponding to the String supplied
  230.      * @throws IllegalArgumentException
  231.      *                  if a character is not mapped
  232.      */
  233.     @Override
  234.     public String encode(final String str) {
  235.         return soundex(str);
  236.     }

  237.     /**
  238.      * Returns the maxLength. Standard Soundex
  239.      *
  240.      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  241.      * @return int
  242.      */
  243.     @Deprecated
  244.     public int getMaxLength() {
  245.         return this.maxLength;
  246.     }

  247.     /**
  248.      * Maps the given upper-case character to its Soundex code.
  249.      *
  250.      * @param ch
  251.      *                  An upper-case character.
  252.      * @return A Soundex code.
  253.      * @throws IllegalArgumentException
  254.      *                  Thrown if <code>ch</code> is not mapped.
  255.      */
  256.     private char map(final char ch) {
  257.         final int index = ch - 'A';
  258.         if (index < 0 || index >= this.soundexMapping.length) {
  259.             throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
  260.         }
  261.         return this.soundexMapping[index];
  262.     }

  263.     /**
  264.      * Sets the maxLength.
  265.      *
  266.      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
  267.      * @param maxLength
  268.      *                  The maxLength to set
  269.      */
  270.     @Deprecated
  271.     public void setMaxLength(final int maxLength) {
  272.         this.maxLength = maxLength;
  273.     }

  274.     /**
  275.      * Retrieves the Soundex code for a given String object.
  276.      *
  277.      * @param str
  278.      *                  String to encode using the Soundex algorithm
  279.      * @return A soundex code for the String supplied
  280.      * @throws IllegalArgumentException
  281.      *                  if a character is not mapped
  282.      */
  283.     public String soundex(String str) {
  284.         if (str == null) {
  285.             return null;
  286.         }
  287.         str = SoundexUtils.clean(str);
  288.         if (str.length() == 0) {
  289.             return str;
  290.         }
  291.         final char out[] = {'0', '0', '0', '0'};
  292.         int count = 0;
  293.         final char first = str.charAt(0);
  294.         out[count++] = first;
  295.         char lastDigit = map(first); // previous digit
  296.         for(int i = 1; i < str.length() && count < out.length ; i++) {
  297.             final char ch = str.charAt(i);
  298.             if ((this.specialCaseHW) && (ch == 'H' || ch == 'W')) { // these are ignored completely
  299.                 continue;
  300.             }
  301.             final char digit = map(ch);
  302.             if (digit == SILENT_MARKER) {
  303.                 continue;
  304.             }
  305.             if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
  306.                 out[count++] = digit;
  307.             }
  308.             lastDigit = digit;
  309.         }
  310.         return new String(out);
  311.     }

  312. }