SoundexUtils.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language;

  18. import java.util.Locale;

  19. import org.apache.commons.codec.EncoderException;
  20. import org.apache.commons.codec.StringEncoder;

  21. /**
  22.  * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
  23.  *
  24.  * <p>This class is immutable and thread-safe.</p>
  25.  *
  26.  * @since 1.3
  27.  */
  28. final class SoundexUtils {

  29.     /**
  30.      * Cleans up the input string before Soundex processing by only returning
  31.      * upper case letters.
  32.      *
  33.      * @param str
  34.      *                  The String to clean.
  35.      * @return A clean String.
  36.      */
  37.     static String clean(final String str) {
  38.         if (isEmpty(str)) {
  39.             return str;
  40.         }
  41.         final int len = str.length();
  42.         final char[] chars = new char[len];
  43.         int count = 0;
  44.         for (int i = 0; i < len; i++) {
  45.             if (Character.isLetter(str.charAt(i))) {
  46.                 chars[count++] = str.charAt(i);
  47.             }
  48.         }
  49.         if (count == len) {
  50.             return str.toUpperCase(Locale.ENGLISH);
  51.         }
  52.         return new String(chars, 0, count).toUpperCase(Locale.ENGLISH);
  53.     }

  54.     /**
  55.      * Encodes the Strings and returns the number of characters in the two
  56.      * encoded Strings that are the same.
  57.      * <ul>
  58.      * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
  59.      * little or no similarity, and 4 indicates strong similarity or identical
  60.      * values.</li>
  61.      * <li>For refined Soundex, the return value can be greater than 4.</li>
  62.      * </ul>
  63.      *
  64.      * @param encoder
  65.      *                  The encoder to use to encode the Strings.
  66.      * @param s1
  67.      *                  A String that will be encoded and compared.
  68.      * @param s2
  69.      *                  A String that will be encoded and compared.
  70.      * @return The number of characters in the two Soundex encoded Strings that
  71.      *             are the same.
  72.      *
  73.      * @see #differenceEncoded(String,String)
  74.      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  75.      *          MS T-SQL DIFFERENCE</a>
  76.      *
  77.      * @throws EncoderException
  78.      *                  if an error occurs encoding one of the strings
  79.      */
  80.     static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
  81.         return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
  82.     }

  83.     /**
  84.      * Returns the number of characters in the two Soundex encoded Strings that
  85.      * are the same.
  86.      * <ul>
  87.      * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
  88.      * little or no similarity, and 4 indicates strong similarity or identical
  89.      * values.</li>
  90.      * <li>For refined Soundex, the return value can be greater than 4.</li>
  91.      * </ul>
  92.      *
  93.      * @param es1
  94.      *                  An encoded String.
  95.      * @param es2
  96.      *                  An encoded String.
  97.      * @return The number of characters in the two Soundex encoded Strings that
  98.      *             are the same.
  99.      *
  100.      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  101.      *          MS T-SQL DIFFERENCE</a>
  102.      */
  103.     static int differenceEncoded(final String es1, final String es2) {

  104.         if (es1 == null || es2 == null) {
  105.             return 0;
  106.         }
  107.         final int lengthToMatch = Math.min(es1.length(), es2.length());
  108.         int diff = 0;
  109.         for (int i = 0; i < lengthToMatch; i++) {
  110.             if (es1.charAt(i) == es2.charAt(i)) {
  111.                 diff++;
  112.             }
  113.         }
  114.         return diff;
  115.     }

  116.     /**
  117.      * <p>Checks if a CharSequence is empty ("") or null.</p>
  118.      *
  119.      * <pre>
  120.      * StringUtils.isEmpty(null)      = true
  121.      * StringUtils.isEmpty("")        = true
  122.      * StringUtils.isEmpty(" ")       = false
  123.      * StringUtils.isEmpty("bob")     = false
  124.      * StringUtils.isEmpty("  bob  ") = false
  125.      * </pre>
  126.      *
  127.      * @param cs  the CharSequence to check, may be null.
  128.      * @return {@code true} if the CharSequence is empty or null.
  129.      */
  130.     static boolean isEmpty(final CharSequence cs) {
  131.         return cs == null || cs.length() == 0;
  132.     }

  133. }