SoundexUtils.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language;

  18. import org.apache.commons.codec.EncoderException;
  19. import org.apache.commons.codec.StringEncoder;

  20. /**
  21.  * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
  22.  *
  23.  * <p>This class is immutable and thread-safe.</p>
  24.  *
  25.  * @since 1.3
  26.  */
  27. final class SoundexUtils {

  28.     /**
  29.      * Cleans up the input string before Soundex processing by only returning
  30.      * upper case letters.
  31.      *
  32.      * @param str
  33.      *                  The String to clean.
  34.      * @return A clean String.
  35.      */
  36.     static String clean(final String str) {
  37.         if (isEmpty(str)) {
  38.             return str;
  39.         }
  40.         final int len = str.length();
  41.         final char[] chars = new char[len];
  42.         int count = 0;
  43.         for (int i = 0; i < len; i++) {
  44.             if (Character.isLetter(str.charAt(i))) {
  45.                 chars[count++] = str.charAt(i);
  46.             }
  47.         }
  48.         if (count == len) {
  49.             return str.toUpperCase(java.util.Locale.ENGLISH);
  50.         }
  51.         return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
  52.     }

  53.     /**
  54.      * Encodes the Strings and returns the number of characters in the two
  55.      * encoded Strings that are the same.
  56.      * <ul>
  57.      * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
  58.      * little or no similarity, and 4 indicates strong similarity or identical
  59.      * values.</li>
  60.      * <li>For refined Soundex, the return value can be greater than 4.</li>
  61.      * </ul>
  62.      *
  63.      * @param encoder
  64.      *                  The encoder to use to encode the Strings.
  65.      * @param s1
  66.      *                  A String that will be encoded and compared.
  67.      * @param s2
  68.      *                  A String that will be encoded and compared.
  69.      * @return The number of characters in the two Soundex encoded Strings that
  70.      *             are the same.
  71.      *
  72.      * @see #differenceEncoded(String,String)
  73.      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  74.      *          MS T-SQL DIFFERENCE</a>
  75.      *
  76.      * @throws EncoderException
  77.      *                  if an error occurs encoding one of the strings
  78.      */
  79.     static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
  80.         return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
  81.     }

  82.     /**
  83.      * Returns the number of characters in the two Soundex encoded Strings that
  84.      * are the same.
  85.      * <ul>
  86.      * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
  87.      * little or no similarity, and 4 indicates strong similarity or identical
  88.      * values.</li>
  89.      * <li>For refined Soundex, the return value can be greater than 4.</li>
  90.      * </ul>
  91.      *
  92.      * @param es1
  93.      *                  An encoded String.
  94.      * @param es2
  95.      *                  An encoded String.
  96.      * @return The number of characters in the two Soundex encoded Strings that
  97.      *             are the same.
  98.      *
  99.      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
  100.      *          MS T-SQL DIFFERENCE</a>
  101.      */
  102.     static int differenceEncoded(final String es1, final String es2) {

  103.         if (es1 == null || es2 == null) {
  104.             return 0;
  105.         }
  106.         final int lengthToMatch = Math.min(es1.length(), es2.length());
  107.         int diff = 0;
  108.         for (int i = 0; i < lengthToMatch; i++) {
  109.             if (es1.charAt(i) == es2.charAt(i)) {
  110.                 diff++;
  111.             }
  112.         }
  113.         return diff;
  114.     }

  115.     /**
  116.      * <p>Checks if a CharSequence is empty ("") or null.</p>
  117.      *
  118.      * <pre>
  119.      * StringUtils.isEmpty(null)      = true
  120.      * StringUtils.isEmpty("")        = true
  121.      * StringUtils.isEmpty(" ")       = false
  122.      * StringUtils.isEmpty("bob")     = false
  123.      * StringUtils.isEmpty("  bob  ") = false
  124.      * </pre>
  125.      *
  126.      * @param cs  the CharSequence to check, may be null
  127.      * @return {@code true} if the CharSequence is empty or null
  128.      */
  129.     static boolean isEmpty(final CharSequence cs) {
  130.         return cs == null || cs.length() == 0;
  131.     }

  132. }