ColognePhonetic.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.language;

  18. import java.util.Locale;

  19. import org.apache.commons.codec.EncoderException;
  20. import org.apache.commons.codec.StringEncoder;

  21. /**
  22.  * Encodes a string into a Cologne Phonetic value.
  23.  * <p>
  24.  * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a> (Cologne
  25.  * Phonetic) algorithm issued by Hans Joachim Postel in 1969.
  26.  * </p>
  27.  * <p>
  28.  * The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language. It is related to
  29.  * the well-known soundex algorithm.
  30.  * </p>
  31.  *
  32.  * <h2>Algorithm</h2>
  33.  *
  34.  * <ul>
  35.  *
  36.  * <li>
  37.  * <h3>Step 1:</h3>
  38.  * After preprocessing (conversion to upper case, transcription of <a
  39.  * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
  40.  * letters of the supplied text are replaced by their phonetic code according to the following table.
  41.  * <table border="1">
  42.  * <caption style="caption-side: bottom"><small><i>(Source: <a
  43.  * href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes">Wikipedia (de): K&ouml;lner Phonetik --
  44.  * Buchstabencodes</a>)</i></small></caption> <tbody>
  45.  * <tr>
  46.  * <th>Letter</th>
  47.  * <th>Context</th>
  48.  * <th align="center">Code</th>
  49.  * </tr>
  50.  * <tr>
  51.  * <td>A, E, I, J, O, U, Y</td>
  52.  * <td></td>
  53.  * <td align="center">0</td>
  54.  * </tr>
  55.  * <tr>
  56.  *
  57.  * <td>H</td>
  58.  * <td></td>
  59.  * <td align="center">-</td>
  60.  * </tr>
  61.  * <tr>
  62.  * <td>B</td>
  63.  * <td></td>
  64.  * <td rowspan="2" align="center">1</td>
  65.  * </tr>
  66.  * <tr>
  67.  * <td>P</td>
  68.  * <td>not before H</td>
  69.  *
  70.  * </tr>
  71.  * <tr>
  72.  * <td>D, T</td>
  73.  * <td>not before C, S, Z</td>
  74.  * <td align="center">2</td>
  75.  * </tr>
  76.  * <tr>
  77.  * <td>F, V, W</td>
  78.  * <td></td>
  79.  * <td rowspan="2" align="center">3</td>
  80.  * </tr>
  81.  * <tr>
  82.  *
  83.  * <td>P</td>
  84.  * <td>before H</td>
  85.  * </tr>
  86.  * <tr>
  87.  * <td>G, K, Q</td>
  88.  * <td></td>
  89.  * <td rowspan="3" align="center">4</td>
  90.  * </tr>
  91.  * <tr>
  92.  * <td rowspan="2">C</td>
  93.  * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
  94.  *
  95.  * </tr>
  96.  * <tr>
  97.  * <td>before A, H, K, O, Q, U, X except after S, Z</td>
  98.  * </tr>
  99.  * <tr>
  100.  * <td>X</td>
  101.  * <td>not after C, K, Q</td>
  102.  * <td align="center">48</td>
  103.  * </tr>
  104.  * <tr>
  105.  * <td>L</td>
  106.  * <td></td>
  107.  *
  108.  * <td align="center">5</td>
  109.  * </tr>
  110.  * <tr>
  111.  * <td>M, N</td>
  112.  * <td></td>
  113.  * <td align="center">6</td>
  114.  * </tr>
  115.  * <tr>
  116.  * <td>R</td>
  117.  * <td></td>
  118.  * <td align="center">7</td>
  119.  * </tr>
  120.  *
  121.  * <tr>
  122.  * <td>S, Z</td>
  123.  * <td></td>
  124.  * <td rowspan="6" align="center">8</td>
  125.  * </tr>
  126.  * <tr>
  127.  * <td rowspan="3">C</td>
  128.  * <td>after S, Z</td>
  129.  * </tr>
  130.  * <tr>
  131.  * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
  132.  * </tr>
  133.  *
  134.  * <tr>
  135.  * <td>not before A, H, K, O, Q, U, X</td>
  136.  * </tr>
  137.  * <tr>
  138.  * <td>D, T</td>
  139.  * <td>before C, S, Z</td>
  140.  * </tr>
  141.  * <tr>
  142.  * <td>X</td>
  143.  * <td>after C, K, Q</td>
  144.  * </tr>
  145.  * </tbody>
  146.  * </table>
  147.  *
  148.  * <h4>Example:</h4>
  149.  *
  150.  * <code>"M</code>&uuml;<code>ller-L</code>&uuml;
  151.  * <code>denscheidt" =&gt; "MULLERLUDENSCHEIDT" =&gt; "6005507500206880022"</code>
  152.  *
  153.  * </li>
  154.  *
  155.  * <li>
  156.  * <h3>Step 2:</h3>
  157.  * Collapse of all multiple consecutive code digits.
  158.  * <h4>Example:</h4>
  159.  * <code>"6005507500206880022" =&gt; "6050750206802"</code></li>
  160.  *
  161.  * <li>
  162.  * <h3>Step 3:</h3>
  163.  * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
  164.  * if they occur after removing the "0" digits.
  165.  *
  166.  * <h4>Example:</h4>
  167.  * <code>"6050750206802" =&gt; "65752682"</code></li>
  168.  *
  169.  * </ul>
  170.  *
  171.  * <p>
  172.  * This class is thread-safe.
  173.  * </p>
  174.  *
  175.  * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
  176.  * @since 1.5
  177.  */
  178. public class ColognePhonetic implements StringEncoder {

  179.     // Predefined char arrays for better performance and less GC load
  180.     private static final char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' };
  181.     private static final char[] SCZ = new char[] { 'S', 'C', 'Z' };
  182.     private static final char[] WFPV = new char[] { 'W', 'F', 'P', 'V' };
  183.     private static final char[] GKQ = new char[] { 'G', 'K', 'Q' };
  184.     private static final char[] CKQ = new char[] { 'C', 'K', 'Q' };
  185.     private static final char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' };
  186.     private static final char[] SZ = new char[] { 'S', 'Z' };
  187.     private static final char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' };
  188.     private static final char[] TDX = new char[] { 'T', 'D', 'X' };

  189.     /**
  190.      * This class is not thread-safe; the field {@link #length} is mutable.
  191.      * However, it is not shared between threads, as it is constructed on demand
  192.      * by the method {@link ColognePhonetic#colognePhonetic(String)}
  193.      */
  194.     private abstract class CologneBuffer {

  195.         protected final char[] data;

  196.         protected int length = 0;

  197.         public CologneBuffer(final char[] data) {
  198.             this.data = data;
  199.             this.length = data.length;
  200.         }

  201.         public CologneBuffer(final int buffSize) {
  202.             this.data = new char[buffSize];
  203.             this.length = 0;
  204.         }

  205.         protected abstract char[] copyData(int start, final int length);

  206.         public int length() {
  207.             return length;
  208.         }

  209.         @Override
  210.         public String toString() {
  211.             return new String(copyData(0, length));
  212.         }
  213.     }

  214.     private class CologneOutputBuffer extends CologneBuffer {

  215.         public CologneOutputBuffer(final int buffSize) {
  216.             super(buffSize);
  217.         }

  218.         public void addRight(final char chr) {
  219.             data[length] = chr;
  220.             length++;
  221.         }

  222.         @Override
  223.         protected char[] copyData(final int start, final int length) {
  224.             final char[] newData = new char[length];
  225.             System.arraycopy(data, start, newData, 0, length);
  226.             return newData;
  227.         }
  228.     }

  229.     private class CologneInputBuffer extends CologneBuffer {

  230.         public CologneInputBuffer(final char[] data) {
  231.             super(data);
  232.         }

  233.         public void addLeft(final char ch) {
  234.             length++;
  235.             data[getNextPos()] = ch;
  236.         }

  237.         @Override
  238.         protected char[] copyData(final int start, final int length) {
  239.             final char[] newData = new char[length];
  240.             System.arraycopy(data, data.length - this.length + start, newData, 0, length);
  241.             return newData;
  242.         }

  243.         public char getNextChar() {
  244.             return data[getNextPos()];
  245.         }

  246.         protected int getNextPos() {
  247.             return data.length - length;
  248.         }

  249.         public char removeNext() {
  250.             final char ch = getNextChar();
  251.             length--;
  252.             return ch;
  253.         }
  254.     }

  255.     /**
  256.      * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
  257.      * <ul>
  258.      * <li>capital a, umlaut mark</li>
  259.      * <li>capital u, umlaut mark</li>
  260.      * <li>capital o, umlaut mark</li>
  261.      * <li>small sharp s, German</li>
  262.      * </ul>
  263.      */
  264.     private static final char[][] PREPROCESS_MAP = new char[][]{
  265.         {'\u00C4', 'A'}, // capital a, umlaut mark
  266.         {'\u00DC', 'U'}, // capital u, umlaut mark
  267.         {'\u00D6', 'O'}, // capital o, umlaut mark
  268.         {'\u00DF', 'S'} // small sharp s, German
  269.     };

  270.     /*
  271.      * Returns whether the array contains the key, or not.
  272.      */
  273.     private static boolean arrayContains(final char[] arr, final char key) {
  274.         for (final char element : arr) {
  275.             if (element == key) {
  276.                 return true;
  277.             }
  278.         }
  279.         return false;
  280.     }

  281.     /**
  282.      * <p>
  283.      * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
  284.      * </p>
  285.      * <p>
  286.      * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
  287.      * </p>
  288.      *
  289.      * @param text The source text to encode
  290.      * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm
  291.      */
  292.     public String colognePhonetic(String text) {
  293.         if (text == null) {
  294.             return null;
  295.         }

  296.         text = preprocess(text);

  297.         final CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
  298.         final CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());

  299.         char nextChar;

  300.         char lastChar = '-';
  301.         char lastCode = '/';
  302.         char code;
  303.         char chr;

  304.         int rightLength = input.length();

  305.         while (rightLength > 0) {
  306.             chr = input.removeNext();

  307.             if ((rightLength = input.length()) > 0) {
  308.                 nextChar = input.getNextChar();
  309.             } else {
  310.                 nextChar = '-';
  311.             }

  312.             if (arrayContains(AEIJOUY, chr)) {
  313.                 code = '0';
  314.             } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
  315.                 if (lastCode == '/') {
  316.                     continue;
  317.                 }
  318.                 code = '-';
  319.             } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
  320.                 code = '1';
  321.             } else if ((chr == 'D' || chr == 'T') && !arrayContains(SCZ, nextChar)) {
  322.                 code = '2';
  323.             } else if (arrayContains(WFPV, chr)) {
  324.                 code = '3';
  325.             } else if (arrayContains(GKQ, chr)) {
  326.                 code = '4';
  327.             } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) {
  328.                 code = '4';
  329.                 input.addLeft('S');
  330.                 rightLength++;
  331.             } else if (chr == 'S' || chr == 'Z') {
  332.                 code = '8';
  333.             } else if (chr == 'C') {
  334.                 if (lastCode == '/') {
  335.                     if (arrayContains(AHKLOQRUX, nextChar)) {
  336.                         code = '4';
  337.                     } else {
  338.                         code = '8';
  339.                     }
  340.                 } else {
  341.                     if (arrayContains(SZ, lastChar) || !arrayContains(AHOUKQX, nextChar)) {
  342.                         code = '8';
  343.                     } else {
  344.                         code = '4';
  345.                     }
  346.                 }
  347.             } else if (arrayContains(TDX, chr)) {
  348.                 code = '8';
  349.             } else if (chr == 'R') {
  350.                 code = '7';
  351.             } else if (chr == 'L') {
  352.                 code = '5';
  353.             } else if (chr == 'M' || chr == 'N') {
  354.                 code = '6';
  355.             } else {
  356.                 code = chr;
  357.             }

  358.             if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
  359.                 output.addRight(code);
  360.             }

  361.             lastChar = chr;
  362.             lastCode = code;
  363.         }
  364.         return output.toString();
  365.     }

  366.     @Override
  367.     public Object encode(final Object object) throws EncoderException {
  368.         if (!(object instanceof String)) {
  369.             throw new EncoderException("This method's parameter was expected to be of the type " +
  370.                 String.class.getName() +
  371.                 ". But actually it was of the type " +
  372.                 object.getClass().getName() +
  373.                 ".");
  374.         }
  375.         return encode((String) object);
  376.     }

  377.     @Override
  378.     public String encode(final String text) {
  379.         return colognePhonetic(text);
  380.     }

  381.     public boolean isEncodeEqual(final String text1, final String text2) {
  382.         return colognePhonetic(text1).equals(colognePhonetic(text2));
  383.     }

  384.     /**
  385.      * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
  386.      */
  387.     private String preprocess(String text) {
  388.         text = text.toUpperCase(Locale.GERMAN);

  389.         final char[] chrs = text.toCharArray();

  390.         for (int index = 0; index < chrs.length; index++) {
  391.             if (chrs[index] > 'Z') {
  392.                 for (final char[] element : PREPROCESS_MAP) {
  393.                     if (chrs[index] == element[0]) {
  394.                         chrs[index] = element[1];
  395.                         break;
  396.                     }
  397.                 }
  398.             }
  399.         }
  400.         return new String(chrs);
  401.     }
  402. }