001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import java.util.Locale;
021    
022    import org.apache.commons.codec.EncoderException;
023    import org.apache.commons.codec.StringEncoder;
024    
025    /**
026     * Encodes a string into a Cologne Phonetic value.
027     * <p>
028     * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a>
029     * (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969.
030     * <p>
031     * The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language.
032     * It is related to the well-known soundex algorithm.
033     * <p>
034     *
035     * <h2>Algorithm</h2>
036     *
037     * <ul>
038     *
039     * <li>
040     * <h3>Step 1:</h3>
041     * After preprocessing (conversion to upper case, transcription of <a
042     * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
043     * letters of the supplied text are replaced by their phonetic code according to the following table.
044     * <table border="1">
045     * <tbody>
046     * <tr>
047     * <th>Letter</th>
048     * <th>Context</th>
049     * <th align="center">Code</th>
050     * </tr>
051     * <tr>
052     * <td>A, E, I, J, O, U, Y</td>
053     * <td></td>
054     * <td align="center">0</td>
055     * </tr>
056     * <tr>
057     *
058     * <td>H</td>
059     * <td></td>
060     * <td align="center">-</td>
061     * </tr>
062     * <tr>
063     * <td>B</td>
064     * <td></td>
065     * <td rowspan="2" align="center">1</td>
066     * </tr>
067     * <tr>
068     * <td>P</td>
069     * <td>not before H</td>
070     *
071     * </tr>
072     * <tr>
073     * <td>D, T</td>
074     * <td>not before C, S, Z</td>
075     * <td align="center">2</td>
076     * </tr>
077     * <tr>
078     * <td>F, V, W</td>
079     * <td></td>
080     * <td rowspan="2" align="center">3</td>
081     * </tr>
082     * <tr>
083     *
084     * <td>P</td>
085     * <td>before H</td>
086     * </tr>
087     * <tr>
088     * <td>G, K, Q</td>
089     * <td></td>
090     * <td rowspan="3" align="center">4</td>
091     * </tr>
092     * <tr>
093     * <td rowspan="2">C</td>
094     * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
095     *
096     * </tr>
097     * <tr>
098     * <td>before A, H, K, O, Q, U, X except after S, Z</td>
099     * </tr>
100     * <tr>
101     * <td>X</td>
102     * <td>not after C, K, Q</td>
103     * <td align="center">48</td>
104     * </tr>
105     * <tr>
106     * <td>L</td>
107     * <td></td>
108     *
109     * <td align="center">5</td>
110     * </tr>
111     * <tr>
112     * <td>M, N</td>
113     * <td></td>
114     * <td align="center">6</td>
115     * </tr>
116     * <tr>
117     * <td>R</td>
118     * <td></td>
119     * <td align="center">7</td>
120     * </tr>
121     *
122     * <tr>
123     * <td>S, Z</td>
124     * <td></td>
125     * <td rowspan="6" align="center">8</td>
126     * </tr>
127     * <tr>
128     * <td rowspan="3">C</td>
129     * <td>after S, Z</td>
130     * </tr>
131     * <tr>
132     * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
133     * </tr>
134     *
135     * <tr>
136     * <td>not before A, H, K, O, Q, U, X</td>
137     * </tr>
138     * <tr>
139     * <td>D, T</td>
140     * <td>before C, S, Z</td>
141     * </tr>
142     * <tr>
143     * <td>X</td>
144     * <td>after C, K, Q</td>
145     * </tr>
146     * </tbody>
147     * </table>
148     * <p>
149     * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
150     * K&ouml;lner Phonetik -- Buchstabencodes</a>)</i></small>
151     * </p>
152     *
153     * <h4>Example:</h4>
154     *
155     * {@code "M}&uuml;{@code ller-L}&uuml;{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"}
156     *
157     * </li>
158     *
159     * <li>
160     * <h3>Step 2:</h3>
161     * Collapse of all multiple consecutive code digits.
162     * <h4>Example:</h4>
163     * {@code "6005507500206880022" => "6050750206802"}</li>
164     *
165     * <li>
166     * <h3>Step 3:</h3>
167     * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
168     * if they occur after removing the "0" digits.
169     *
170     * <h4>Example:</h4>
171     * {@code "6050750206802" => "65752682"}</li>
172     *
173     * </ul>
174     *
175     * This class is thread-safe.
176     *
177     * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
178     * @since 1.5
179     */
180    public class ColognePhonetic implements StringEncoder {
181    
182        /**
183         * This class is not thread-safe; the field {@link #length} is mutable.
184         * However, it is not shared between threads, as it is constructed on demand
185         * by the method {@link ColognePhonetic#colognePhonetic(String)}
186         */
187        private abstract class CologneBuffer {
188    
189            protected final char[] data;
190    
191            protected int length = 0;
192    
193            public CologneBuffer(char[] data) {
194                this.data = data;
195                this.length = data.length;
196            }
197    
198            public CologneBuffer(int buffSize) {
199                this.data = new char[buffSize];
200                this.length = 0;
201            }
202    
203            protected abstract char[] copyData(int start, final int length);
204    
205            public int length() {
206                return length;
207            }
208    
209            @Override
210            public String toString() {
211                return new String(copyData(0, length));
212            }
213        }
214    
215        private class CologneOutputBuffer extends CologneBuffer {
216    
217            public CologneOutputBuffer(int buffSize) {
218                super(buffSize);
219            }
220    
221            public void addRight(char chr) {
222                data[length] = chr;
223                length++;
224            }
225    
226            @Override
227            protected char[] copyData(int start, final int length) {
228                char[] newData = new char[length];
229                System.arraycopy(data, start, newData, 0, length);
230                return newData;
231            }
232        }
233    
234        private class CologneInputBuffer extends CologneBuffer {
235    
236            public CologneInputBuffer(char[] data) {
237                super(data);
238            }
239    
240            public void addLeft(char ch) {
241                length++;
242                data[getNextPos()] = ch;
243            }
244    
245            @Override
246            protected char[] copyData(int start, final int length) {
247                char[] newData = new char[length];
248                System.arraycopy(data, data.length - this.length + start, newData, 0, length);
249                return newData;
250            }
251    
252            public char getNextChar() {
253                return data[getNextPos()];
254            }
255    
256            protected int getNextPos() {
257                return data.length - length;
258            }
259    
260            public char removeNext() {
261                char ch = getNextChar();
262                length--;
263                return ch;
264            }
265        }
266    
267        /**
268         * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
269         * <ul>
270         * <li>capital a, umlaut mark</li>
271         * <li>capital u, umlaut mark</li>
272         * <li>capital o, umlaut mark</li>
273         * <li>small sharp s, German</li>
274         * </ul>
275         */
276        private static final char[][] PREPROCESS_MAP = new char[][]{
277            {'\u00C4', 'A'}, // capital a, umlaut mark
278            {'\u00DC', 'U'}, // capital u, umlaut mark
279            {'\u00D6', 'O'}, // capital o, umlaut mark
280            {'\u00DF', 'S'} // small sharp s, German
281        };
282    
283        /*
284         * Returns whether the array contains the key, or not.
285         */
286        private static boolean arrayContains(char[] arr, char key) {
287            for (char element : arr) {
288                if (element == key) {
289                    return true;
290                }
291            }
292            return false;
293        }
294    
295        /**
296         * <p>
297         * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
298         * </p>
299         * <p>
300         * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
301         * </p>
302         *
303         * @param text
304         * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm
305         */
306        public String colognePhonetic(String text) {
307            if (text == null) {
308                return null;
309            }
310    
311            text = preprocess(text);
312    
313            CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
314            CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());
315    
316            char nextChar;
317    
318            char lastChar = '-';
319            char lastCode = '/';
320            char code;
321            char chr;
322    
323            int rightLength = input.length();
324    
325            while (rightLength > 0) {
326                chr = input.removeNext();
327    
328                if ((rightLength = input.length()) > 0) {
329                    nextChar = input.getNextChar();
330                } else {
331                    nextChar = '-';
332                }
333    
334                if (arrayContains(new char[]{'A', 'E', 'I', 'J', 'O', 'U', 'Y'}, chr)) {
335                    code = '0';
336                } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
337                    if (lastCode == '/') {
338                        continue;
339                    }
340                    code = '-';
341                } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
342                    code = '1';
343                } else if ((chr == 'D' || chr == 'T') && !arrayContains(new char[]{'S', 'C', 'Z'}, nextChar)) {
344                    code = '2';
345                } else if (arrayContains(new char[]{'W', 'F', 'P', 'V'}, chr)) {
346                    code = '3';
347                } else if (arrayContains(new char[]{'G', 'K', 'Q'}, chr)) {
348                    code = '4';
349                } else if (chr == 'X' && !arrayContains(new char[]{'C', 'K', 'Q'}, lastChar)) {
350                    code = '4';
351                    input.addLeft('S');
352                    rightLength++;
353                } else if (chr == 'S' || chr == 'Z') {
354                    code = '8';
355                } else if (chr == 'C') {
356                    if (lastCode == '/') {
357                        if (arrayContains(new char[]{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}, nextChar)) {
358                            code = '4';
359                        } else {
360                            code = '8';
361                        }
362                    } else {
363                        if (arrayContains(new char[]{'S', 'Z'}, lastChar) ||
364                            !arrayContains(new char[]{'A', 'H', 'O', 'U', 'K', 'Q', 'X'}, nextChar)) {
365                            code = '8';
366                        } else {
367                            code = '4';
368                        }
369                    }
370                } else if (arrayContains(new char[]{'T', 'D', 'X'}, chr)) {
371                    code = '8';
372                } else if (chr == 'R') {
373                    code = '7';
374                } else if (chr == 'L') {
375                    code = '5';
376                } else if (chr == 'M' || chr == 'N') {
377                    code = '6';
378                } else {
379                    code = chr;
380                }
381    
382                if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
383                    output.addRight(code);
384                }
385    
386                lastChar = chr;
387                lastCode = code;
388            }
389            return output.toString();
390        }
391    
392        @Override
393        public Object encode(Object object) throws EncoderException {
394            if (!(object instanceof String)) {
395                throw new EncoderException("This method's parameter was expected to be of the type " +
396                    String.class.getName() +
397                    ". But actually it was of the type " +
398                    object.getClass().getName() +
399                    ".");
400            }
401            return encode((String) object);
402        }
403    
404        @Override
405        public String encode(String text) {
406            return colognePhonetic(text);
407        }
408    
409        public boolean isEncodeEqual(String text1, String text2) {
410            return colognePhonetic(text1).equals(colognePhonetic(text2));
411        }
412    
413        /**
414         * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
415         */
416        private String preprocess(String text) {
417            text = text.toUpperCase(Locale.GERMAN);
418    
419            char[] chrs = text.toCharArray();
420    
421            for (int index = 0; index < chrs.length; index++) {
422                if (chrs[index] > 'Z') {
423                    for (char[] element : PREPROCESS_MAP) {
424                        if (chrs[index] == element[0]) {
425                            chrs[index] = element[1];
426                            break;
427                        }
428                    }
429                }
430            }
431            return new String(chrs);
432        }
433    }