001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import java.util.Locale;
021    
022    import org.apache.commons.codec.EncoderException;
023    import org.apache.commons.codec.StringEncoder;
024    
025    /**
026     * Encodes a string into a Cologne Phonetic value.
027     * <p>
028     * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a>
029     * (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969.
030     * <p>
031     * The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language.
032     * It is related to the well-known soundex algorithm.
033     * <p>
034     *
035     * <h2>Algorithm</h2>
036     *
037     * <ul>
038     *
039     * <li>
040     * <h3>Step 1:</h3>
041     * After preprocessing (conversion to upper case, transcription of <a
042     * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
043     * letters of the supplied text are replaced by their phonetic code according to the following table.
044     * <table border="1">
045     * <tbody>
046     * <tr>
047     * <th>Letter</th>
048     * <th>Context</th>
049     * <th align="center">Code</th>
050     * </tr>
051     * <tr>
052     * <td>A, E, I, J, O, U, Y</td>
053     * <td></td>
054     * <td align="center">0</td>
055     * </tr>
056     * <tr>
057     *
058     * <td>H</td>
059     * <td></td>
060     * <td align="center">-</td>
061     * </tr>
062     * <tr>
063     * <td>B</td>
064     * <td></td>
065     * <td rowspan="2" align="center">1</td>
066     * </tr>
067     * <tr>
068     * <td>P</td>
069     * <td>not before H</td>
070     *
071     * </tr>
072     * <tr>
073     * <td>D, T</td>
074     * <td>not before C, S, Z</td>
075     * <td align="center">2</td>
076     * </tr>
077     * <tr>
078     * <td>F, V, W</td>
079     * <td></td>
080     * <td rowspan="2" align="center">3</td>
081     * </tr>
082     * <tr>
083     *
084     * <td>P</td>
085     * <td>before H</td>
086     * </tr>
087     * <tr>
088     * <td>G, K, Q</td>
089     * <td></td>
090     * <td rowspan="3" align="center">4</td>
091     * </tr>
092     * <tr>
093     * <td rowspan="2">C</td>
094     * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
095     *
096     * </tr>
097     * <tr>
098     * <td>before A, H, K, O, Q, U, X except after S, Z</td>
099     * </tr>
100     * <tr>
101     * <td>X</td>
102     * <td>not after C, K, Q</td>
103     * <td align="center">48</td>
104     * </tr>
105     * <tr>
106     * <td>L</td>
107     * <td></td>
108     *
109     * <td align="center">5</td>
110     * </tr>
111     * <tr>
112     * <td>M, N</td>
113     * <td></td>
114     * <td align="center">6</td>
115     * </tr>
116     * <tr>
117     * <td>R</td>
118     * <td></td>
119     * <td align="center">7</td>
120     * </tr>
121     *
122     * <tr>
123     * <td>S, Z</td>
124     * <td></td>
125     * <td rowspan="6" align="center">8</td>
126     * </tr>
127     * <tr>
128     * <td rowspan="3">C</td>
129     * <td>after S, Z</td>
130     * </tr>
131     * <tr>
132     * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
133     * </tr>
134     *
135     * <tr>
136     * <td>not before A, H, K, O, Q, U, X</td>
137     * </tr>
138     * <tr>
139     * <td>D, T</td>
140     * <td>before C, S, Z</td>
141     * </tr>
142     * <tr>
143     * <td>X</td>
144     * <td>after C, K, Q</td>
145     * </tr>
146     * </tbody>
147     * </table>
148     * <p>
149     * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
150     * K&ouml;lner Phonetik -- Buchstabencodes</a>)</i></small>
151     * </p>
152     *
153     * <h4>Example:</h4>
154     *
155     * {@code "M}&uuml;{@code ller-L}&uuml;{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"}
156     *
157     * </li>
158     *
159     * <li>
160     * <h3>Step 2:</h3>
161     * Collapse of all multiple consecutive code digits.
162     * <h4>Example:</h4>
163     * {@code "6005507500206880022" => "6050750206802"}</li>
164     *
165     * <li>
166     * <h3>Step 3:</h3>
167     * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
168     * if they occur after removing the "0" digits.
169     *
170     * <h4>Example:</h4>
171     * {@code "6050750206802" => "65752682"}</li>
172     *
173     * </ul>
174     *
175     * This class is thread-safe.
176     *
177     * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
178     * @since 1.5
179     */
180    public class ColognePhonetic implements StringEncoder {
181    
182        // Predefined char arrays for better performance and less GC load
183        private static final char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' };
184        private static final char[] SCZ = new char[] { 'S', 'C', 'Z' };
185        private static final char[] WFPV = new char[] { 'W', 'F', 'P', 'V' };
186        private static final char[] GKQ = new char[] { 'G', 'K', 'Q' };
187        private static final char[] CKQ = new char[] { 'C', 'K', 'Q' };
188        private static final char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' };
189        private static final char[] SZ = new char[] { 'S', 'Z' };
190        private static final char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' };
191        private static final char[] TDX = new char[] { 'T', 'D', 'X' };
192    
193        /**
194         * This class is not thread-safe; the field {@link #length} is mutable.
195         * However, it is not shared between threads, as it is constructed on demand
196         * by the method {@link ColognePhonetic#colognePhonetic(String)}
197         */
198        private abstract class CologneBuffer {
199    
200            protected final char[] data;
201    
202            protected int length = 0;
203    
204            public CologneBuffer(final char[] data) {
205                this.data = data;
206                this.length = data.length;
207            }
208    
209            public CologneBuffer(final int buffSize) {
210                this.data = new char[buffSize];
211                this.length = 0;
212            }
213    
214            protected abstract char[] copyData(int start, final int length);
215    
216            public int length() {
217                return length;
218            }
219    
220            @Override
221            public String toString() {
222                return new String(copyData(0, length));
223            }
224        }
225    
226        private class CologneOutputBuffer extends CologneBuffer {
227    
228            public CologneOutputBuffer(final int buffSize) {
229                super(buffSize);
230            }
231    
232            public void addRight(final char chr) {
233                data[length] = chr;
234                length++;
235            }
236    
237            @Override
238            protected char[] copyData(final int start, final int length) {
239                final char[] newData = new char[length];
240                System.arraycopy(data, start, newData, 0, length);
241                return newData;
242            }
243        }
244    
245        private class CologneInputBuffer extends CologneBuffer {
246    
247            public CologneInputBuffer(final char[] data) {
248                super(data);
249            }
250    
251            public void addLeft(final char ch) {
252                length++;
253                data[getNextPos()] = ch;
254            }
255    
256            @Override
257            protected char[] copyData(final int start, final int length) {
258                final char[] newData = new char[length];
259                System.arraycopy(data, data.length - this.length + start, newData, 0, length);
260                return newData;
261            }
262    
263            public char getNextChar() {
264                return data[getNextPos()];
265            }
266    
267            protected int getNextPos() {
268                return data.length - length;
269            }
270    
271            public char removeNext() {
272                final char ch = getNextChar();
273                length--;
274                return ch;
275            }
276        }
277    
278        /**
279         * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
280         * <ul>
281         * <li>capital a, umlaut mark</li>
282         * <li>capital u, umlaut mark</li>
283         * <li>capital o, umlaut mark</li>
284         * <li>small sharp s, German</li>
285         * </ul>
286         */
287        private static final char[][] PREPROCESS_MAP = new char[][]{
288            {'\u00C4', 'A'}, // capital a, umlaut mark
289            {'\u00DC', 'U'}, // capital u, umlaut mark
290            {'\u00D6', 'O'}, // capital o, umlaut mark
291            {'\u00DF', 'S'} // small sharp s, German
292        };
293    
294        /*
295         * Returns whether the array contains the key, or not.
296         */
297        private static boolean arrayContains(final char[] arr, final char key) {
298            for (final char element : arr) {
299                if (element == key) {
300                    return true;
301                }
302            }
303            return false;
304        }
305    
306        /**
307         * <p>
308         * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
309         * </p>
310         * <p>
311         * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
312         * </p>
313         *
314         * @param text
315         * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm
316         */
317        public String colognePhonetic(String text) {
318            if (text == null) {
319                return null;
320            }
321    
322            text = preprocess(text);
323    
324            final CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
325            final CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());
326    
327            char nextChar;
328    
329            char lastChar = '-';
330            char lastCode = '/';
331            char code;
332            char chr;
333    
334            int rightLength = input.length();
335    
336            while (rightLength > 0) {
337                chr = input.removeNext();
338    
339                if ((rightLength = input.length()) > 0) {
340                    nextChar = input.getNextChar();
341                } else {
342                    nextChar = '-';
343                }
344    
345                if (arrayContains(AEIJOUY, chr)) {
346                    code = '0';
347                } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
348                    if (lastCode == '/') {
349                        continue;
350                    }
351                    code = '-';
352                } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
353                    code = '1';
354                } else if ((chr == 'D' || chr == 'T') && !arrayContains(SCZ, nextChar)) {
355                    code = '2';
356                } else if (arrayContains(WFPV, chr)) {
357                    code = '3';
358                } else if (arrayContains(GKQ, chr)) {
359                    code = '4';
360                } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) {
361                    code = '4';
362                    input.addLeft('S');
363                    rightLength++;
364                } else if (chr == 'S' || chr == 'Z') {
365                    code = '8';
366                } else if (chr == 'C') {
367                    if (lastCode == '/') {
368                        if (arrayContains(AHKLOQRUX, nextChar)) {
369                            code = '4';
370                        } else {
371                            code = '8';
372                        }
373                    } else {
374                        if (arrayContains(SZ, lastChar) || !arrayContains(AHOUKQX, nextChar)) {
375                            code = '8';
376                        } else {
377                            code = '4';
378                        }
379                    }
380                } else if (arrayContains(TDX, chr)) {
381                    code = '8';
382                } else if (chr == 'R') {
383                    code = '7';
384                } else if (chr == 'L') {
385                    code = '5';
386                } else if (chr == 'M' || chr == 'N') {
387                    code = '6';
388                } else {
389                    code = chr;
390                }
391    
392                if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
393                    output.addRight(code);
394                }
395    
396                lastChar = chr;
397                lastCode = code;
398            }
399            return output.toString();
400        }
401    
402        @Override
403        public Object encode(final Object object) throws EncoderException {
404            if (!(object instanceof String)) {
405                throw new EncoderException("This method's parameter was expected to be of the type " +
406                    String.class.getName() +
407                    ". But actually it was of the type " +
408                    object.getClass().getName() +
409                    ".");
410            }
411            return encode((String) object);
412        }
413    
414        @Override
415        public String encode(final String text) {
416            return colognePhonetic(text);
417        }
418    
419        public boolean isEncodeEqual(final String text1, final String text2) {
420            return colognePhonetic(text1).equals(colognePhonetic(text2));
421        }
422    
423        /**
424         * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
425         */
426        private String preprocess(String text) {
427            text = text.toUpperCase(Locale.GERMAN);
428    
429            final char[] chrs = text.toCharArray();
430    
431            for (int index = 0; index < chrs.length; index++) {
432                if (chrs[index] > 'Z') {
433                    for (final char[] element : PREPROCESS_MAP) {
434                        if (chrs[index] == element[0]) {
435                            chrs[index] = element[1];
436                            break;
437                        }
438                    }
439                }
440            }
441            return new String(chrs);
442        }
443    }