View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import java.util.Locale;
21  
22  import org.apache.commons.codec.EncoderException;
23  import org.apache.commons.codec.StringEncoder;
24  
25  /**
26   * Encodes a string into a Cologne Phonetic value.
27   * <p>
28   * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a> (Cologne
29   * Phonetic) algorithm issued by Hans Joachim Postel in 1969.
30   * </p>
31   * <p>
32   * The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language. It is related to
33   * the well-known soundex algorithm.
34   * </p>
35   *
36   * <h2>Algorithm</h2>
37   *
38   * <ul>
39   *
40   * <li>
41   * <h3>Step 1:</h3>
42   * After preprocessing (conversion to upper case, transcription of <a
43   * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
44   * letters of the supplied text are replaced by their phonetic code according to the following table.
45   * <table border="1">
46   * <caption style="caption-side: bottom"><small><i>(Source: <a
47   * href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes">Wikipedia (de): K&ouml;lner Phonetik --
48   * Buchstabencodes</a>)</i></small></caption> <tbody>
49   * <tr>
50   * <th>Letter</th>
51   * <th>Context</th>
52   * <th align="center">Code</th>
53   * </tr>
54   * <tr>
55   * <td>A, E, I, J, O, U, Y</td>
56   * <td></td>
57   * <td align="center">0</td>
58   * </tr>
59   * <tr>
60   *
61   * <td>H</td>
62   * <td></td>
63   * <td align="center">-</td>
64   * </tr>
65   * <tr>
66   * <td>B</td>
67   * <td></td>
68   * <td rowspan="2" align="center">1</td>
69   * </tr>
70   * <tr>
71   * <td>P</td>
72   * <td>not before H</td>
73   *
74   * </tr>
75   * <tr>
76   * <td>D, T</td>
77   * <td>not before C, S, Z</td>
78   * <td align="center">2</td>
79   * </tr>
80   * <tr>
81   * <td>F, V, W</td>
82   * <td></td>
83   * <td rowspan="2" align="center">3</td>
84   * </tr>
85   * <tr>
86   *
87   * <td>P</td>
88   * <td>before H</td>
89   * </tr>
90   * <tr>
91   * <td>G, K, Q</td>
92   * <td></td>
93   * <td rowspan="3" align="center">4</td>
94   * </tr>
95   * <tr>
96   * <td rowspan="2">C</td>
97   * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
98   *
99   * </tr>
100  * <tr>
101  * <td>before A, H, K, O, Q, U, X except after S, Z</td>
102  * </tr>
103  * <tr>
104  * <td>X</td>
105  * <td>not after C, K, Q</td>
106  * <td align="center">48</td>
107  * </tr>
108  * <tr>
109  * <td>L</td>
110  * <td></td>
111  *
112  * <td align="center">5</td>
113  * </tr>
114  * <tr>
115  * <td>M, N</td>
116  * <td></td>
117  * <td align="center">6</td>
118  * </tr>
119  * <tr>
120  * <td>R</td>
121  * <td></td>
122  * <td align="center">7</td>
123  * </tr>
124  *
125  * <tr>
126  * <td>S, Z</td>
127  * <td></td>
128  * <td rowspan="6" align="center">8</td>
129  * </tr>
130  * <tr>
131  * <td rowspan="3">C</td>
132  * <td>after S, Z</td>
133  * </tr>
134  * <tr>
135  * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
136  * </tr>
137  *
138  * <tr>
139  * <td>not before A, H, K, O, Q, U, X</td>
140  * </tr>
141  * <tr>
142  * <td>D, T</td>
143  * <td>before C, S, Z</td>
144  * </tr>
145  * <tr>
146  * <td>X</td>
147  * <td>after C, K, Q</td>
148  * </tr>
149  * </tbody>
150  * </table>
151  *
152  * <h4>Example:</h4>
153  *
154  * <code>"M</code>&uuml;<code>ller-L</code>&uuml;
155  * <code>denscheidt" =&gt; "MULLERLUDENSCHEIDT" =&gt; "6005507500206880022"</code>
156  *
157  * </li>
158  *
159  * <li>
160  * <h3>Step 2:</h3>
161  * Collapse of all multiple consecutive code digits.
162  * <h4>Example:</h4>
163  * <code>"6005507500206880022" =&gt; "6050750206802"</code></li>
164  *
165  * <li>
166  * <h3>Step 3:</h3>
167  * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
168  * if they occur after removing the "0" digits.
169  *
170  * <h4>Example:</h4>
171  * <code>"6050750206802" =&gt; "65752682"</code></li>
172  *
173  * </ul>
174  *
175  * <p>
176  * This class is thread-safe.
177  * </p>
178  *
179  * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
180  * @since 1.5
181  */
182 public class ColognePhonetic implements StringEncoder {
183 
184     // Predefined char arrays for better performance and less GC load
185     private static final char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' };
186     private static final char[] SCZ = new char[] { 'S', 'C', 'Z' };
187     private static final char[] WFPV = new char[] { 'W', 'F', 'P', 'V' };
188     private static final char[] GKQ = new char[] { 'G', 'K', 'Q' };
189     private static final char[] CKQ = new char[] { 'C', 'K', 'Q' };
190     private static final char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' };
191     private static final char[] SZ = new char[] { 'S', 'Z' };
192     private static final char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' };
193     private static final char[] TDX = new char[] { 'T', 'D', 'X' };
194 
195     /**
196      * This class is not thread-safe; the field {@link #length} is mutable.
197      * However, it is not shared between threads, as it is constructed on demand
198      * by the method {@link ColognePhonetic#colognePhonetic(String)}
199      */
200     private abstract class CologneBuffer {
201 
202         protected final char[] data;
203 
204         protected int length = 0;
205 
206         public CologneBuffer(final char[] data) {
207             this.data = data;
208             this.length = data.length;
209         }
210 
211         public CologneBuffer(final int buffSize) {
212             this.data = new char[buffSize];
213             this.length = 0;
214         }
215 
216         protected abstract char[] copyData(int start, final int length);
217 
218         public int length() {
219             return length;
220         }
221 
222         @Override
223         public String toString() {
224             return new String(copyData(0, length));
225         }
226     }
227 
228     private class CologneOutputBuffer extends CologneBuffer {
229 
230         public CologneOutputBuffer(final int buffSize) {
231             super(buffSize);
232         }
233 
234         public void addRight(final char chr) {
235             data[length] = chr;
236             length++;
237         }
238 
239         @Override
240         protected char[] copyData(final int start, final int length) {
241             final char[] newData = new char[length];
242             System.arraycopy(data, start, newData, 0, length);
243             return newData;
244         }
245     }
246 
247     private class CologneInputBuffer extends CologneBuffer {
248 
249         public CologneInputBuffer(final char[] data) {
250             super(data);
251         }
252 
253         public void addLeft(final char ch) {
254             length++;
255             data[getNextPos()] = ch;
256         }
257 
258         @Override
259         protected char[] copyData(final int start, final int length) {
260             final char[] newData = new char[length];
261             System.arraycopy(data, data.length - this.length + start, newData, 0, length);
262             return newData;
263         }
264 
265         public char getNextChar() {
266             return data[getNextPos()];
267         }
268 
269         protected int getNextPos() {
270             return data.length - length;
271         }
272 
273         public char removeNext() {
274             final char ch = getNextChar();
275             length--;
276             return ch;
277         }
278     }
279 
280     /**
281      * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
282      * <ul>
283      * <li>capital a, umlaut mark</li>
284      * <li>capital u, umlaut mark</li>
285      * <li>capital o, umlaut mark</li>
286      * <li>small sharp s, German</li>
287      * </ul>
288      */
289     private static final char[][] PREPROCESS_MAP = new char[][]{
290         {'\u00C4', 'A'}, // capital a, umlaut mark
291         {'\u00DC', 'U'}, // capital u, umlaut mark
292         {'\u00D6', 'O'}, // capital o, umlaut mark
293         {'\u00DF', 'S'} // small sharp s, German
294     };
295 
296     /*
297      * Returns whether the array contains the key, or not.
298      */
299     private static boolean arrayContains(final char[] arr, final char key) {
300         for (final char element : arr) {
301             if (element == key) {
302                 return true;
303             }
304         }
305         return false;
306     }
307 
308     /**
309      * <p>
310      * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
311      * </p>
312      * <p>
313      * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
314      * </p>
315      *
316      * @param text The source text to encode
317      * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm
318      */
319     public String colognePhonetic(String text) {
320         if (text == null) {
321             return null;
322         }
323 
324         text = preprocess(text);
325 
326         final CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
327         final CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());
328 
329         char nextChar;
330 
331         char lastChar = '-';
332         char lastCode = '/';
333         char code;
334         char chr;
335 
336         int rightLength = input.length();
337 
338         while (rightLength > 0) {
339             chr = input.removeNext();
340 
341             if ((rightLength = input.length()) > 0) {
342                 nextChar = input.getNextChar();
343             } else {
344                 nextChar = '-';
345             }
346 
347             if (arrayContains(AEIJOUY, chr)) {
348                 code = '0';
349             } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
350                 if (lastCode == '/') {
351                     continue;
352                 }
353                 code = '-';
354             } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
355                 code = '1';
356             } else if ((chr == 'D' || chr == 'T') && !arrayContains(SCZ, nextChar)) {
357                 code = '2';
358             } else if (arrayContains(WFPV, chr)) {
359                 code = '3';
360             } else if (arrayContains(GKQ, chr)) {
361                 code = '4';
362             } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) {
363                 code = '4';
364                 input.addLeft('S');
365                 rightLength++;
366             } else if (chr == 'S' || chr == 'Z') {
367                 code = '8';
368             } else if (chr == 'C') {
369                 if (lastCode == '/') {
370                     if (arrayContains(AHKLOQRUX, nextChar)) {
371                         code = '4';
372                     } else {
373                         code = '8';
374                     }
375                 } else {
376                     if (arrayContains(SZ, lastChar) || !arrayContains(AHOUKQX, nextChar)) {
377                         code = '8';
378                     } else {
379                         code = '4';
380                     }
381                 }
382             } else if (arrayContains(TDX, chr)) {
383                 code = '8';
384             } else if (chr == 'R') {
385                 code = '7';
386             } else if (chr == 'L') {
387                 code = '5';
388             } else if (chr == 'M' || chr == 'N') {
389                 code = '6';
390             } else {
391                 code = chr;
392             }
393 
394             if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
395                 output.addRight(code);
396             }
397 
398             lastChar = chr;
399             lastCode = code;
400         }
401         return output.toString();
402     }
403 
404     @Override
405     public Object encode(final Object object) throws EncoderException {
406         if (!(object instanceof String)) {
407             throw new EncoderException("This method's parameter was expected to be of the type " +
408                 String.class.getName() +
409                 ". But actually it was of the type " +
410                 object.getClass().getName() +
411                 ".");
412         }
413         return encode((String) object);
414     }
415 
416     @Override
417     public String encode(final String text) {
418         return colognePhonetic(text);
419     }
420 
421     public boolean isEncodeEqual(final String text1, final String text2) {
422         return colognePhonetic(text1).equals(colognePhonetic(text2));
423     }
424 
425     /**
426      * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
427      */
428     private String preprocess(String text) {
429         text = text.toUpperCase(Locale.GERMAN);
430 
431         final char[] chrs = text.toCharArray();
432 
433         for (int index = 0; index < chrs.length; index++) {
434             if (chrs[index] > 'Z') {
435                 for (final char[] element : PREPROCESS_MAP) {
436                     if (chrs[index] == element[0]) {
437                         chrs[index] = element[1];
438                         break;
439                     }
440                 }
441             }
442         }
443         return new String(chrs);
444     }
445 }