View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.io.UnsupportedEncodingException;
20  import java.util.Arrays;
21  import java.util.Collection;
22  import java.util.Collections;
23  import java.util.HashMap;
24  import java.util.Iterator;
25  import java.util.LinkedHashMap;
26  import java.util.LinkedHashSet;
27  import java.util.Map;
28  import java.util.Map.Entry;
29  import java.util.Objects;
30  import java.util.Set;
31  
32  import org.apache.commons.lang3.ArrayUtils;
33  import org.apache.commons.lang3.StringUtils;
34  
/**
 * <p>
 * Converts from one alphabet to another, with the possibility of leaving certain
 * characters unencoded.
 * </p>
 *
 * <p>
 * The target and 'do not encode' languages must be in the Unicode BMP, but the
 * source language need not be.
 * </p>
 *
 * <p>
 * The encodings will all be of a fixed length, except for the 'do not encode'
 * chars, which will be of length 1.
 * </p>
 *
 * <h2>Sample usage</h2>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state.
 * </p>
 *
 * @since 1.0
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Creates new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }

    /**
     * Converts characters to integers.
     *
     * @param chars array of characters, may be {@code null} or empty
     * @return an equivalent array of integers, empty if the input is
     *         {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        Arrays.setAll(integers, i -> (int) chars[i]);
        return integers;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * <p>If the encoding alphabet is at least as large as the original
     * alphabet every letter is encoded with a single letter; otherwise a
     * fixed-length multi-letter encoding is built in which the
     * <em>doNotEncode</em> letters never occupy the leftmost position.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 code points
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in code points
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                   constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        // LinkedHashSets drop duplicates while keeping the caller's ordering,
        // which determines the order in which encodings are handed out.
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        final int encodedLetterLength;

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // Enough encoding letters for a one-to-one mapping.
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString = codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter, originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // 'do not encode' letters are reserved for themselves
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

        }

        // letters that may be used in the leftmost place of an encoding
        final int leadingLetters = encodingCopy.size() - doNotEncodeCopy.size();
        if (leadingLetters < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + leadingLetters);
        }
        // we start with one which is our minimum, and because we do the
        // first division outside the loop
        int lettersSoFar = 1;

        // the first division takes into account that the doNotEncode
        // letters can't be in the leftmost place
        int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size()) / leadingLetters;

        while (lettersLeft / encodingCopy.size() >= 1) {
            lettersLeft /= encodingCopy.size();
            lettersSoFar++;
        }

        // NOTE: because of the integer divisions above this length is always
        // sufficient but not always minimal; it is kept as is so that
        // existing encodings remain stable.
        encodedLetterLength = lettersSoFar + 1;

        final AlphabetConverter ac =
                new AlphabetConverter(originalToEncoded,
                        encodedToOriginal,
                        encodedLetterLength);

        ac.addSingleEncoding(encodedLetterLength,
                "",
                encodingCopy,
                originalCopy.iterator(),
                doNotEncodeMap);

        return ac;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Creates a new converter from a map.
     *
     * <p>The given map is copied, so later changes to it do not affect the
     * returned converter.</p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return The reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
        // Defensive copy: the reverse map and the letter length are derived
        // once below, so the forward map must not change underneath them.
        final Map<Integer, String> unmodifiableOriginalToEncoded =
                Collections.unmodifiableMap(new LinkedHashMap<>(originalToEncoded));
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) {
            encodedToOriginal.put(e.getValue(), codePointToString(e.getKey()));

            // the longest encoded value determines the fixed encoding length
            if (e.getValue().length() > encodedLetterLength) {
                encodedLetterLength = e.getValue().length();
            }
        }

        return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength);
    }

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded string back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length of the encoded letter.
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded map from original code point to encoded string
     * @param encodedToOriginal map from encoded string to original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Recursive method used when creating encoder/decoder.
     *
     * <p>Each recursion level appends one encoding letter to
     * {@code currentEncoding}; at level 0 the finished encoding is assigned to
     * the next original letter that is not in the 'do not encode' map.</p>
     *
     * @param level at which point it should add a single encoding
     * @param currentEncoding current encoding
     * @param encoding letters encoding
     * @param originals original values
     * @param doNotEncodeMap map of values that should not be encoded
     */
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (!originals.hasNext()) {
                    return; // done encoding all the original alphabet
                }
                // this skips the doNotEncode chars if they are in the
                // leftmost place
                if (level != encodedLetterLength
                        || !doNotEncodeMap.containsKey(encodingLetter)) {
                    addSingleEncoding(level - 1,
                            currentEncoding
                                    + codePointToString(encodingLetter),
                            encoding,
                            originals,
                            doNotEncodeMap
                    );
                }
            }
        } else {
            Integer next = originals.next();

            // 'do not encode' letters map to themselves and do not consume
            // the current encoding
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Decodes a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return The decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int i = encoded.codePointAt(j);
            final String s = codePointToString(i);

            if (s.equals(originalToEncoded.get(i))) {
                // a letter that encodes to itself is copied through as is
                result.append(s);
                // advance by the number of chars of the code point (1 for
                // the BMP) so the index never lands inside a surrogate pair
                j += Character.charCount(i);
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Encodes a given string.
     *
     * @param original the string to be encoded
     * @return The encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // iterate by code point so supplementary characters are handled
        for (int i = 0; i < original.length();) {
            final int codePoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codePoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codePoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codePoint);
        }

        return sb.toString();
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    /**
     * Gets the length of characters in the encoded alphabet that are necessary
     * for each character in the original
     * alphabet.
     *
     * @return The length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Gets the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from
     * serialized map.
     *
     * @return The original map, as an unmodifiable view
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Returns one line per letter of the original alphabet, in the form
     * {@code original -> encoded}.
     *
     * @return a string representation of the mapping
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        // @formatter:off
        originalToEncoded.forEach((k, v) ->
            sb.append(codePointToString(k))
              .append(ARROW)
              .append(v)
              .append(System.lineSeparator()));
        // @formatter:on
        return sb.toString();
    }
}
505 }