View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.io.UnsupportedEncodingException;
20  import java.util.Arrays;
21  import java.util.Collection;
22  import java.util.Collections;
23  import java.util.HashMap;
24  import java.util.Iterator;
25  import java.util.LinkedHashMap;
26  import java.util.LinkedHashSet;
27  import java.util.Map;
28  import java.util.Map.Entry;
29  import java.util.Objects;
30  import java.util.Set;
31  
32  import org.apache.commons.lang3.ArrayUtils;
33  import org.apache.commons.lang3.StringUtils;
34  
/**
 * <p>
 * Convert from one alphabet to another, with the possibility of leaving certain
 * characters unencoded.
 * </p>
 *
 * <p>
 * The target and 'do not encode' languages must be in the Unicode BMP, but the
 * source language need not be.
 * </p>
 *
 * <p>
 * The encodings will all be of a fixed length, except for the 'do not encode'
 * chars, which will be of length 1.
 * </p>
 *
 * <h2>Sample usage</h2>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state.
 * </p>
 *
 * @since 1.0
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Creates new String that contains just the given code point.
     *
     * @param i code point.
     * @return a new string with the new code point.
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }

    /**
     * Converts characters to integers.
     *
     * @param chars array of characters, may be {@code null} or empty.
     * @return an equivalent array of integers; an empty array if {@code chars} is {@code null} or empty.
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        Arrays.setAll(integers, i -> (int) chars[i]);
        return integers;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving the characters in <em>doNotEncode</em>
     * as they are (if possible).
     *
     * <p>
     * Duplicate letters in either original or encoding will be ignored.
     * </p>
     *
     * <p>
     * When the encoding alphabet has at least as many distinct letters as the original alphabet, every original letter is mapped to a single encoding letter.
     * Otherwise each encoded (non 'do not encode') letter is a fixed-length group of encoding letters whose first letter is never a 'do not encode' letter.
     * </p>
     *
     * @param original    an array of ints representing the original alphabet in code points.
     * @param encoding    an array of ints representing the alphabet to be used for encoding, in code points.
     * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char here must appear in both the previous
     *                    params.
     * @return The AlphabetConverter.
     * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed.
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        // LinkedHashSet drops duplicates while keeping the caller's ordering,
        // which makes the generated mapping deterministic.
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        final int encodedLetterLength;

        // every 'do not encode' letter must be present in both alphabets
        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // enough encoding letters for a one-to-one mapping
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString = codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter, originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // 'do not encode' letters are reserved for themselves
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

        }
        if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        }
        // we start with one which is our minimum, and because we do the
        // first division outside the loop
        int lettersSoFar = 1;

        // the first division takes into account that the doNotEncode
        // letters can't be in the leftmost place
        int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                / (encodingCopy.size() - doNotEncodeCopy.size());

        while (lettersLeft / encodingCopy.size() >= 1) {
            lettersLeft /= encodingCopy.size();
            lettersSoFar++;
        }

        encodedLetterLength = lettersSoFar + 1;

        final AlphabetConverter ac =
                new AlphabetConverter(originalToEncoded,
                        encodedToOriginal,
                        encodedLetterLength);

        // fills both maps of the freshly created converter
        ac.addSingleEncoding(encodedLetterLength,
                "",
                encodingCopy,
                originalCopy.iterator(),
                doNotEncodeMap);

        return ac;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Creates a new converter from a map.
     *
     * <p>
     * The given map is copied, so later changes to it do not affect the returned converter.
     * The encoded letter length is the length of the longest value in the map (at least 1).
     * </p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded(), must not be {@code null}.
     * @return The reconstructed AlphabetConverter.
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
        // Defensive copy: Collections.unmodifiableMap alone is only a view, so
        // without the copy a caller could still change the converter's state
        // (and desynchronize it from encodedToOriginal) through the original map.
        final Map<Integer, String> unmodifiableOriginalToEncoded =
                Collections.unmodifiableMap(new LinkedHashMap<>(originalToEncoded));
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) {
            encodedToOriginal.put(e.getValue(), codePointToString(e.getKey()));

            if (e.getValue().length() > encodedLetterLength) {
                encodedLetterLength = e.getValue().length();
            }
        }

        return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength);
    }

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded string back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length of the encoded letter.
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded map from original code point to encoded string.
     * @param encodedToOriginal map from encoded string to original letter.
     * @param encodedLetterLength length of the encoded letter.
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Recursive method used when creating encoder/decoder.
     *
     * <p>
     * Each recursion level appends one encoding letter to {@code currentEncoding}; at level 0 the
     * accumulated string is assigned to the next original letter that is not in {@code doNotEncodeMap}.
     * 'Do not encode' letters met on the way are mapped to themselves.
     * </p>
     *
     * @param level at which point it should add a single encoding.
     * @param currentEncoding current encoding.
     * @param encoding letters encoding.
     * @param originals original values.
     * @param doNotEncodeMap map of values that should not be encoded.
     */
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (!originals.hasNext()) {
                    return; // done encoding all the original alphabet
                }
                // this skips the doNotEncode chars if they are in the
                // leftmost place
                if (level != encodedLetterLength
                        || !doNotEncodeMap.containsKey(encodingLetter)) {
                    addSingleEncoding(level - 1,
                            currentEncoding
                                    + codePointToString(encodingLetter),
                            encoding,
                            originals,
                            doNotEncodeMap
                    );
                }
            }
        } else {
            Integer next = originals.next();

            // letters that must not be encoded map to themselves and do not
            // consume the current encoding
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Decodes a given string.
     *
     * @param encoded a string that has been encoded using this AlphabetConverter.
     * @return The decoded string, {@code null} if the given string is null.
     * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered.
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int i = encoded.codePointAt(j);
            final String s = codePointToString(i);

            if (s.equals(originalToEncoded.get(i))) {
                // the letter is mapped to itself: copy it through
                result.append(s);
                // advance by the number of chars of this code point: 1 for
                // the BMP, 2 for a supplementary code point (surrogate pair)
                j += Character.charCount(i);
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Encodes a given string.
     *
     * @param original the string to be encoded.
     * @return The encoded string, {@code null} if the given string is null.
     * @throws UnsupportedEncodingException if chars that are not supported are encountered.
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // iterate by code point so that supplementary characters of the
        // source alphabet are looked up as a whole
        for (int i = 0; i < original.length();) {
            final int codePoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codePoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codePoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codePoint);
        }

        return sb.toString();
    }

    /**
     * Compares this converter with another object: two converters are equal when both mappings and the
     * encoded letter length are equal.
     *
     * @param obj the object to compare with.
     * @return {@code true} if {@code obj} is an equal AlphabetConverter.
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    /**
     * Gets the length of characters in the encoded alphabet that are necessary for each character in the original alphabet.
     *
     * @return The length of the encoded char.
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Gets the mapping from integer code point of source language to encoded string. Use to reconstruct converter from serialized map.
     *
     * @return The original map, as an unmodifiable map.
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Returns one line per original letter, in the form {@code <original> -> <encoded>}.
     *
     * @return the string representation of the mapping.
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        // @formatter:off
        originalToEncoded.forEach((k, v) ->
            sb.append(codePointToString(k))
              .append(ARROW)
              .append(v) // the encoded form, not the numeric code point
              .append(System.lineSeparator()));
        // @formatter:on
        return sb.toString();
    }
}