View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.io.UnsupportedEncodingException;
20  import java.util.Arrays;
21  import java.util.Collection;
22  import java.util.Collections;
23  import java.util.HashMap;
24  import java.util.Iterator;
25  import java.util.LinkedHashMap;
26  import java.util.LinkedHashSet;
27  import java.util.Map;
28  import java.util.Map.Entry;
29  import java.util.Objects;
30  import java.util.Set;
31  
/**
 * <p>
 * Converts text from one alphabet to another, with the possibility of leaving certain characters unencoded.
 * </p>
 *
 * <p>
 * The target (encoding) alphabet is expected to be in the Unicode BMP, because encoded groups are measured in
 * {@code char}s; the source alphabet may contain supplementary code points.
 * </p>
 *
 * <p>
 * All encodings have the same fixed length, except for the 'do not encode' characters, which are copied through
 * as they are.
 * </p>
 *
 * <h3>Sample usage</h3>
 *
 * <pre>
 * Character[] originals; // a, b, c, d
 * Character[] encoding; // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, encoding, doNotEncode);
 *
 * ac.encode("a"); // 00
 * ac.encode("b"); // 01
 * ac.encode("c"); // 0d
 * ac.encode("d"); // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are threadsafe as they do not change internal state.
 * </p>
 *
 * @since 1.0
 *
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";
    /**
     * Line separator, used for converting the object into a string.
     */
    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    /**
     * Maps each code point of the original alphabet to its encoded form. Characters that are left unencoded map
     * to themselves.
     */
    private final Map<Integer, String> originalToEncoded;
    /**
     * Maps each encoded form back to the original letter (held as a string).
     */
    private final Map<String, String> encodedToOriginal;
    /**
     * Length, in chars, of the group that {@link #decode(String)} reads for every letter that is not copied
     * through unencoded.
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded mapping from original code point to encoded form
     * @param encodedToOriginal mapping from encoded form back to the original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded, final Map<String, String> encodedToOriginal,
            final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Encode a given string.
     *
     * @param original the string to be encoded
     * @return the encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are encountered
     */
    public String encode(final String original) throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // Walk by code point so that supplementary characters of the source alphabet are looked up whole.
        for (int i = 0; i < original.length();) {
            final int codepoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codepoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '" + codePointToString(codepoint) + "' in " + original);
            }

            sb.append(nextLetter);

            i += Character.charCount(codepoint);
        }

        return sb.toString();
    }

    /**
     * Decode a given string.
     *
     * <p>
     * A character that maps to itself is copied through and skipped by its full UTF-16 length; anything else is
     * read as a group of {@link #getEncodedCharLength()} chars and looked up in the reverse mapping.
     * </p>
     *
     * @param encoded a string that has been encoded using this AlphabetConverter
     * @return the decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered
     */
    public String decode(final String encoded) throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int codepoint = encoded.codePointAt(j);
            final String s = codePointToString(codepoint);

            if (s.equals(originalToEncoded.get(codepoint))) {
                result.append(s);
                // Advance by the UTF-16 length of the unencoded character. This is 1 for BMP characters, and 2
                // for a supplementary one, whose low surrogate must not be re-read as the start of a new letter.
                j += s.length();
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j, j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding (" + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Get the length of characters in the encoded alphabet that are necessary for each character in the original
     * alphabet.
     *
     * @return the length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Get the mapping from integer code point of source language to encoded string. Use to reconstruct converter from
     * serialized map.
     *
     * @return an unmodifiable view of the original-to-encoded map
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    /**
     * Recursive method used when creating encoder/decoder. Each recursion level appends one encoding letter to
     * {@code currentEncoding}; at level 0 the finished encoding is assigned to the next original letter.
     *
     * @param level number of encoding letters still to be appended before an encoding is assigned
     * @param currentEncoding the encoding built so far
     * @param encoding letters of the encoding alphabet
     * @param originals iterator over the original letters that still need an encoding
     * @param doNotEncodeMap map of values that should not be encoded
     */
    @SuppressWarnings("PMD")
    private void addSingleEncoding(final int level, final String currentEncoding, final Collection<Integer> encoding,
            final Iterator<Integer> originals, final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (originals.hasNext()) {

                    // this skips the doNotEncode chars if they are in the
                    // leftmost place
                    if (level != encodedLetterLength || !doNotEncodeMap.containsKey(encodingLetter)) {
                        addSingleEncoding(level - 1, currentEncoding + codePointToString(encodingLetter), encoding,
                                originals, doNotEncodeMap);
                    }
                } else {
                    return; // done encoding all the original alphabet
                }
            }
        } else {
            Integer next = originals.next();

            // Letters that must not be encoded map to themselves and do not consume the current encoding.
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString, originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Lists every mapping as {@code original -> encoded}, one per line, in the iteration order of the internal map.
     *
     * @return the textual representation of this converter
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();

        for (final Entry<Integer, String> entry : originalToEncoded.entrySet()) {
            sb.append(codePointToString(entry.getKey())).append(ARROW).append(entry.getValue()).append(LINE_SEPARATOR);
        }

        return sb.toString();
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded) && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded, encodedToOriginal, encodedLetterLength);
    }

    // -- static methods

    /**
     * Create a new converter from a map.
     *
     * <p>
     * The given map is copied, so later changes to it do not affect the returned converter.
     * </p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return the reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
        // Defensive copy: merely wrapping the caller's map would let later mutations of it desynchronise the two
        // lookup tables. LinkedHashMap keeps the caller's iteration order, so toString() output is unchanged.
        final Map<Integer, String> originalToEncodedCopy = new LinkedHashMap<>(originalToEncoded);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e : originalToEncodedCopy.entrySet()) {
            final String originalAsString = codePointToString(e.getKey());
            final String encodedLetter = e.getValue();

            encodedToOriginal.put(encodedLetter, originalAsString);

            // The group length is the longest encoding found in the map.
            if (encodedLetter.length() > encodedLetterLength) {
                encodedLetterLength = encodedLetter.length();
            }
        }

        return new AlphabetConverter(originalToEncodedCopy, encodedToOriginal, encodedLetterLength);
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used for encoding
     * @param doNotEncode an array of chars to be encoded using the original alphabet - every char here must appear in
     *            both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed
     */
    public static AlphabetConverter createConverterFromChars(final Character[] original, final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(convertCharsToIntegers(original), convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Convert characters to integers.
     *
     * @param chars array of characters, may be {@code null}
     * @return an equivalent array of integers, empty if {@code chars} is {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        for (int i = 0; i < chars.length; i++) {
            integers[i] = (int) chars[i];
        }
        return integers;
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * <p>
     * When the encoding alphabet is at least as large as the original one, every letter is encoded with a single
     * letter; otherwise a fixed group length of two or more is computed and encodings are generated recursively.
     * </p>
     *
     * @param original an array of ints representing the original alphabet in codepoints
     * @param encoding an array of ints representing the alphabet to be used for encoding, in codepoints
     * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char
     *            here must appear in both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed
     */
    public static AlphabetConverter createConverter(final Integer[] original, final Integer[] encoding, final Integer[] doNotEncode) {

        // Insertion-ordered sets drop duplicates while keeping the caller's letter order.
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        int encodedLetterLength;

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString = codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter, originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // Encoding letters reserved for unencoded characters cannot be handed out to other letters.
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        } else {
            // we start with one which is our minimum, and because we do the
            // first division outside the loop
            int lettersSoFar = 1;

            // the first division takes into account that the doNotEncode
            // letters can't be in the leftmost place
            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                    / (encodingCopy.size() - doNotEncodeCopy.size());

            while (lettersLeft / encodingCopy.size() >= 1) {
                lettersLeft = lettersLeft / encodingCopy.size();
                lettersSoFar++;
            }

            encodedLetterLength = lettersSoFar + 1;

            final AlphabetConverter ac = new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

            // Fills both maps held by the new converter.
            ac.addSingleEncoding(encodedLetterLength, "", encodingCopy, originalCopy.iterator(), doNotEncodeMap);

            return ac;
        }
    }

    /**
     * Create new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }
}