001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.io.UnsupportedEncodingException;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.LinkedHashSet;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.Objects;
030import java.util.Set;
031
032import org.apache.commons.lang3.ArrayUtils;
033import org.apache.commons.lang3.StringUtils;
034
035/**
036 * <p>
037 * Convert from one alphabet to another, with the possibility of leaving certain
038 * characters unencoded.
039 * </p>
040 *
041 * <p>
042 * The target and 'do not encode' languages must be in the Unicode BMP, but the
043 * source language does not.
044 * </p>
045 *
046 * <p>
047 * The encoding will all be of a fixed length, except for the 'do not encode'
048 * chars, which will be of length 1
049 * </p>
050 *
051 * <h2>Sample usage</h2>
052 *
053 * <pre>
054 * Character[] originals;   // a, b, c, d
055 * Character[] encoding;    // 0, 1, d
056 * Character[] doNotEncode; // d
057 *
058 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
059 * encoding, doNotEncode);
060 *
061 * ac.encode("a");    // 00
062 * ac.encode("b");    // 01
063 * ac.encode("c");    // 0d
064 * ac.encode("d");    // d
065 * ac.encode("abcd"); // 00010dd
066 * </pre>
067 *
068 * <p>
069 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
070 * change internal state.
071 * </p>
072 *
073 * @since 1.0
074 */
075public final class AlphabetConverter {
076
077    /**
078     * Arrow constant, used for converting the object into a string.
079     */
080    private static final String ARROW = " -> ";
081
082    /**
083     * Creates new String that contains just the given code point.
084     *
085     * @param i code point.
086     * @return a new string with the new code point.
087     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
088     */
089    private static String codePointToString(final int i) {
090        if (Character.charCount(i) == 1) {
091            return String.valueOf((char) i);
092        }
093        return new String(Character.toChars(i));
094    }
095
096    /**
097     * Converts characters to integers.
098     *
099     * @param chars array of characters.
100     * @return an equivalent array of integers.
101     */
102    private static Integer[] convertCharsToIntegers(final Character[] chars) {
103        if (ArrayUtils.isEmpty(chars)) {
104            return ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY;
105        }
106        final Integer[] integers = new Integer[chars.length];
107        Arrays.setAll(integers, i -> (int) chars[i]);
108        return integers;
109    }
110
111    /**
112     * Creates an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving the characters in <em>doNotEncode</em>
113     * as they are (if possible).
114     *
115     * <p>
116     * Duplicate letters in either original or encoding will be ignored.
117     * </p>
118     *
119     * @param original    an array of ints representing the original alphabet in code points.
120     * @param encoding    an array of ints representing the alphabet to be used for encoding, in code points.
121     * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char here must appear in both the previous
122     *                    params.
123     * @return The AlphabetConverter.
124     * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed.
125     */
126    public static AlphabetConverter createConverter(
127            final Integer[] original,
128            final Integer[] encoding,
129            final Integer[] doNotEncode) {
130        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
131        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
132        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));
133
134        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
135        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
136        final Map<Integer, String> doNotEncodeMap = new HashMap<>();
137
138        final int encodedLetterLength;
139
140        for (final int i : doNotEncodeCopy) {
141            if (!originalCopy.contains(i)) {
142                throw new IllegalArgumentException(
143                        "Can not use 'do not encode' list because original "
144                                + "alphabet does not contain '"
145                                + codePointToString(i) + "'");
146            }
147
148            if (!encodingCopy.contains(i)) {
149                throw new IllegalArgumentException(
150                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
151                                + codePointToString(i) + "'");
152            }
153
154            doNotEncodeMap.put(i, codePointToString(i));
155        }
156
157        if (encodingCopy.size() >= originalCopy.size()) {
158            encodedLetterLength = 1;
159
160            final Iterator<Integer> it = encodingCopy.iterator();
161
162            for (final int originalLetter : originalCopy) {
163                final String originalLetterAsString = codePointToString(originalLetter);
164
165                if (doNotEncodeMap.containsKey(originalLetter)) {
166                    originalToEncoded.put(originalLetter, originalLetterAsString);
167                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
168                } else {
169                    Integer next = it.next();
170
171                    while (doNotEncodeCopy.contains(next)) {
172                        next = it.next();
173                    }
174
175                    final String encodedLetter = codePointToString(next);
176
177                    originalToEncoded.put(originalLetter, encodedLetter);
178                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
179                }
180            }
181
182            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);
183
184        }
185        if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
186            throw new IllegalArgumentException(
187                    "Must have at least two encoding characters (excluding "
188                            + "those in the 'do not encode' list), but has "
189                            + (encodingCopy.size() - doNotEncodeCopy.size()));
190        }
191        // we start with one which is our minimum, and because we do the
192        // first division outside the loop
193        int lettersSoFar = 1;
194
195        // the first division takes into account that the doNotEncode
196        // letters can't be in the leftmost place
197        int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
198                / (encodingCopy.size() - doNotEncodeCopy.size());
199
200        while (lettersLeft / encodingCopy.size() >= 1) {
201            lettersLeft /= encodingCopy.size();
202            lettersSoFar++;
203        }
204
205        encodedLetterLength = lettersSoFar + 1;
206
207        final AlphabetConverter ac =
208                new AlphabetConverter(originalToEncoded,
209                        encodedToOriginal,
210                        encodedLetterLength);
211
212        ac.addSingleEncoding(encodedLetterLength,
213                StringUtils.EMPTY,
214                encodingCopy,
215                originalCopy.iterator(),
216                doNotEncodeMap);
217
218        return ac;
219    }
220
221    /**
222     * Creates an alphabet converter, for converting from the original alphabet,
223     * to the encoded alphabet, while leaving the characters in
224     * <em>doNotEncode</em> as they are (if possible).
225     *
226     * <p>Duplicate letters in either original or encoding will be ignored.</p>
227     *
228     * @param original an array of chars representing the original alphabet
229     * @param encoding an array of chars representing the alphabet to be used
230     *                 for encoding
231     * @param doNotEncode an array of chars to be encoded using the original
232     *                    alphabet - every char here must appear in
233     *                    both the previous params
234     * @return The AlphabetConverter
235     * @throws IllegalArgumentException if an AlphabetConverter cannot be
236     *                                  constructed
237     */
238    public static AlphabetConverter createConverterFromChars(
239            final Character[] original,
240            final Character[] encoding,
241            final Character[] doNotEncode) {
242        return createConverter(
243                convertCharsToIntegers(original),
244                convertCharsToIntegers(encoding),
245                convertCharsToIntegers(doNotEncode));
246    }
247
248    /**
249     * Creates a new converter from a map.
250     *
251     * @param originalToEncoded a map returned from getOriginalToEncoded().
252     * @return The reconstructed AlphabetConverter.
253     * @see AlphabetConverter#getOriginalToEncoded()
254     */
255    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
256        final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded);
257        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
258
259        int encodedLetterLength = 1;
260
261        for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) {
262            encodedToOriginal.put(e.getValue(), codePointToString(e.getKey()));
263
264            if (e.getValue().length() > encodedLetterLength) {
265                encodedLetterLength = e.getValue().length();
266            }
267        }
268
269        return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength);
270    }
271
272    /**
273     * Original string to be encoded.
274     */
275    private final Map<Integer, String> originalToEncoded;
276
277    /**
278     * Encoding alphabet.
279     */
280    private final Map<String, String> encodedToOriginal;
281
282    /**
283     * Length of the encoded letter.
284     */
285    private final int encodedLetterLength;
286
287    /**
288     * Hidden constructor for alphabet converter. Used by static helper methods.
289     *
290     * @param originalToEncoded original string to be encoded.
291     * @param encodedToOriginal encoding alphabet.
292     * @param encodedLetterLength length of the encoded letter.
293     */
294    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
295                              final Map<String, String> encodedToOriginal,
296                              final int encodedLetterLength) {
297
298        this.originalToEncoded = originalToEncoded;
299        this.encodedToOriginal = encodedToOriginal;
300        this.encodedLetterLength = encodedLetterLength;
301    }
302
303    /**
304     * Recursive method used when creating encoder/decoder.
305     *
306     * @param level at which point it should add a single encoding.
307     * @param currentEncoding current encoding.
308     * @param encoding letters encoding.
309     * @param originals original values.
310     * @param doNotEncodeMap map of values that should not be encoded.
311     */
312    private void addSingleEncoding(final int level,
313                                   final String currentEncoding,
314                                   final Collection<Integer> encoding,
315                                   final Iterator<Integer> originals,
316                                   final Map<Integer, String> doNotEncodeMap) {
317
318        if (level > 0) {
319            for (final int encodingLetter : encoding) {
320                if (!originals.hasNext()) {
321                    return; // done encoding all the original alphabet
322                }
323                // this skips the doNotEncode chars if they are in the
324                // leftmost place
325                if (level != encodedLetterLength
326                        || !doNotEncodeMap.containsKey(encodingLetter)) {
327                    addSingleEncoding(level - 1,
328                            currentEncoding
329                                    + codePointToString(encodingLetter),
330                            encoding,
331                            originals,
332                            doNotEncodeMap
333                    );
334                }
335            }
336        } else {
337            Integer next = originals.next();
338
339            while (doNotEncodeMap.containsKey(next)) {
340                final String originalLetterAsString = codePointToString(next);
341
342                originalToEncoded.put(next, originalLetterAsString);
343                encodedToOriginal.put(originalLetterAsString,
344                        originalLetterAsString);
345
346                if (!originals.hasNext()) {
347                    return;
348                }
349
350                next = originals.next();
351            }
352
353            final String originalLetterAsString = codePointToString(next);
354
355            originalToEncoded.put(next, currentEncoding);
356            encodedToOriginal.put(currentEncoding, originalLetterAsString);
357        }
358    }
359
360    /**
361     * Decodes a given string.
362     *
363     * @param encoded a string that has been encoded using this AlphabetConverter.
364     * @return The decoded string, {@code null} if the given string is null.
365     * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered.
366     */
367    public String decode(final String encoded)
368            throws UnsupportedEncodingException {
369        if (encoded == null) {
370            return null;
371        }
372
373        final StringBuilder result = new StringBuilder();
374
375        for (int j = 0; j < encoded.length();) {
376            final int i = encoded.codePointAt(j);
377            final String s = codePointToString(i);
378
379            if (s.equals(originalToEncoded.get(i))) {
380                result.append(s);
381                j++; // because we do not encode in Unicode extended the
382                     // length of each encoded char is 1
383            } else {
384                if (j + encodedLetterLength > encoded.length()) {
385                    throw new UnsupportedEncodingException("Unexpected end "
386                            + "of string while decoding " + encoded);
387                }
388                final String nextGroup = encoded.substring(j,
389                        j + encodedLetterLength);
390                final String next = encodedToOriginal.get(nextGroup);
391                if (next == null) {
392                    throw new UnsupportedEncodingException(
393                            "Unexpected string without decoding ("
394                                    + nextGroup + ") in " + encoded);
395                }
396                result.append(next);
397                j += encodedLetterLength;
398            }
399        }
400
401        return result.toString();
402    }
403
404    /**
405     * Encodes a given string.
406     *
407     * @param original the string to be encoded.
408     * @return The encoded string, {@code null} if the given string is null.
409     * @throws UnsupportedEncodingException if chars that are not supported are encountered.
410     */
411    public String encode(final String original)
412            throws UnsupportedEncodingException {
413        if (original == null) {
414            return null;
415        }
416
417        final StringBuilder sb = new StringBuilder();
418
419        for (int i = 0; i < original.length();) {
420            final int codePoint = original.codePointAt(i);
421
422            final String nextLetter = originalToEncoded.get(codePoint);
423
424            if (nextLetter == null) {
425                throw new UnsupportedEncodingException(
426                        "Couldn't find encoding for '"
427                                + codePointToString(codePoint)
428                                + "' in "
429                                + original
430                );
431            }
432
433            sb.append(nextLetter);
434
435            i += Character.charCount(codePoint);
436        }
437
438        return sb.toString();
439    }
440
441    @Override
442    public boolean equals(final Object obj) {
443        if (obj == null) {
444            return false;
445        }
446        if (obj == this) {
447            return true;
448        }
449        if (!(obj instanceof AlphabetConverter)) {
450            return false;
451        }
452        final AlphabetConverter other = (AlphabetConverter) obj;
453        return originalToEncoded.equals(other.originalToEncoded)
454                && encodedToOriginal.equals(other.encodedToOriginal)
455                && encodedLetterLength == other.encodedLetterLength;
456    }
457
458    /**
459     * Gets the length of characters in the encoded alphabet that are necessary for each character in the original alphabet.
460     *
461     * @return The length of the encoded char.
462     */
463    public int getEncodedCharLength() {
464        return encodedLetterLength;
465    }
466
467    /**
468     * Gets the mapping from integer code point of source language to encoded string. Use to reconstruct converter from serialized map.
469     *
470     * @return The original map.
471     */
472    public Map<Integer, String> getOriginalToEncoded() {
473        return Collections.unmodifiableMap(originalToEncoded);
474    }
475
476    @Override
477    public int hashCode() {
478        return Objects.hash(originalToEncoded,
479                encodedToOriginal,
480                encodedLetterLength);
481    }
482
483    @Override
484    public String toString() {
485        final StringBuilder sb = new StringBuilder();
486        // @formatter:off
487        originalToEncoded.forEach((k, v) ->
488            sb.append(codePointToString(k))
489              .append(ARROW)
490              .append(k)
491              .append(System.lineSeparator()));
492        // @formatter:on
493        return sb.toString();
494    }
495}