Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.io.UnsupportedEncodingException;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.LinkedHashSet;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.Objects;
030import java.util.Set;
031
032import org.apache.commons.lang3.ArrayUtils;
033import org.apache.commons.lang3.StringUtils;
034
/**
 * <p>
 * Converts text from one alphabet to another, with the possibility of leaving
 * certain characters unencoded.
 * </p>
 *
 * <p>
 * The target and 'do not encode' alphabets must be in the Unicode BMP, but the
 * source alphabet need not be.
 * </p>
 *
 * <p>
 * Every encoded letter has the same fixed length, except for the 'do not
 * encode' chars, which are encoded as themselves and therefore have length 1.
 * </p>
 *
 * <h2>Sample usage</h2>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state once the converter has been created.
 * </p>
 *
 * @since 1.0
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Creates a new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }

    /**
     * Converts characters to integers (their UTF-16 code unit values).
     *
     * @param chars array of characters, may be {@code null}
     * @return an equivalent array of integers; an empty array if {@code chars}
     *         is {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        Arrays.setAll(integers, i -> (int) chars[i]);
        return integers;
    }

    /**
     * Copies an alphabet into an insertion-ordered set, dropping duplicates.
     *
     * @param codePoints the alphabet as code points, may be {@code null}
     * @return a new mutable set holding the distinct code points in their
     *         original order; empty if {@code codePoints} is {@code null}
     */
    private static Set<Integer> toOrderedSet(final Integer[] codePoints) {
        if (codePoints == null) {
            return new LinkedHashSet<>();
        }
        return new LinkedHashSet<>(Arrays.asList(codePoints));
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.
     * A {@code null} array is treated as an empty alphabet.</p>
     *
     * <p>If the encoding alphabet is at least as large as the original one,
     * every letter is encoded with exactly one letter. Otherwise a fixed
     * encoded length greater than one is computed and the encodings are
     * generated in the iteration order of the encoding alphabet.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 code points
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in code points
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                   constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        final Set<Integer> originalCopy = toOrderedSet(original);
        final Set<Integer> encodingCopy = toOrderedSet(encoding);
        final Set<Integer> doNotEncodeCopy = toOrderedSet(doNotEncode);

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        final int encodedLetterLength;

        // Every pass-through letter has to exist in both alphabets.
        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // Enough encoding letters for a one-to-one mapping.
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString = codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter, originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // Pass-through letters are reserved for themselves and
                    // must not be handed out as the encoding of another letter.
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

        }

        // Encoding letters that may be used in the leftmost position.
        final int leadingLetterCount = encodingCopy.size() - doNotEncodeCopy.size();
        if (leadingLetterCount < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + leadingLetterCount);
        }
        // we start with one which is our minimum, and because we do the
        // first division outside the loop
        int lettersSoFar = 1;

        // the first division takes into account that the doNotEncode
        // letters can't be in the leftmost place
        int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                / leadingLetterCount;

        while (lettersLeft / encodingCopy.size() >= 1) {
            lettersLeft /= encodingCopy.size();
            lettersSoFar++;
        }

        encodedLetterLength = lettersSoFar + 1;

        // The converter is created around the still-empty maps and then fills
        // them itself; the recursion needs the final encodedLetterLength field.
        final AlphabetConverter ac =
                new AlphabetConverter(originalToEncoded,
                        encodedToOriginal,
                        encodedLetterLength);

        ac.addSingleEncoding(encodedLetterLength,
                "",
                encodingCopy,
                originalCopy.iterator(),
                doNotEncodeMap);

        return ac;
    }

    /**
     * Creates an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.
     * A {@code null} array is treated as an empty alphabet.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return The AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Creates a new converter from a map.
     *
     * <p>The given map is copied, so later changes to it do not affect the
     * returned converter. The encoded letter length is the length of the
     * longest value in the map (at least 1).</p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return The reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
        // Defensive copy: the converter must not share state with the caller.
        // LinkedHashMap keeps the iteration order of the source map.
        final Map<Integer, String> originalToEncodedCopy = new LinkedHashMap<>(originalToEncoded);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e : originalToEncodedCopy.entrySet()) {
            final String encodedLetter = e.getValue();
            encodedToOriginal.put(encodedLetter, codePointToString(e.getKey()));
            encodedLetterLength = Math.max(encodedLetterLength, encodedLetter.length());
        }

        return new AlphabetConverter(originalToEncodedCopy, encodedToOriginal, encodedLetterLength);
    }

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded string back to the original letter (as a string).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length, in chars, of an encoded letter (pass-through letters excepted).
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     * The maps are stored as given, without copying.
     *
     * @param originalToEncoded mapping from original code point to encoded
     *                          string
     * @param encodedToOriginal mapping from encoded string to original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Recursive method used when creating encoder/decoder.
     *
     * <p>Each recursion level appends one encoding letter to
     * {@code currentEncoding}; when {@code level} reaches zero the finished
     * encoding is assigned to the next original letter that is not in
     * {@code doNotEncodeMap}. Original letters found in
     * {@code doNotEncodeMap} along the way are mapped to themselves.</p>
     *
     * @param level number of encoding letters still to append
     * @param currentEncoding the encoding prefix built so far
     * @param encoding letters of the encoding alphabet
     * @param originals iterator over the original letters, shared by all
     *                  recursion levels and advanced as letters get mapped
     * @param doNotEncodeMap map of values that should not be encoded
     */
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (!originals.hasNext()) {
                    return; // done encoding all the original alphabet
                }
                // this skips the doNotEncode chars if they are in the
                // leftmost place
                if (level != encodedLetterLength
                        || !doNotEncodeMap.containsKey(encodingLetter)) {
                    addSingleEncoding(level - 1,
                            currentEncoding
                                    + codePointToString(encodingLetter),
                            encoding,
                            originals,
                            doNotEncodeMap
                    );
                }
            }
        } else {
            Integer next = originals.next();

            // Map pass-through letters to themselves until a letter that
            // actually needs the current encoding is found.
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Decodes a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return The decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int i = encoded.codePointAt(j);
            final String s = codePointToString(i);

            if (s.equals(originalToEncoded.get(i))) {
                // A letter that is encoded as itself.
                result.append(s);
                // One char for BMP letters; two for a supplementary code
                // point that maps to itself, so the index never lands in the
                // middle of a surrogate pair.
                j += Character.charCount(i);
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Encodes a given string.
     *
     * @param original the string to be encoded
     * @return The encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // Iterate by code point so supplementary characters in the source
        // alphabet are looked up as a whole.
        for (int i = 0; i < original.length();) {
            final int codePoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codePoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codePoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codePoint);
        }

        return sb.toString();
    }

    /**
     * Compares this converter with another object. Two converters are equal
     * when both mappings and the encoded letter length are equal.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is an equal AlphabetConverter
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    /**
     * Gets the length of characters in the encoded alphabet that are necessary
     * for each character in the original
     * alphabet.
     *
     * @return The length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Gets the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from
     * serialized map.
     *
     * @return The original map, as an unmodifiable view
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    /**
     * Computes a hash code from both mappings and the encoded letter length,
     * consistent with {@link #equals(Object)}.
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Lists every mapping as {@code original -> encoded}, one per line, in the
     * iteration order of the original-to-encoded map.
     *
     * @return the textual form of the mapping
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        // @formatter:off
        originalToEncoded.forEach((k, v) ->
            sb.append(codePointToString(k))
              .append(ARROW)
              .append(v)
              .append(System.lineSeparator()));
        // @formatter:on
        return sb.toString();
    }
}