AlphabetConverter.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text;

  18. import java.io.UnsupportedEncodingException;
  19. import java.util.Arrays;
  20. import java.util.Collection;
  21. import java.util.Collections;
  22. import java.util.HashMap;
  23. import java.util.Iterator;
  24. import java.util.LinkedHashMap;
  25. import java.util.LinkedHashSet;
  26. import java.util.Map;
  27. import java.util.Map.Entry;
  28. import java.util.Objects;
  29. import java.util.Set;

  30. import org.apache.commons.lang3.ArrayUtils;
  31. import org.apache.commons.lang3.StringUtils;

  /**
   * <p>
   * Convert from one alphabet to another, with the possibility of leaving certain
   * characters unencoded.
   * </p>
   *
   * <p>
   * The target and 'do not encode' alphabets must be in the Unicode BMP, but the
   * source alphabet need not be.
   * </p>
   *
   * <p>
   * All encodings have the same fixed length, except for the 'do not encode'
   * chars, which are always of length 1.
   * </p>
   *
   * <h2>Sample usage</h2>
   *
   * <pre>
   * Character[] originals;   // a, b, c, d
   * Character[] encoding;    // 0, 1, d
   * Character[] doNotEncode; // d
   *
   * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
   * encoding, doNotEncode);
   *
   * ac.encode("a");    // 00
   * ac.encode("b");    // 01
   * ac.encode("c");    // 0d
   * ac.encode("d");    // d
   * ac.encode("abcd"); // 00010dd
   * </pre>
   *
   * <p>
   * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
   * change internal state.
   * </p>
   *
   * @since 1.0
   */
  public final class AlphabetConverter {

      /**
       * Arrow constant, used for converting the object into a string.
       */
      private static final String ARROW = " -> ";

      /**
       * Creates new String that contains just the given code point.
       *
       * @param i code point
       * @return a new string with the new code point
       * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
       */
      private static String codePointToString(final int i) {
          if (Character.charCount(i) == 1) {
              return String.valueOf((char) i);
          }
          return new String(Character.toChars(i));
      }

      /**
       * Converts characters to integers.
       *
       * @param chars array of characters, may be {@code null} or empty
       * @return an equivalent array of integers; an empty array if {@code chars}
       *         is {@code null} or empty
       */
      private static Integer[] convertCharsToIntegers(final Character[] chars) {
          if (chars == null || chars.length == 0) {
              return new Integer[0];
          }
          final Integer[] integers = new Integer[chars.length];
          Arrays.setAll(integers, i -> (int) chars[i]);
          return integers;
      }

      /**
       * Creates an alphabet converter, for converting from the original alphabet,
       * to the encoded alphabet, while leaving
       * the characters in <em>doNotEncode</em> as they are (if possible).
       *
       * <p>Duplicate letters in either original or encoding will be ignored.</p>
       *
       * <p>
       * When the encoding alphabet is at least as large as the original alphabet,
       * every original letter maps to a single encoding letter. Otherwise every
       * encoded letter is a fixed-length group of encoding letters, and at least
       * two encoding letters outside the 'do not encode' list are required.
       * </p>
       *
       * @param original an array of ints representing the original alphabet in
       *                 code points
       * @param encoding an array of ints representing the alphabet to be used for
       *                 encoding, in code points
       * @param doNotEncode an array of ints representing the chars to be encoded
       *                    using the original alphabet - every char
       *                    here must appear in both the previous params
       * @return The AlphabetConverter
       * @throws IllegalArgumentException if an AlphabetConverter cannot be
       *                                   constructed
       */
      public static AlphabetConverter createConverter(
              final Integer[] original,
              final Integer[] encoding,
              final Integer[] doNotEncode) {
          // LinkedHashSet drops duplicates while keeping the caller's ordering,
          // which determines which encoding each original letter receives.
          final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
          final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
          final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

          final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
          final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
          final Map<Integer, String> doNotEncodeMap = new HashMap<>();

          final int encodedLetterLength;

          // Every 'do not encode' letter must be present in both alphabets.
          for (final int i : doNotEncodeCopy) {
              if (!originalCopy.contains(i)) {
                  throw new IllegalArgumentException(
                          "Can not use 'do not encode' list because original "
                                  + "alphabet does not contain '"
                                  + codePointToString(i) + "'");
              }

              if (!encodingCopy.contains(i)) {
                  throw new IllegalArgumentException(
                          "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                  + codePointToString(i) + "'");
              }

              doNotEncodeMap.put(i, codePointToString(i));
          }

          if (encodingCopy.size() >= originalCopy.size()) {
              // Enough encoding letters for a one-to-one mapping.
              encodedLetterLength = 1;

              final Iterator<Integer> it = encodingCopy.iterator();

              for (final int originalLetter : originalCopy) {
                  final String originalLetterAsString = codePointToString(originalLetter);

                  if (doNotEncodeMap.containsKey(originalLetter)) {
                      // 'do not encode' letters map to themselves
                      originalToEncoded.put(originalLetter, originalLetterAsString);
                      encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
                  } else {
                      Integer next = it.next();

                      // skip encoding letters reserved for 'do not encode' letters
                      while (doNotEncodeCopy.contains(next)) {
                          next = it.next();
                      }

                      final String encodedLetter = codePointToString(next);

                      originalToEncoded.put(originalLetter, encodedLetter);
                      encodedToOriginal.put(encodedLetter, originalLetterAsString);
                  }
              }

              return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

          }
          if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
              throw new IllegalArgumentException(
                      "Must have at least two encoding characters (excluding "
                              + "those in the 'do not encode' list), but has "
                              + (encodingCopy.size() - doNotEncodeCopy.size()));
          }
          // we start with one which is our minimum, and because we do the
          // first division outside the loop
          int lettersSoFar = 1;

          // the first division takes into account that the doNotEncode
          // letters can't be in the leftmost place
          int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                  / (encodingCopy.size() - doNotEncodeCopy.size());

          while (lettersLeft / encodingCopy.size() >= 1) {
              lettersLeft /= encodingCopy.size();
              lettersSoFar++;
          }

          encodedLetterLength = lettersSoFar + 1;

          final AlphabetConverter ac =
                  new AlphabetConverter(originalToEncoded,
                          encodedToOriginal,
                          encodedLetterLength);

          // Fills the two (still empty) maps held by the new converter.
          ac.addSingleEncoding(encodedLetterLength,
                  "",
                  encodingCopy,
                  originalCopy.iterator(),
                  doNotEncodeMap);

          return ac;
      }

      /**
       * Creates an alphabet converter, for converting from the original alphabet,
       * to the encoded alphabet, while leaving the characters in
       * <em>doNotEncode</em> as they are (if possible).
       *
       * <p>Duplicate letters in either original or encoding will be ignored.</p>
       *
       * <p>A {@code null} or empty array is treated as an empty alphabet.</p>
       *
       * @param original an array of chars representing the original alphabet
       * @param encoding an array of chars representing the alphabet to be used
       *                 for encoding
       * @param doNotEncode an array of chars to be encoded using the original
       *                    alphabet - every char here must appear in
       *                    both the previous params
       * @return The AlphabetConverter
       * @throws IllegalArgumentException if an AlphabetConverter cannot be
       *                                  constructed
       */
      public static AlphabetConverter createConverterFromChars(
              final Character[] original,
              final Character[] encoding,
              final Character[] doNotEncode) {
          return AlphabetConverter.createConverter(
                  convertCharsToIntegers(original),
                  convertCharsToIntegers(encoding),
                  convertCharsToIntegers(doNotEncode));
      }

      /**
       * Creates a new converter from a map.
       *
       * <p>
       * The given map is copied, so later changes to it do not affect the
       * returned converter. The encoded letter length is the length of the
       * longest value in the map (at least 1).
       * </p>
       *
       * @param originalToEncoded a map returned from getOriginalToEncoded()
       * @return The reconstructed AlphabetConverter
       * @see AlphabetConverter#getOriginalToEncoded()
       */
      public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
          // Defensive copy (iteration order preserved): the reverse map and the
          // letter length are derived once below, so the forward map must not
          // change underneath them afterwards.
          final Map<Integer, String> originalToEncodedCopy =
                  Collections.unmodifiableMap(new LinkedHashMap<>(originalToEncoded));
          final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

          int encodedLetterLength = 1;

          for (final Entry<Integer, String> e : originalToEncodedCopy.entrySet()) {
              encodedToOriginal.put(e.getValue(), codePointToString(e.getKey()));

              if (e.getValue().length() > encodedLetterLength) {
                  encodedLetterLength = e.getValue().length();
              }
          }

          return new AlphabetConverter(originalToEncodedCopy, encodedToOriginal, encodedLetterLength);
      }

      /**
       * Maps each code point of the original alphabet to its encoded string.
       */
      private final Map<Integer, String> originalToEncoded;

      /**
       * Maps each encoded string back to the original letter it stands for.
       */
      private final Map<String, String> encodedToOriginal;

      /**
       * Length of the encoded letter.
       */
      private final int encodedLetterLength;

      /**
       * Hidden constructor for alphabet converter. Used by static helper methods.
       * The maps are stored as given, without copying.
       *
       * @param originalToEncoded map from original code point to encoded string
       * @param encodedToOriginal map from encoded string to original letter
       * @param encodedLetterLength length of the encoded letter
       */
      private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                                final Map<String, String> encodedToOriginal,
                                final int encodedLetterLength) {

          this.originalToEncoded = originalToEncoded;
          this.encodedToOriginal = encodedToOriginal;
          this.encodedLetterLength = encodedLetterLength;
      }

      /**
       * Recursive method used when creating encoder/decoder.
       *
       * <p>
       * Each recursion level appends one encoding letter to
       * {@code currentEncoding}; at level 0 the finished group is assigned to
       * the next original letter and both maps of this converter are updated.
       * </p>
       *
       * @param level number of encoding letters still to append; 0 means the
       *              current encoding is complete
       * @param currentEncoding current encoding
       * @param encoding letters encoding
       * @param originals original values
       * @param doNotEncodeMap map of values that should not be encoded
       */
      private void addSingleEncoding(final int level,
                                     final String currentEncoding,
                                     final Collection<Integer> encoding,
                                     final Iterator<Integer> originals,
                                     final Map<Integer, String> doNotEncodeMap) {

          if (level > 0) {
              for (final int encodingLetter : encoding) {
                  if (!originals.hasNext()) {
                      return; // done encoding all the original alphabet
                  }
                  // this skips the doNotEncode chars if they are in the
                  // leftmost place
                  if (level != encodedLetterLength
                          || !doNotEncodeMap.containsKey(encodingLetter)) {
                      addSingleEncoding(level - 1,
                              currentEncoding
                                      + codePointToString(encodingLetter),
                              encoding,
                              originals,
                              doNotEncodeMap
                      );
                  }
              }
          } else {
              Integer next = originals.next();

              // 'do not encode' letters map to themselves and do not consume
              // the current encoding
              while (doNotEncodeMap.containsKey(next)) {
                  final String originalLetterAsString = codePointToString(next);

                  originalToEncoded.put(next, originalLetterAsString);
                  encodedToOriginal.put(originalLetterAsString,
                          originalLetterAsString);

                  if (!originals.hasNext()) {
                      return;
                  }

                  next = originals.next();
              }

              final String originalLetterAsString = codePointToString(next);

              originalToEncoded.put(next, currentEncoding);
              encodedToOriginal.put(currentEncoding, originalLetterAsString);
          }
      }

      /**
       * Decodes a given string.
       *
       * @param encoded a string that has been encoded using this
       *                AlphabetConverter
       * @return The decoded string, {@code null} if the given string is null
       * @throws UnsupportedEncodingException if unexpected characters that
       *                                      cannot be handled are encountered
       */
      public String decode(final String encoded)
              throws UnsupportedEncodingException {
          if (encoded == null) {
              return null;
          }

          final StringBuilder result = new StringBuilder();

          for (int j = 0; j < encoded.length();) {
              final int i = encoded.codePointAt(j);
              final String s = codePointToString(i);

              // a letter that encodes to itself is copied through unchanged
              if (s.equals(originalToEncoded.get(i))) {
                  result.append(s);
                  j++; // because we do not encode in Unicode extended the
                       // length of each encoded char is 1
              } else {
                  if (j + encodedLetterLength > encoded.length()) {
                      throw new UnsupportedEncodingException("Unexpected end "
                              + "of string while decoding " + encoded);
                  }
                  final String nextGroup = encoded.substring(j,
                          j + encodedLetterLength);
                  final String next = encodedToOriginal.get(nextGroup);
                  if (next == null) {
                      throw new UnsupportedEncodingException(
                              "Unexpected string without decoding ("
                                      + nextGroup + ") in " + encoded);
                  }
                  result.append(next);
                  j += encodedLetterLength;
              }
          }

          return result.toString();
      }

      /**
       * Encodes a given string.
       *
       * @param original the string to be encoded
       * @return The encoded string, {@code null} if the given string is null
       * @throws UnsupportedEncodingException if chars that are not supported are
       *                                      encountered
       */
      public String encode(final String original)
              throws UnsupportedEncodingException {
          if (original == null) {
              return null;
          }

          final StringBuilder sb = new StringBuilder();

          for (int i = 0; i < original.length();) {
              final int codePoint = original.codePointAt(i);

              final String nextLetter = originalToEncoded.get(codePoint);

              if (nextLetter == null) {
                  throw new UnsupportedEncodingException(
                          "Couldn't find encoding for '"
                                  + codePointToString(codePoint)
                                  + "' in "
                                  + original
                  );
              }

              sb.append(nextLetter);

              // advance by one or two chars, depending on the code point
              i += Character.charCount(codePoint);
          }

          return sb.toString();
      }

      /**
       * Compares this converter with another object. Two converters are equal
       * when both of their mappings and their encoded letter lengths are equal.
       *
       * @param obj the object to compare with
       * @return {@code true} if {@code obj} is an equal AlphabetConverter
       */
      @Override
      public boolean equals(final Object obj) {
          if (obj == null) {
              return false;
          }
          if (obj == this) {
              return true;
          }
          if (!(obj instanceof AlphabetConverter)) {
              return false;
          }
          final AlphabetConverter other = (AlphabetConverter) obj;
          return originalToEncoded.equals(other.originalToEncoded)
                  && encodedToOriginal.equals(other.encodedToOriginal)
                  && encodedLetterLength == other.encodedLetterLength;
      }

      /**
       * Gets the length of characters in the encoded alphabet that are necessary
       * for each character in the original
       * alphabet.
       *
       * @return The length of the encoded char
       */
      public int getEncodedCharLength() {
          return encodedLetterLength;
      }

      /**
       * Gets the mapping from integer code point of source language to encoded
       * string. Use to reconstruct converter from
       * serialized map.
       *
       * @return The original map, as an unmodifiable view
       */
      public Map<Integer, String> getOriginalToEncoded() {
          return Collections.unmodifiableMap(originalToEncoded);
      }

      /**
       * Computes a hash code from both mappings and the encoded letter length,
       * the same state that {@link #equals(Object)} compares.
       *
       * @return the hash code
       */
      @Override
      public int hashCode() {
          return Objects.hash(originalToEncoded,
                  encodedToOriginal,
                  encodedLetterLength);
      }

      /**
       * Builds a textual listing of the conversion table: one line per original
       * letter, in the form {@code original -> encoded}, each terminated by the
       * system line separator.
       *
       * @return the conversion table as a string
       */
      @Override
      public String toString() {
          final StringBuilder sb = new StringBuilder();
          // @formatter:off
          originalToEncoded.forEach((k, v) ->
              sb.append(codePointToString(k))
                .append(ARROW)
                .append(v) // the encoded string, not the numeric code point
                .append(System.lineSeparator()));
          // @formatter:on
          return sb.toString();
      }
  }