AlphabetConverter.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.text;

  18. import java.io.UnsupportedEncodingException;
  19. import java.util.Arrays;
  20. import java.util.Collection;
  21. import java.util.Collections;
  22. import java.util.HashMap;
  23. import java.util.Iterator;
  24. import java.util.LinkedHashMap;
  25. import java.util.LinkedHashSet;
  26. import java.util.Map;
  27. import java.util.Map.Entry;
  28. import java.util.Objects;
  29. import java.util.Set;

  30. /**
  31.  * <p>
  32.  * Convert from one alphabet to another, with the possibility of leaving certain characters unencoded.
  33.  * </p>
  34.  *
  35.  * <p>
  36.  * The target and do not encode languages must be in the Unicode BMP, but the source language does not.
  37.  * </p>
  38.  *
  39.  * <p>
  40.  * The encoding will all be of a fixed length, except for the 'do not encode' chars, which will be of length 1
  41.  * </p>
  42.  *
  43.  * <h3>Sample usage</h3>
  44.  *
  45.  * <pre>
  46.  * Character[] originals; // a, b, c, d
  47.  * Character[] encoding; // 0, 1, d
  48.  * Character[] doNotEncode; // d
  49.  *
  50.  * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, encoding, doNotEncode);
  51.  *
  52.  * ac.encode("a"); // 00
  53.  * ac.encode("b"); // 01
  54.  * ac.encode("c"); // 0d
  55.  * ac.encode("d"); // d
  56.  * ac.encode("abcd"); // 00010dd
  57.  * </pre>
  58.  *
  59.  * <p>
  60.  * #ThreadSafe# AlphabetConverter class methods are threadsafe as they do not change internal state.
  61.  * </p>
  62.  *
  63.  * @since 1.0
  64.  *
  65.  */
  66. public final class AlphabetConverter {

  67.     /**
  68.      * Original string to be encoded.
  69.      */
  70.     private final Map<Integer, String> originalToEncoded;
  71.     /**
  72.      * Encoding alphabet.
  73.      */
  74.     private final Map<String, String> encodedToOriginal;
  75.     /**
  76.      * Length of the encoded letter.
  77.      */
  78.     private final int encodedLetterLength;
  79.     /**
  80.      * Arrow constant, used for converting the object into a string.
  81.      */
  82.     private static final String ARROW = " -> ";
  83.     /**
  84.      * Line separator, used for converting the object into a string.
  85.      */
  86.     private static final String LINE_SEPARATOR = System.getProperty("line.separator");

  87.     /**
  88.      * Hidden constructor for alphabet converter. Used by static helper methods.
  89.      *
  90.      * @param originalToEncoded original string to be encoded
  91.      * @param encodedToOriginal encoding alphabet
  92.      * @param encodedLetterLength length of the encoded letter
  93.      */
  94.     private AlphabetConverter(final Map<Integer, String> originalToEncoded, final Map<String, String> encodedToOriginal,
  95.             final int encodedLetterLength) {

  96.         this.originalToEncoded = originalToEncoded;
  97.         this.encodedToOriginal = encodedToOriginal;
  98.         this.encodedLetterLength = encodedLetterLength;
  99.     }

  100.     /**
  101.      * Encode a given string.
  102.      *
  103.      * @param original the string to be encoded
  104.      * @return the encoded string, {@code null} if the given string is null
  105.      * @throws UnsupportedEncodingException if chars that are not supported are encountered
  106.      */
  107.     public String encode(final String original) throws UnsupportedEncodingException {
  108.         if (original == null) {
  109.             return null;
  110.         }

  111.         final StringBuilder sb = new StringBuilder();

  112.         for (int i = 0; i < original.length();) {
  113.             final int codepoint = original.codePointAt(i);

  114.             final String nextLetter = originalToEncoded.get(codepoint);

  115.             if (nextLetter == null) {
  116.                 throw new UnsupportedEncodingException(
  117.                         "Couldn't find encoding for '" + codePointToString(codepoint) + "' in " + original);
  118.             }

  119.             sb.append(nextLetter);

  120.             i += Character.charCount(codepoint);
  121.         }

  122.         return sb.toString();
  123.     }

  124.     /**
  125.      * Decode a given string.
  126.      *
  127.      * @param encoded a string that has been encoded using this AlphabetConverter
  128.      * @return the decoded string, {@code null} if the given string is null
  129.      * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered
  130.      */
  131.     public String decode(final String encoded) throws UnsupportedEncodingException {
  132.         if (encoded == null) {
  133.             return null;
  134.         }

  135.         final StringBuilder result = new StringBuilder();

  136.         for (int j = 0; j < encoded.length();) {
  137.             final Integer i = encoded.codePointAt(j);
  138.             final String s = codePointToString(i);

  139.             if (s.equals(originalToEncoded.get(i))) {
  140.                 result.append(s);
  141.                 j++; // because we do not encode in Unicode extended the length of each encoded char is 1
  142.             } else {
  143.                 if (j + encodedLetterLength > encoded.length()) {
  144.                     throw new UnsupportedEncodingException("Unexpected end of string while decoding " + encoded);
  145.                 }
  146.                 final String nextGroup = encoded.substring(j, j + encodedLetterLength);
  147.                 final String next = encodedToOriginal.get(nextGroup);
  148.                 if (next == null) {
  149.                     throw new UnsupportedEncodingException(
  150.                             "Unexpected string without decoding (" + nextGroup + ") in " + encoded);
  151.                 }
  152.                 result.append(next);
  153.                 j += encodedLetterLength;
  154.             }
  155.         }

  156.         return result.toString();
  157.     }

  158.     /**
  159.      * Get the length of characters in the encoded alphabet that are necessary for each character in the original
  160.      * alphabet.
  161.      *
  162.      * @return the length of the encoded char
  163.      */
  164.     public int getEncodedCharLength() {
  165.         return encodedLetterLength;
  166.     }

  167.     /**
  168.      * Get the mapping from integer code point of source language to encoded string. Use to reconstruct converter from
  169.      * serialized map.
  170.      *
  171.      * @return the original map
  172.      */
  173.     public Map<Integer, String> getOriginalToEncoded() {
  174.         return Collections.unmodifiableMap(originalToEncoded);
  175.     }

  176.     /**
  177.      * Recursive method used when creating encoder/decoder.
  178.      *
  179.      * @param level at which point it should add a single encoding
  180.      * @param currentEncoding current encoding
  181.      * @param encoding letters encoding
  182.      * @param originals original values
  183.      * @param doNotEncodeMap map of values that should not be encoded
  184.      */
  185.     @SuppressWarnings("PMD")
  186.     private void addSingleEncoding(final int level, final String currentEncoding, final Collection<Integer> encoding,
  187.             final Iterator<Integer> originals, final Map<Integer, String> doNotEncodeMap) {

  188.         if (level > 0) {
  189.             for (final int encodingLetter : encoding) {
  190.                 if (originals.hasNext()) {

  191.                     // this skips the doNotEncode chars if they are in the
  192.                     // leftmost place
  193.                     if (level != encodedLetterLength || !doNotEncodeMap.containsKey(encodingLetter)) {
  194.                         addSingleEncoding(level - 1, currentEncoding + codePointToString(encodingLetter), encoding,
  195.                                 originals, doNotEncodeMap);
  196.                     }
  197.                 } else {
  198.                     return; // done encoding all the original alphabet
  199.                 }
  200.             }
  201.         } else {
  202.             Integer next = originals.next();

  203.             while (doNotEncodeMap.containsKey(next)) {
  204.                 final String originalLetterAsString = codePointToString(next);

  205.                 originalToEncoded.put(next, originalLetterAsString);
  206.                 encodedToOriginal.put(originalLetterAsString, originalLetterAsString);

  207.                 if (!originals.hasNext()) {
  208.                     return;
  209.                 }

  210.                 next = originals.next();
  211.             }

  212.             final String originalLetterAsString = codePointToString(next);

  213.             originalToEncoded.put(next, currentEncoding);
  214.             encodedToOriginal.put(currentEncoding, originalLetterAsString);
  215.         }
  216.     }

  217.     @Override
  218.     public String toString() {
  219.         final StringBuilder sb = new StringBuilder();

  220.         for (final Entry<Integer, String> entry : originalToEncoded.entrySet()) {
  221.             sb.append(codePointToString(entry.getKey())).append(ARROW).append(entry.getValue()).append(LINE_SEPARATOR);
  222.         }

  223.         return sb.toString();
  224.     }

  225.     @Override
  226.     public boolean equals(final Object obj) {
  227.         if (obj == null) {
  228.             return false;
  229.         }
  230.         if (obj == this) {
  231.             return true;
  232.         }
  233.         if (!(obj instanceof AlphabetConverter)) {
  234.             return false;
  235.         }
  236.         final AlphabetConverter other = (AlphabetConverter) obj;
  237.         return originalToEncoded.equals(other.originalToEncoded) && encodedToOriginal.equals(other.encodedToOriginal)
  238.                 && encodedLetterLength == other.encodedLetterLength;
  239.     }

  240.     @Override
  241.     public int hashCode() {
  242.         return Objects.hash(originalToEncoded, encodedToOriginal, encodedLetterLength);
  243.     }

  244.     // -- static methods

  245.     /**
  246.      * Create a new converter from a map.
  247.      *
  248.      * @param originalToEncoded a map returned from getOriginalToEncoded()
  249.      * @return the reconstructed AlphabetConverter
  250.      * @see AlphabetConverter#getOriginalToEncoded()
  251.      */
  252.     public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
  253.         final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded);
  254.         final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
  255.         final Map<Integer, String> doNotEncodeMap = new HashMap<>();

  256.         int encodedLetterLength = 1;

  257.         for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) {
  258.             final String originalAsString = codePointToString(e.getKey());
  259.             encodedToOriginal.put(e.getValue(), originalAsString);

  260.             if (e.getValue().equals(originalAsString)) {
  261.                 doNotEncodeMap.put(e.getKey(), e.getValue());
  262.             }

  263.             if (e.getValue().length() > encodedLetterLength) {
  264.                 encodedLetterLength = e.getValue().length();
  265.             }
  266.         }

  267.         return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength);
  268.     }

  269.     /**
  270.      * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving
  271.      * the characters in <em>doNotEncode</em> as they are (if possible).
  272.      *
  273.      * <p>Duplicate letters in either original or encoding will be ignored.</p>
  274.      *
  275.      * @param original an array of chars representing the original alphabet
  276.      * @param encoding an array of chars representing the alphabet to be used for encoding
  277.      * @param doNotEncode an array of chars to be encoded using the original alphabet - every char here must appear in
  278.      *            both the previous params
  279.      * @return the AlphabetConverter
  280.      * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed
  281.      */
  282.     public static AlphabetConverter createConverterFromChars(final Character[] original, final Character[] encoding,
  283.             final Character[] doNotEncode) {
  284.         return AlphabetConverter.createConverter(convertCharsToIntegers(original), convertCharsToIntegers(encoding),
  285.                 convertCharsToIntegers(doNotEncode));
  286.     }

  287.     /**
  288.      * Convert characters to integers.
  289.      *
  290.      * @param chars array of characters
  291.      * @return an equivalent array of integers
  292.      */
  293.     private static Integer[] convertCharsToIntegers(final Character[] chars) {
  294.         if (chars == null || chars.length == 0) {
  295.             return new Integer[0];
  296.         }
  297.         final Integer[] integers = new Integer[chars.length];
  298.         for (int i = 0; i < chars.length; i++) {
  299.             integers[i] = (int) chars[i];
  300.         }
  301.         return integers;
  302.     }

  303.     /**
  304.      * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving
  305.      * the characters in <em>doNotEncode</em> as they are (if possible).
  306.      *
  307.      * <p>Duplicate letters in either original or encoding will be ignored.</p>
  308.      *
  309.      * @param original an array of ints representing the original alphabet in codepoints
  310.      * @param encoding an array of ints representing the alphabet to be used for encoding, in codepoints
  311.      * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char
  312.      *            here must appear in both the previous params
  313.      * @return the AlphabetConverter
  314.      * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed
  315.      */
  316.     public static AlphabetConverter createConverter(final Integer[] original, final Integer[] encoding, final Integer[] doNotEncode) {

  317.         final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.<Integer> asList(original));
  318.         final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.<Integer> asList(encoding));
  319.         final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.<Integer> asList(doNotEncode));

  320.         final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
  321.         final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
  322.         final Map<Integer, String> doNotEncodeMap = new HashMap<>();

  323.         int encodedLetterLength;

  324.         for (final int i : doNotEncodeCopy) {
  325.             if (!originalCopy.contains(i)) {
  326.                 throw new IllegalArgumentException(
  327.                         "Can not use 'do not encode' list because original alphabet does not contain '"
  328.                                 + codePointToString(i) + "'");
  329.             }

  330.             if (!encodingCopy.contains(i)) {
  331.                 throw new IllegalArgumentException(
  332.                         "Can not use 'do not encode' list because encoding alphabet does not contain '"
  333.                                 + codePointToString(i) + "'");
  334.             }

  335.             doNotEncodeMap.put(i, codePointToString(i));
  336.         }

  337.         if (encodingCopy.size() >= originalCopy.size()) {
  338.             encodedLetterLength = 1;

  339.             final Iterator<Integer> it = encodingCopy.iterator();

  340.             for (final int originalLetter : originalCopy) {
  341.                 final String originalLetterAsString = codePointToString(originalLetter);

  342.                 if (doNotEncodeMap.containsKey(originalLetter)) {
  343.                     originalToEncoded.put(originalLetter, originalLetterAsString);
  344.                     encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
  345.                 } else {
  346.                     Integer next = it.next();

  347.                     while (doNotEncodeCopy.contains(next)) {
  348.                         next = it.next();
  349.                     }

  350.                     final String encodedLetter = codePointToString(next);

  351.                     originalToEncoded.put(originalLetter, encodedLetter);
  352.                     encodedToOriginal.put(encodedLetter, originalLetterAsString);
  353.                 }
  354.             }

  355.             return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

  356.         } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
  357.             throw new IllegalArgumentException(
  358.                     "Must have at least two encoding characters (excluding those in the 'do not encode' list), but has "
  359.                             + (encodingCopy.size() - doNotEncodeCopy.size()));
  360.         } else {
  361.             // we start with one which is our minimum, and because we do the
  362.             // first division outside the loop
  363.             int lettersSoFar = 1;

  364.             // the first division takes into account that the doNotEncode
  365.             // letters can't be in the leftmost place
  366.             int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
  367.                     / (encodingCopy.size() - doNotEncodeCopy.size());

  368.             while (lettersLeft / encodingCopy.size() >= 1) {
  369.                 lettersLeft = lettersLeft / encodingCopy.size();
  370.                 lettersSoFar++;
  371.             }

  372.             encodedLetterLength = lettersSoFar + 1;

  373.             final AlphabetConverter ac = new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);

  374.             ac.addSingleEncoding(encodedLetterLength, "", encodingCopy, originalCopy.iterator(), doNotEncodeMap);

  375.             return ac;
  376.         }
  377.     }

  378.     /**
  379.      * Create new String that contains just the given code point.
  380.      *
  381.      * @param i code point
  382.      * @return a new string with the new code point
  383.      * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
  384.      */
  385.     private static String codePointToString(final int i) {
  386.         if (Character.charCount(i) == 1) {
  387.             return String.valueOf((char) i);
  388.         }
  389.         return new String(Character.toChars(i));
  390.     }
  391. }