Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.io.UnsupportedEncodingException;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.LinkedHashSet;
027import java.util.Map;
028import java.util.Map.Entry;
029import java.util.Objects;
030import java.util.Set;
031
032/**
033 * <p>
034 * Convert from one alphabet to another, with the possibility of leaving certain characters unencoded.
035 * </p>
036 *
037 * <p>
038 * The target and do not encode languages must be in the Unicode BMP, but the source language does not.
039 * </p>
040 *
041 * <p>
042 * The encoding will all be of a fixed length, except for the 'do not encode' chars, which will be of length 1
043 * </p>
044 *
045 * <h3>Sample usage</h3>
046 *
047 * <pre>
048 * Character[] originals; // a, b, c, d
049 * Character[] encoding; // 0, 1, d
050 * Character[] doNotEncode; // d
051 *
052 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, encoding, doNotEncode);
053 *
054 * ac.encode("a"); // 00
055 * ac.encode("b"); // 01
056 * ac.encode("c"); // 0d
057 * ac.encode("d"); // d
058 * ac.encode("abcd"); // 00010dd
059 * </pre>
060 *
061 * <p>
062 * #ThreadSafe# AlphabetConverter class methods are threadsafe as they do not change internal state.
063 * </p>
064 *
065 * @since 1.0
066 *
067 */
068public final class AlphabetConverter {
069
070    /**
071     * Original string to be encoded.
072     */
073    private final Map<Integer, String> originalToEncoded;
074    /**
075     * Encoding alphabet.
076     */
077    private final Map<String, String> encodedToOriginal;
078    /**
079     * Length of the encoded letter.
080     */
081    private final int encodedLetterLength;
082    /**
083     * Arrow constant, used for converting the object into a string.
084     */
085    private static final String ARROW = " -> ";
086    /**
087     * Line separator, used for converting the object into a string.
088     */
089    private static final String LINE_SEPARATOR = System.getProperty("line.separator");
090
091    /**
092     * Hidden constructor for alphabet converter. Used by static helper methods.
093     *
094     * @param originalToEncoded original string to be encoded
095     * @param encodedToOriginal encoding alphabet
096     * @param encodedLetterLength length of the encoded letter
097     */
098    private AlphabetConverter(final Map<Integer, String> originalToEncoded, final Map<String, String> encodedToOriginal,
099            final int encodedLetterLength) {
100
101        this.originalToEncoded = originalToEncoded;
102        this.encodedToOriginal = encodedToOriginal;
103        this.encodedLetterLength = encodedLetterLength;
104    }
105
106    /**
107     * Encode a given string.
108     *
109     * @param original the string to be encoded
110     * @return the encoded string, {@code null} if the given string is null
111     * @throws UnsupportedEncodingException if chars that are not supported are encountered
112     */
113    public String encode(final String original) throws UnsupportedEncodingException {
114        if (original == null) {
115            return null;
116        }
117
118        final StringBuilder sb = new StringBuilder();
119
120        for (int i = 0; i < original.length();) {
121            final int codepoint = original.codePointAt(i);
122
123            final String nextLetter = originalToEncoded.get(codepoint);
124
125            if (nextLetter == null) {
126                throw new UnsupportedEncodingException(
127                        "Couldn't find encoding for '" + codePointToString(codepoint) + "' in " + original);
128            }
129
130            sb.append(nextLetter);
131
132            i += Character.charCount(codepoint);
133        }
134
135        return sb.toString();
136    }
137
138    /**
139     * Decode a given string.
140     *
141     * @param encoded a string that has been encoded using this AlphabetConverter
142     * @return the decoded string, {@code null} if the given string is null
143     * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered
144     */
145    public String decode(final String encoded) throws UnsupportedEncodingException {
146        if (encoded == null) {
147            return null;
148        }
149
150        final StringBuilder result = new StringBuilder();
151
152        for (int j = 0; j < encoded.length();) {
153            final Integer i = encoded.codePointAt(j);
154            final String s = codePointToString(i);
155
156            if (s.equals(originalToEncoded.get(i))) {
157                result.append(s);
158                j++; // because we do not encode in Unicode extended the length of each encoded char is 1
159            } else {
160                if (j + encodedLetterLength > encoded.length()) {
161                    throw new UnsupportedEncodingException("Unexpected end of string while decoding " + encoded);
162                }
163                final String nextGroup = encoded.substring(j, j + encodedLetterLength);
164                final String next = encodedToOriginal.get(nextGroup);
165                if (next == null) {
166                    throw new UnsupportedEncodingException(
167                            "Unexpected string without decoding (" + nextGroup + ") in " + encoded);
168                }
169                result.append(next);
170                j += encodedLetterLength;
171            }
172        }
173
174        return result.toString();
175    }
176
177    /**
178     * Get the length of characters in the encoded alphabet that are necessary for each character in the original
179     * alphabet.
180     *
181     * @return the length of the encoded char
182     */
183    public int getEncodedCharLength() {
184        return encodedLetterLength;
185    }
186
187    /**
188     * Get the mapping from integer code point of source language to encoded string. Use to reconstruct converter from
189     * serialized map.
190     *
191     * @return the original map
192     */
193    public Map<Integer, String> getOriginalToEncoded() {
194        return Collections.unmodifiableMap(originalToEncoded);
195    }
196
197    /**
198     * Recursive method used when creating encoder/decoder.
199     *
200     * @param level at which point it should add a single encoding
201     * @param currentEncoding current encoding
202     * @param encoding letters encoding
203     * @param originals original values
204     * @param doNotEncodeMap map of values that should not be encoded
205     */
206    @SuppressWarnings("PMD")
207    private void addSingleEncoding(final int level, final String currentEncoding, final Collection<Integer> encoding,
208            final Iterator<Integer> originals, final Map<Integer, String> doNotEncodeMap) {
209
210        if (level > 0) {
211            for (final int encodingLetter : encoding) {
212                if (originals.hasNext()) {
213
214                    // this skips the doNotEncode chars if they are in the
215                    // leftmost place
216                    if (level != encodedLetterLength || !doNotEncodeMap.containsKey(encodingLetter)) {
217                        addSingleEncoding(level - 1, currentEncoding + codePointToString(encodingLetter), encoding,
218                                originals, doNotEncodeMap);
219                    }
220                } else {
221                    return; // done encoding all the original alphabet
222                }
223            }
224        } else {
225            Integer next = originals.next();
226
227            while (doNotEncodeMap.containsKey(next)) {
228                final String originalLetterAsString = codePointToString(next);
229
230                originalToEncoded.put(next, originalLetterAsString);
231                encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
232
233                if (!originals.hasNext()) {
234                    return;
235                }
236
237                next = originals.next();
238            }
239
240            final String originalLetterAsString = codePointToString(next);
241
242            originalToEncoded.put(next, currentEncoding);
243            encodedToOriginal.put(currentEncoding, originalLetterAsString);
244        }
245    }
246
247    @Override
248    public String toString() {
249        final StringBuilder sb = new StringBuilder();
250
251        for (final Entry<Integer, String> entry : originalToEncoded.entrySet()) {
252            sb.append(codePointToString(entry.getKey())).append(ARROW).append(entry.getValue()).append(LINE_SEPARATOR);
253        }
254
255        return sb.toString();
256    }
257
258    @Override
259    public boolean equals(final Object obj) {
260        if (obj == null) {
261            return false;
262        }
263        if (obj == this) {
264            return true;
265        }
266        if (!(obj instanceof AlphabetConverter)) {
267            return false;
268        }
269        final AlphabetConverter other = (AlphabetConverter) obj;
270        return originalToEncoded.equals(other.originalToEncoded) && encodedToOriginal.equals(other.encodedToOriginal)
271                && encodedLetterLength == other.encodedLetterLength;
272    }
273
274    @Override
275    public int hashCode() {
276        return Objects.hash(originalToEncoded, encodedToOriginal, encodedLetterLength);
277    }
278
279    // -- static methods
280
281    /**
282     * Create a new converter from a map.
283     *
284     * @param originalToEncoded a map returned from getOriginalToEncoded()
285     * @return the reconstructed AlphabetConverter
286     * @see AlphabetConverter#getOriginalToEncoded()
287     */
288    public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) {
289        final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded);
290        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
291        final Map<Integer, String> doNotEncodeMap = new HashMap<>();
292
293        int encodedLetterLength = 1;
294
295        for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) {
296            final String originalAsString = codePointToString(e.getKey());
297            encodedToOriginal.put(e.getValue(), originalAsString);
298
299            if (e.getValue().equals(originalAsString)) {
300                doNotEncodeMap.put(e.getKey(), e.getValue());
301            }
302
303            if (e.getValue().length() > encodedLetterLength) {
304                encodedLetterLength = e.getValue().length();
305            }
306        }
307
308        return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength);
309    }
310
311    /**
312     * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving
313     * the characters in <em>doNotEncode</em> as they are (if possible).
314     *
315     * <p>Duplicate letters in either original or encoding will be ignored.</p>
316     *
317     * @param original an array of chars representing the original alphabet
318     * @param encoding an array of chars representing the alphabet to be used for encoding
319     * @param doNotEncode an array of chars to be encoded using the original alphabet - every char here must appear in
320     *            both the previous params
321     * @return the AlphabetConverter
322     * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed
323     */
324    public static AlphabetConverter createConverterFromChars(final Character[] original, final Character[] encoding,
325            final Character[] doNotEncode) {
326        return AlphabetConverter.createConverter(convertCharsToIntegers(original), convertCharsToIntegers(encoding),
327                convertCharsToIntegers(doNotEncode));
328    }
329
330    /**
331     * Convert characters to integers.
332     *
333     * @param chars array of characters
334     * @return an equivalent array of integers
335     */
336    private static Integer[] convertCharsToIntegers(final Character[] chars) {
337        if (chars == null || chars.length == 0) {
338            return new Integer[0];
339        }
340        final Integer[] integers = new Integer[chars.length];
341        for (int i = 0; i < chars.length; i++) {
342            integers[i] = (int) chars[i];
343        }
344        return integers;
345    }
346
347    /**
348     * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving
349     * the characters in <em>doNotEncode</em> as they are (if possible).
350     *
351     * <p>Duplicate letters in either original or encoding will be ignored.</p>
352     *
353     * @param original an array of ints representing the original alphabet in codepoints
354     * @param encoding an array of ints representing the alphabet to be used for encoding, in codepoints
355     * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char
356     *            here must appear in both the previous params
357     * @return the AlphabetConverter
358     * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed
359     */
360    public static AlphabetConverter createConverter(final Integer[] original, final Integer[] encoding, final Integer[] doNotEncode) {
361
362        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.<Integer> asList(original));
363        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.<Integer> asList(encoding));
364        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.<Integer> asList(doNotEncode));
365
366        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
367        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
368        final Map<Integer, String> doNotEncodeMap = new HashMap<>();
369
370        int encodedLetterLength;
371
372        for (final int i : doNotEncodeCopy) {
373            if (!originalCopy.contains(i)) {
374                throw new IllegalArgumentException(
375                        "Can not use 'do not encode' list because original alphabet does not contain '"
376                                + codePointToString(i) + "'");
377            }
378
379            if (!encodingCopy.contains(i)) {
380                throw new IllegalArgumentException(
381                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
382                                + codePointToString(i) + "'");
383            }
384
385            doNotEncodeMap.put(i, codePointToString(i));
386        }
387
388        if (encodingCopy.size() >= originalCopy.size()) {
389            encodedLetterLength = 1;
390
391            final Iterator<Integer> it = encodingCopy.iterator();
392
393            for (final int originalLetter : originalCopy) {
394                final String originalLetterAsString = codePointToString(originalLetter);
395
396                if (doNotEncodeMap.containsKey(originalLetter)) {
397                    originalToEncoded.put(originalLetter, originalLetterAsString);
398                    encodedToOriginal.put(originalLetterAsString, originalLetterAsString);
399                } else {
400                    Integer next = it.next();
401
402                    while (doNotEncodeCopy.contains(next)) {
403                        next = it.next();
404                    }
405
406                    final String encodedLetter = codePointToString(next);
407
408                    originalToEncoded.put(originalLetter, encodedLetter);
409                    encodedToOriginal.put(encodedLetter, originalLetterAsString);
410                }
411            }
412
413            return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);
414
415        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
416            throw new IllegalArgumentException(
417                    "Must have at least two encoding characters (excluding those in the 'do not encode' list), but has "
418                            + (encodingCopy.size() - doNotEncodeCopy.size()));
419        } else {
420            // we start with one which is our minimum, and because we do the
421            // first division outside the loop
422            int lettersSoFar = 1;
423
424            // the first division takes into account that the doNotEncode
425            // letters can't be in the leftmost place
426            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
427                    / (encodingCopy.size() - doNotEncodeCopy.size());
428
429            while (lettersLeft / encodingCopy.size() >= 1) {
430                lettersLeft = lettersLeft / encodingCopy.size();
431                lettersSoFar++;
432            }
433
434            encodedLetterLength = lettersSoFar + 1;
435
436            final AlphabetConverter ac = new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength);
437
438            ac.addSingleEncoding(encodedLetterLength, "", encodingCopy, originalCopy.iterator(), doNotEncodeMap);
439
440            return ac;
441        }
442    }
443
444    /**
445     * Create new String that contains just the given code point.
446     *
447     * @param i code point
448     * @return a new string with the new code point
449     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
450     */
451    private static String codePointToString(final int i) {
452        if (Character.charCount(i) == 1) {
453            return String.valueOf((char) i);
454        }
455        return new String(Character.toChars(i));
456    }
457}