001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.io.UnsupportedEncodingException; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedHashMap; 026import java.util.LinkedHashSet; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.Objects; 030import java.util.Set; 031 032/** 033 * <p> 034 * Convert from one alphabet to another, with the possibility of leaving certain characters unencoded. 035 * </p> 036 * 037 * <p> 038 * The target and do not encode languages must be in the Unicode BMP, but the source language does not. 039 * </p> 040 * 041 * <p> 042 * The encoding will all be of a fixed length, except for the 'do not encode' chars, which will be of length 1 043 * </p> 044 * 045 * <h3>Sample usage</h3> 046 * 047 * <pre> 048 * Character[] originals; // a, b, c, d 049 * Character[] encoding; // 0, 1, d 050 * Character[] doNotEncode; // d 051 * 052 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, encoding, doNotEncode); 053 * 054 * ac.encode("a"); // 00 055 * ac.encode("b"); // 01 056 * ac.encode("c"); // 0d 057 * ac.encode("d"); // d 058 * ac.encode("abcd"); // 00010dd 059 * </pre> 060 * 061 * <p> 062 * #ThreadSafe# AlphabetConverter class methods are threadsafe as they do not change internal state. 063 * </p> 064 * 065 * @since 1.0 066 * 067 */ 068public final class AlphabetConverter { 069 070 /** 071 * Original string to be encoded. 072 */ 073 private final Map<Integer, String> originalToEncoded; 074 /** 075 * Encoding alphabet. 076 */ 077 private final Map<String, String> encodedToOriginal; 078 /** 079 * Length of the encoded letter. 080 */ 081 private final int encodedLetterLength; 082 /** 083 * Arrow constant, used for converting the object into a string. 084 */ 085 private static final String ARROW = " -> "; 086 /** 087 * Line separator, used for converting the object into a string. 088 */ 089 private static final String LINE_SEPARATOR = System.getProperty("line.separator"); 090 091 /** 092 * Hidden constructor for alphabet converter. Used by static helper methods. 093 * 094 * @param originalToEncoded original string to be encoded 095 * @param encodedToOriginal encoding alphabet 096 * @param encodedLetterLength length of the encoded letter 097 */ 098 private AlphabetConverter(final Map<Integer, String> originalToEncoded, final Map<String, String> encodedToOriginal, 099 final int encodedLetterLength) { 100 101 this.originalToEncoded = originalToEncoded; 102 this.encodedToOriginal = encodedToOriginal; 103 this.encodedLetterLength = encodedLetterLength; 104 } 105 106 /** 107 * Encode a given string. 108 * 109 * @param original the string to be encoded 110 * @return the encoded string, {@code null} if the given string is null 111 * @throws UnsupportedEncodingException if chars that are not supported are encountered 112 */ 113 public String encode(final String original) throws UnsupportedEncodingException { 114 if (original == null) { 115 return null; 116 } 117 118 final StringBuilder sb = new StringBuilder(); 119 120 for (int i = 0; i < original.length();) { 121 final int codepoint = original.codePointAt(i); 122 123 final String nextLetter = originalToEncoded.get(codepoint); 124 125 if (nextLetter == null) { 126 throw new UnsupportedEncodingException( 127 "Couldn't find encoding for '" + codePointToString(codepoint) + "' in " + original); 128 } 129 130 sb.append(nextLetter); 131 132 i += Character.charCount(codepoint); 133 } 134 135 return sb.toString(); 136 } 137 138 /** 139 * Decode a given string. 140 * 141 * @param encoded a string that has been encoded using this AlphabetConverter 142 * @return the decoded string, {@code null} if the given string is null 143 * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered 144 */ 145 public String decode(final String encoded) throws UnsupportedEncodingException { 146 if (encoded == null) { 147 return null; 148 } 149 150 final StringBuilder result = new StringBuilder(); 151 152 for (int j = 0; j < encoded.length();) { 153 final Integer i = encoded.codePointAt(j); 154 final String s = codePointToString(i); 155 156 if (s.equals(originalToEncoded.get(i))) { 157 result.append(s); 158 j++; // because we do not encode in Unicode extended the length of each encoded char is 1 159 } else { 160 if (j + encodedLetterLength > encoded.length()) { 161 throw new UnsupportedEncodingException("Unexpected end of string while decoding " + encoded); 162 } 163 final String nextGroup = encoded.substring(j, j + encodedLetterLength); 164 final String next = encodedToOriginal.get(nextGroup); 165 if (next == null) { 166 throw new UnsupportedEncodingException( 167 "Unexpected string without decoding (" + nextGroup + ") in " + encoded); 168 } 169 result.append(next); 170 j += encodedLetterLength; 171 } 172 } 173 174 return result.toString(); 175 } 176 177 /** 178 * Get the length of characters in the encoded alphabet that are necessary for each character in the original 179 * alphabet. 180 * 181 * @return the length of the encoded char 182 */ 183 public int getEncodedCharLength() { 184 return encodedLetterLength; 185 } 186 187 /** 188 * Get the mapping from integer code point of source language to encoded string. Use to reconstruct converter from 189 * serialized map. 190 * 191 * @return the original map 192 */ 193 public Map<Integer, String> getOriginalToEncoded() { 194 return Collections.unmodifiableMap(originalToEncoded); 195 } 196 197 /** 198 * Recursive method used when creating encoder/decoder. 199 * 200 * @param level at which point it should add a single encoding 201 * @param currentEncoding current encoding 202 * @param encoding letters encoding 203 * @param originals original values 204 * @param doNotEncodeMap map of values that should not be encoded 205 */ 206 @SuppressWarnings("PMD") 207 private void addSingleEncoding(final int level, final String currentEncoding, final Collection<Integer> encoding, 208 final Iterator<Integer> originals, final Map<Integer, String> doNotEncodeMap) { 209 210 if (level > 0) { 211 for (final int encodingLetter : encoding) { 212 if (originals.hasNext()) { 213 214 // this skips the doNotEncode chars if they are in the 215 // leftmost place 216 if (level != encodedLetterLength || !doNotEncodeMap.containsKey(encodingLetter)) { 217 addSingleEncoding(level - 1, currentEncoding + codePointToString(encodingLetter), encoding, 218 originals, doNotEncodeMap); 219 } 220 } else { 221 return; // done encoding all the original alphabet 222 } 223 } 224 } else { 225 Integer next = originals.next(); 226 227 while (doNotEncodeMap.containsKey(next)) { 228 final String originalLetterAsString = codePointToString(next); 229 230 originalToEncoded.put(next, originalLetterAsString); 231 encodedToOriginal.put(originalLetterAsString, originalLetterAsString); 232 233 if (!originals.hasNext()) { 234 return; 235 } 236 237 next = originals.next(); 238 } 239 240 final String originalLetterAsString = codePointToString(next); 241 242 originalToEncoded.put(next, currentEncoding); 243 encodedToOriginal.put(currentEncoding, originalLetterAsString); 244 } 245 } 246 247 @Override 248 public String toString() { 249 final StringBuilder sb = new StringBuilder(); 250 251 for (final Entry<Integer, String> entry : originalToEncoded.entrySet()) { 252 sb.append(codePointToString(entry.getKey())).append(ARROW).append(entry.getValue()).append(LINE_SEPARATOR); 253 } 254 255 return sb.toString(); 256 } 257 258 @Override 259 public boolean equals(final Object obj) { 260 if (obj == null) { 261 return false; 262 } 263 if (obj == this) { 264 return true; 265 } 266 if (!(obj instanceof AlphabetConverter)) { 267 return false; 268 } 269 final AlphabetConverter other = (AlphabetConverter) obj; 270 return originalToEncoded.equals(other.originalToEncoded) && encodedToOriginal.equals(other.encodedToOriginal) 271 && encodedLetterLength == other.encodedLetterLength; 272 } 273 274 @Override 275 public int hashCode() { 276 return Objects.hash(originalToEncoded, encodedToOriginal, encodedLetterLength); 277 } 278 279 // -- static methods 280 281 /** 282 * Create a new converter from a map. 283 * 284 * @param originalToEncoded a map returned from getOriginalToEncoded() 285 * @return the reconstructed AlphabetConverter 286 * @see AlphabetConverter#getOriginalToEncoded() 287 */ 288 public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) { 289 final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded); 290 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 291 final Map<Integer, String> doNotEncodeMap = new HashMap<>(); 292 293 int encodedLetterLength = 1; 294 295 for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) { 296 final String originalAsString = codePointToString(e.getKey()); 297 encodedToOriginal.put(e.getValue(), originalAsString); 298 299 if (e.getValue().equals(originalAsString)) { 300 doNotEncodeMap.put(e.getKey(), e.getValue()); 301 } 302 303 if (e.getValue().length() > encodedLetterLength) { 304 encodedLetterLength = e.getValue().length(); 305 } 306 } 307 308 return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength); 309 } 310 311 /** 312 * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving 313 * the characters in <em>doNotEncode</em> as they are (if possible). 314 * 315 * <p>Duplicate letters in either original or encoding will be ignored.</p> 316 * 317 * @param original an array of chars representing the original alphabet 318 * @param encoding an array of chars representing the alphabet to be used for encoding 319 * @param doNotEncode an array of chars to be encoded using the original alphabet - every char here must appear in 320 * both the previous params 321 * @return the AlphabetConverter 322 * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed 323 */ 324 public static AlphabetConverter createConverterFromChars(final Character[] original, final Character[] encoding, 325 final Character[] doNotEncode) { 326 return AlphabetConverter.createConverter(convertCharsToIntegers(original), convertCharsToIntegers(encoding), 327 convertCharsToIntegers(doNotEncode)); 328 } 329 330 /** 331 * Convert characters to integers. 332 * 333 * @param chars array of characters 334 * @return an equivalent array of integers 335 */ 336 private static Integer[] convertCharsToIntegers(final Character[] chars) { 337 if (chars == null || chars.length == 0) { 338 return new Integer[0]; 339 } 340 final Integer[] integers = new Integer[chars.length]; 341 for (int i = 0; i < chars.length; i++) { 342 integers[i] = (int) chars[i]; 343 } 344 return integers; 345 } 346 347 /** 348 * Create an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving 349 * the characters in <em>doNotEncode</em> as they are (if possible). 350 * 351 * <p>Duplicate letters in either original or encoding will be ignored.</p> 352 * 353 * @param original an array of ints representing the original alphabet in codepoints 354 * @param encoding an array of ints representing the alphabet to be used for encoding, in codepoints 355 * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char 356 * here must appear in both the previous params 357 * @return the AlphabetConverter 358 * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed 359 */ 360 public static AlphabetConverter createConverter(final Integer[] original, final Integer[] encoding, final Integer[] doNotEncode) { 361 362 final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.<Integer> asList(original)); 363 final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.<Integer> asList(encoding)); 364 final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.<Integer> asList(doNotEncode)); 365 366 final Map<Integer, String> originalToEncoded = new LinkedHashMap<>(); 367 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 368 final Map<Integer, String> doNotEncodeMap = new HashMap<>(); 369 370 int encodedLetterLength; 371 372 for (final int i : doNotEncodeCopy) { 373 if (!originalCopy.contains(i)) { 374 throw new IllegalArgumentException( 375 "Can not use 'do not encode' list because original alphabet does not contain '" 376 + codePointToString(i) + "'"); 377 } 378 379 if (!encodingCopy.contains(i)) { 380 throw new IllegalArgumentException( 381 "Can not use 'do not encode' list because encoding alphabet does not contain '" 382 + codePointToString(i) + "'"); 383 } 384 385 doNotEncodeMap.put(i, codePointToString(i)); 386 } 387 388 if (encodingCopy.size() >= originalCopy.size()) { 389 encodedLetterLength = 1; 390 391 final Iterator<Integer> it = encodingCopy.iterator(); 392 393 for (final int originalLetter : originalCopy) { 394 final String originalLetterAsString = codePointToString(originalLetter); 395 396 if (doNotEncodeMap.containsKey(originalLetter)) { 397 originalToEncoded.put(originalLetter, originalLetterAsString); 398 encodedToOriginal.put(originalLetterAsString, originalLetterAsString); 399 } else { 400 Integer next = it.next(); 401 402 while (doNotEncodeCopy.contains(next)) { 403 next = it.next(); 404 } 405 406 final String encodedLetter = codePointToString(next); 407 408 originalToEncoded.put(originalLetter, encodedLetter); 409 encodedToOriginal.put(encodedLetter, originalLetterAsString); 410 } 411 } 412 413 return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength); 414 415 } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) { 416 throw new IllegalArgumentException( 417 "Must have at least two encoding characters (excluding those in the 'do not encode' list), but has " 418 + (encodingCopy.size() - doNotEncodeCopy.size())); 419 } else { 420 // we start with one which is our minimum, and because we do the 421 // first division outside the loop 422 int lettersSoFar = 1; 423 424 // the first division takes into account that the doNotEncode 425 // letters can't be in the leftmost place 426 int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size()) 427 / (encodingCopy.size() - doNotEncodeCopy.size()); 428 429 while (lettersLeft / encodingCopy.size() >= 1) { 430 lettersLeft = lettersLeft / encodingCopy.size(); 431 lettersSoFar++; 432 } 433 434 encodedLetterLength = lettersSoFar + 1; 435 436 final AlphabetConverter ac = new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength); 437 438 ac.addSingleEncoding(encodedLetterLength, "", encodingCopy, originalCopy.iterator(), doNotEncodeMap); 439 440 return ac; 441 } 442 } 443 444 /** 445 * Create new String that contains just the given code point. 446 * 447 * @param i code point 448 * @return a new string with the new code point 449 * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html" 450 */ 451 private static String codePointToString(final int i) { 452 if (Character.charCount(i) == 1) { 453 return String.valueOf((char) i); 454 } 455 return new String(Character.toChars(i)); 456 } 457}