001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.io.UnsupportedEncodingException; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedHashMap; 026import java.util.LinkedHashSet; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.Objects; 030import java.util.Set; 031 032import org.apache.commons.lang3.ArrayUtils; 033import org.apache.commons.lang3.StringUtils; 034 035/** 036 * <p> 037 * Convert from one alphabet to another, with the possibility of leaving certain 038 * characters unencoded. 039 * </p> 040 * 041 * <p> 042 * The target and 'do not encode' languages must be in the Unicode BMP, but the 043 * source language does not. 044 * </p> 045 * 046 * <p> 047 * The encoding will all be of a fixed length, except for the 'do not encode' 048 * chars, which will be of length 1 049 * </p> 050 * 051 * <h2>Sample usage</h2> 052 * 053 * <pre> 054 * Character[] originals; // a, b, c, d 055 * Character[] encoding; // 0, 1, d 056 * Character[] doNotEncode; // d 057 * 058 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, 059 * encoding, doNotEncode); 060 * 061 * ac.encode("a"); // 00 062 * ac.encode("b"); // 01 063 * ac.encode("c"); // 0d 064 * ac.encode("d"); // d 065 * ac.encode("abcd"); // 00010dd 066 * </pre> 067 * 068 * <p> 069 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not 070 * change internal state. 071 * </p> 072 * 073 * @since 1.0 074 */ 075public final class AlphabetConverter { 076 077 /** 078 * Arrow constant, used for converting the object into a string. 079 */ 080 private static final String ARROW = " -> "; 081 082 /** 083 * Creates new String that contains just the given code point. 084 * 085 * @param i code point. 086 * @return a new string with the new code point. 087 * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html" 088 */ 089 private static String codePointToString(final int i) { 090 if (Character.charCount(i) == 1) { 091 return String.valueOf((char) i); 092 } 093 return new String(Character.toChars(i)); 094 } 095 096 /** 097 * Converts characters to integers. 098 * 099 * @param chars array of characters. 100 * @return an equivalent array of integers. 101 */ 102 private static Integer[] convertCharsToIntegers(final Character[] chars) { 103 if (ArrayUtils.isEmpty(chars)) { 104 return ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY; 105 } 106 final Integer[] integers = new Integer[chars.length]; 107 Arrays.setAll(integers, i -> (int) chars[i]); 108 return integers; 109 } 110 111 /** 112 * Creates an alphabet converter, for converting from the original alphabet, to the encoded alphabet, while leaving the characters in <em>doNotEncode</em> 113 * as they are (if possible). 114 * 115 * <p> 116 * Duplicate letters in either original or encoding will be ignored. 117 * </p> 118 * 119 * @param original an array of ints representing the original alphabet in code points. 120 * @param encoding an array of ints representing the alphabet to be used for encoding, in code points. 121 * @param doNotEncode an array of ints representing the chars to be encoded using the original alphabet - every char here must appear in both the previous 122 * params. 123 * @return The AlphabetConverter. 124 * @throws IllegalArgumentException if an AlphabetConverter cannot be constructed. 125 */ 126 public static AlphabetConverter createConverter( 127 final Integer[] original, 128 final Integer[] encoding, 129 final Integer[] doNotEncode) { 130 final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original)); 131 final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding)); 132 final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode)); 133 134 final Map<Integer, String> originalToEncoded = new LinkedHashMap<>(); 135 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 136 final Map<Integer, String> doNotEncodeMap = new HashMap<>(); 137 138 final int encodedLetterLength; 139 140 for (final int i : doNotEncodeCopy) { 141 if (!originalCopy.contains(i)) { 142 throw new IllegalArgumentException( 143 "Can not use 'do not encode' list because original " 144 + "alphabet does not contain '" 145 + codePointToString(i) + "'"); 146 } 147 148 if (!encodingCopy.contains(i)) { 149 throw new IllegalArgumentException( 150 "Can not use 'do not encode' list because encoding alphabet does not contain '" 151 + codePointToString(i) + "'"); 152 } 153 154 doNotEncodeMap.put(i, codePointToString(i)); 155 } 156 157 if (encodingCopy.size() >= originalCopy.size()) { 158 encodedLetterLength = 1; 159 160 final Iterator<Integer> it = encodingCopy.iterator(); 161 162 for (final int originalLetter : originalCopy) { 163 final String originalLetterAsString = codePointToString(originalLetter); 164 165 if (doNotEncodeMap.containsKey(originalLetter)) { 166 originalToEncoded.put(originalLetter, originalLetterAsString); 167 encodedToOriginal.put(originalLetterAsString, originalLetterAsString); 168 } else { 169 Integer next = it.next(); 170 171 while (doNotEncodeCopy.contains(next)) { 172 next = it.next(); 173 } 174 175 final String encodedLetter = codePointToString(next); 176 177 originalToEncoded.put(originalLetter, encodedLetter); 178 encodedToOriginal.put(encodedLetter, originalLetterAsString); 179 } 180 } 181 182 return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength); 183 184 } 185 if (encodingCopy.size() - doNotEncodeCopy.size() < 2) { 186 throw new IllegalArgumentException( 187 "Must have at least two encoding characters (excluding " 188 + "those in the 'do not encode' list), but has " 189 + (encodingCopy.size() - doNotEncodeCopy.size())); 190 } 191 // we start with one which is our minimum, and because we do the 192 // first division outside the loop 193 int lettersSoFar = 1; 194 195 // the first division takes into account that the doNotEncode 196 // letters can't be in the leftmost place 197 int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size()) 198 / (encodingCopy.size() - doNotEncodeCopy.size()); 199 200 while (lettersLeft / encodingCopy.size() >= 1) { 201 lettersLeft /= encodingCopy.size(); 202 lettersSoFar++; 203 } 204 205 encodedLetterLength = lettersSoFar + 1; 206 207 final AlphabetConverter ac = 208 new AlphabetConverter(originalToEncoded, 209 encodedToOriginal, 210 encodedLetterLength); 211 212 ac.addSingleEncoding(encodedLetterLength, 213 StringUtils.EMPTY, 214 encodingCopy, 215 originalCopy.iterator(), 216 doNotEncodeMap); 217 218 return ac; 219 } 220 221 /** 222 * Creates an alphabet converter, for converting from the original alphabet, 223 * to the encoded alphabet, while leaving the characters in 224 * <em>doNotEncode</em> as they are (if possible). 225 * 226 * <p>Duplicate letters in either original or encoding will be ignored.</p> 227 * 228 * @param original an array of chars representing the original alphabet 229 * @param encoding an array of chars representing the alphabet to be used 230 * for encoding 231 * @param doNotEncode an array of chars to be encoded using the original 232 * alphabet - every char here must appear in 233 * both the previous params 234 * @return The AlphabetConverter 235 * @throws IllegalArgumentException if an AlphabetConverter cannot be 236 * constructed 237 */ 238 public static AlphabetConverter createConverterFromChars( 239 final Character[] original, 240 final Character[] encoding, 241 final Character[] doNotEncode) { 242 return createConverter( 243 convertCharsToIntegers(original), 244 convertCharsToIntegers(encoding), 245 convertCharsToIntegers(doNotEncode)); 246 } 247 248 /** 249 * Creates a new converter from a map. 250 * 251 * @param originalToEncoded a map returned from getOriginalToEncoded(). 252 * @return The reconstructed AlphabetConverter. 253 * @see AlphabetConverter#getOriginalToEncoded() 254 */ 255 public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) { 256 final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded); 257 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 258 259 int encodedLetterLength = 1; 260 261 for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) { 262 encodedToOriginal.put(e.getValue(), codePointToString(e.getKey())); 263 264 if (e.getValue().length() > encodedLetterLength) { 265 encodedLetterLength = e.getValue().length(); 266 } 267 } 268 269 return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength); 270 } 271 272 /** 273 * Original string to be encoded. 274 */ 275 private final Map<Integer, String> originalToEncoded; 276 277 /** 278 * Encoding alphabet. 279 */ 280 private final Map<String, String> encodedToOriginal; 281 282 /** 283 * Length of the encoded letter. 284 */ 285 private final int encodedLetterLength; 286 287 /** 288 * Hidden constructor for alphabet converter. Used by static helper methods. 289 * 290 * @param originalToEncoded original string to be encoded. 291 * @param encodedToOriginal encoding alphabet. 292 * @param encodedLetterLength length of the encoded letter. 293 */ 294 private AlphabetConverter(final Map<Integer, String> originalToEncoded, 295 final Map<String, String> encodedToOriginal, 296 final int encodedLetterLength) { 297 298 this.originalToEncoded = originalToEncoded; 299 this.encodedToOriginal = encodedToOriginal; 300 this.encodedLetterLength = encodedLetterLength; 301 } 302 303 /** 304 * Recursive method used when creating encoder/decoder. 305 * 306 * @param level at which point it should add a single encoding. 307 * @param currentEncoding current encoding. 308 * @param encoding letters encoding. 309 * @param originals original values. 310 * @param doNotEncodeMap map of values that should not be encoded. 311 */ 312 private void addSingleEncoding(final int level, 313 final String currentEncoding, 314 final Collection<Integer> encoding, 315 final Iterator<Integer> originals, 316 final Map<Integer, String> doNotEncodeMap) { 317 318 if (level > 0) { 319 for (final int encodingLetter : encoding) { 320 if (!originals.hasNext()) { 321 return; // done encoding all the original alphabet 322 } 323 // this skips the doNotEncode chars if they are in the 324 // leftmost place 325 if (level != encodedLetterLength 326 || !doNotEncodeMap.containsKey(encodingLetter)) { 327 addSingleEncoding(level - 1, 328 currentEncoding 329 + codePointToString(encodingLetter), 330 encoding, 331 originals, 332 doNotEncodeMap 333 ); 334 } 335 } 336 } else { 337 Integer next = originals.next(); 338 339 while (doNotEncodeMap.containsKey(next)) { 340 final String originalLetterAsString = codePointToString(next); 341 342 originalToEncoded.put(next, originalLetterAsString); 343 encodedToOriginal.put(originalLetterAsString, 344 originalLetterAsString); 345 346 if (!originals.hasNext()) { 347 return; 348 } 349 350 next = originals.next(); 351 } 352 353 final String originalLetterAsString = codePointToString(next); 354 355 originalToEncoded.put(next, currentEncoding); 356 encodedToOriginal.put(currentEncoding, originalLetterAsString); 357 } 358 } 359 360 /** 361 * Decodes a given string. 362 * 363 * @param encoded a string that has been encoded using this AlphabetConverter. 364 * @return The decoded string, {@code null} if the given string is null. 365 * @throws UnsupportedEncodingException if unexpected characters that cannot be handled are encountered. 366 */ 367 public String decode(final String encoded) 368 throws UnsupportedEncodingException { 369 if (encoded == null) { 370 return null; 371 } 372 373 final StringBuilder result = new StringBuilder(); 374 375 for (int j = 0; j < encoded.length();) { 376 final int i = encoded.codePointAt(j); 377 final String s = codePointToString(i); 378 379 if (s.equals(originalToEncoded.get(i))) { 380 result.append(s); 381 j++; // because we do not encode in Unicode extended the 382 // length of each encoded char is 1 383 } else { 384 if (j + encodedLetterLength > encoded.length()) { 385 throw new UnsupportedEncodingException("Unexpected end " 386 + "of string while decoding " + encoded); 387 } 388 final String nextGroup = encoded.substring(j, 389 j + encodedLetterLength); 390 final String next = encodedToOriginal.get(nextGroup); 391 if (next == null) { 392 throw new UnsupportedEncodingException( 393 "Unexpected string without decoding (" 394 + nextGroup + ") in " + encoded); 395 } 396 result.append(next); 397 j += encodedLetterLength; 398 } 399 } 400 401 return result.toString(); 402 } 403 404 /** 405 * Encodes a given string. 406 * 407 * @param original the string to be encoded. 408 * @return The encoded string, {@code null} if the given string is null. 409 * @throws UnsupportedEncodingException if chars that are not supported are encountered. 410 */ 411 public String encode(final String original) 412 throws UnsupportedEncodingException { 413 if (original == null) { 414 return null; 415 } 416 417 final StringBuilder sb = new StringBuilder(); 418 419 for (int i = 0; i < original.length();) { 420 final int codePoint = original.codePointAt(i); 421 422 final String nextLetter = originalToEncoded.get(codePoint); 423 424 if (nextLetter == null) { 425 throw new UnsupportedEncodingException( 426 "Couldn't find encoding for '" 427 + codePointToString(codePoint) 428 + "' in " 429 + original 430 ); 431 } 432 433 sb.append(nextLetter); 434 435 i += Character.charCount(codePoint); 436 } 437 438 return sb.toString(); 439 } 440 441 @Override 442 public boolean equals(final Object obj) { 443 if (obj == null) { 444 return false; 445 } 446 if (obj == this) { 447 return true; 448 } 449 if (!(obj instanceof AlphabetConverter)) { 450 return false; 451 } 452 final AlphabetConverter other = (AlphabetConverter) obj; 453 return originalToEncoded.equals(other.originalToEncoded) 454 && encodedToOriginal.equals(other.encodedToOriginal) 455 && encodedLetterLength == other.encodedLetterLength; 456 } 457 458 /** 459 * Gets the length of characters in the encoded alphabet that are necessary for each character in the original alphabet. 460 * 461 * @return The length of the encoded char. 462 */ 463 public int getEncodedCharLength() { 464 return encodedLetterLength; 465 } 466 467 /** 468 * Gets the mapping from integer code point of source language to encoded string. Use to reconstruct converter from serialized map. 469 * 470 * @return The original map. 471 */ 472 public Map<Integer, String> getOriginalToEncoded() { 473 return Collections.unmodifiableMap(originalToEncoded); 474 } 475 476 @Override 477 public int hashCode() { 478 return Objects.hash(originalToEncoded, 479 encodedToOriginal, 480 encodedLetterLength); 481 } 482 483 @Override 484 public String toString() { 485 final StringBuilder sb = new StringBuilder(); 486 // @formatter:off 487 originalToEncoded.forEach((k, v) -> 488 sb.append(codePointToString(k)) 489 .append(ARROW) 490 .append(k) 491 .append(System.lineSeparator())); 492 // @formatter:on 493 return sb.toString(); 494 } 495}