001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.net; 019 020import java.io.ByteArrayOutputStream; 021import java.io.UnsupportedEncodingException; 022import java.nio.charset.Charset; 023import java.nio.charset.IllegalCharsetNameException; 024import java.nio.charset.StandardCharsets; 025import java.nio.charset.UnsupportedCharsetException; 026import java.util.BitSet; 027 028import org.apache.commons.codec.BinaryDecoder; 029import org.apache.commons.codec.BinaryEncoder; 030import org.apache.commons.codec.DecoderException; 031import org.apache.commons.codec.EncoderException; 032import org.apache.commons.codec.StringDecoder; 033import org.apache.commons.codec.StringEncoder; 034import org.apache.commons.codec.binary.StringUtils; 035 036/** 037 * Codec for the Quoted-Printable section of <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>. 038 * <p> 039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to printable characters in the ASCII character 040 * set. It encodes the data in such a way that the resulting octets are unlikely to be modified by mail transport. If the data being encoded are mostly ASCII 041 * text, the encoded form of the data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable to ensure 042 * the integrity of the data should the message pass through a character- translating, and/or line-wrapping gateway. 043 * </p> 044 * <p> 045 * Note: 046 * </p> 047 * <p> 048 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the quoted-printable spec: 049 * </p> 050 * <ul> 051 * <li>{@code strict=false}: only rules #1 and #2 are implemented</li> 052 * <li>{@code strict=true}: all rules #1 through #5 are implemented</li> 053 * </ul> 054 * <p> 055 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used for certain applications that do not 056 * require quoted-printable line formatting (rules #3, #4, #5), for instance Q codec. The strict mode has been added in 1.10. 057 * </p> 058 * <p> 059 * This class is immutable and thread-safe. 060 * </p> 061 * 062 * @see <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: Mechanisms for Specifying and Describing 063 * the Format of Internet Message Bodies </a> 064 * 065 * @since 1.3 066 */ 067public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder { 068 069 /** 070 * BitSet of printable characters as defined in RFC 1521. 071 */ 072 private static final BitSet PRINTABLE_CHARS = new BitSet(256); 073 private static final byte ESCAPE_CHAR = '='; 074 private static final byte TAB = 9; 075 private static final byte SPACE = 32; 076 private static final byte CR = 13; 077 private static final byte LF = 10; 078 079 /** 080 * Minimum length required for the byte arrays used by encodeQuotedPrintable method. 081 */ 082 private static final int MIN_BYTES = 3; 083 084 /** 085 * Safe line length for quoted printable encoded text. 086 */ 087 private static final int SAFE_LENGTH = 73; 088 089 // Static initializer for printable chars collection 090 static { 091 // alpha characters 092 for (int i = 33; i <= 60; i++) { 093 PRINTABLE_CHARS.set(i); 094 } 095 for (int i = 62; i <= 126; i++) { 096 PRINTABLE_CHARS.set(i); 097 } 098 PRINTABLE_CHARS.set(TAB); 099 PRINTABLE_CHARS.set(SPACE); 100 } 101 102 /** 103 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted back to their original representation. 104 * <p> 105 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as defined in RFC 1521. 106 * </p> 107 * 108 * @param bytes array of quoted-printable characters. 109 * @return array of original bytes. 110 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. 111 */ 112 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException { 113 if (bytes == null) { 114 return null; 115 } 116 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 117 for (int i = 0; i < bytes.length; i++) { 118 final int b = bytes[i]; 119 if (b == ESCAPE_CHAR) { 120 try { 121 // if the next octet is a CR we have found a soft line break 122 if (bytes[++i] == CR) { 123 continue; 124 } 125 final int u = Utils.digit16(bytes[i]); 126 final int l = Utils.digit16(bytes[++i]); 127 buffer.write((char) ((u << 4) + l)); 128 } catch (final ArrayIndexOutOfBoundsException e) { 129 throw new DecoderException("Invalid quoted-printable encoding", e); 130 } 131 } else if (b != CR && b != LF) { 132 // every other octet is appended except for CR & LF 133 buffer.write(b); 134 } 135 } 136 return buffer.toByteArray(); 137 } 138 139 /** 140 * Encodes a byte in the buffer. 141 * 142 * @param b byte to write. 143 * @param encode indicates whether the octet shall be encoded. 144 * @param buffer the buffer to write to. 145 * @return the number of bytes that have been written to the buffer. 146 */ 147 private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) { 148 if (encode) { 149 return encodeQuotedPrintable(b, buffer); 150 } 151 buffer.write(b); 152 return 1; 153 } 154 155 /** 156 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 157 * <p> 158 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding 159 * binary data and unformatted text. 160 * </p> 161 * 162 * @param printable bitset of characters deemed quoted-printable. 163 * @param bytes array of bytes to be encoded. 164 * @return array of bytes containing quoted-printable data. 165 */ 166 public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) { 167 return encodeQuotedPrintable(printable, bytes, false); 168 } 169 170 /** 171 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 172 * <p> 173 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable 174 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text. 175 * </p> 176 * 177 * @param printable bitset of characters deemed quoted-printable. 178 * @param bytes array of bytes to be encoded. 179 * @param strict if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2. 180 * @return array of bytes containing quoted-printable data. 181 * @since 1.10 182 */ 183 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) { 184 if (bytes == null) { 185 return null; 186 } 187 if (printable == null) { 188 printable = PRINTABLE_CHARS; 189 } 190 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 191 final int bytesLength = bytes.length; 192 if (strict) { 193 if (bytesLength < MIN_BYTES) { 194 return null; 195 } 196 int pos = 1; 197 // encode up to buffer.length - 3, the last three octets will be treated 198 // separately for simplification of note #3 199 for (int i = 0; i < bytesLength - 3; i++) { 200 final int b = getUnsignedOctet(i, bytes); 201 if (pos < SAFE_LENGTH) { 202 // up to this length it is safe to add any byte, encoded or not 203 pos += encodeByte(b, !printable.get(b), buffer); 204 } else { 205 // rule #3: whitespace at the end of a line *must* be encoded 206 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer); 207 // rule #5: soft line break 208 buffer.write(ESCAPE_CHAR); 209 buffer.write(CR); 210 buffer.write(LF); 211 pos = 1; 212 } 213 } 214 // rule #3: whitespace at the end of a line *must* be encoded 215 // if we would do a soft break line after this octet, encode whitespace 216 int b = getUnsignedOctet(bytesLength - 3, bytes); 217 boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5; 218 pos += encodeByte(b, encode, buffer); 219 // note #3: '=' *must not* be the ultimate or penultimate character 220 // simplification: if < 6 bytes left, do a soft line break as we may need 221 // exactly 6 bytes space for the last 2 bytes 222 if (pos > SAFE_LENGTH - 2) { 223 buffer.write(ESCAPE_CHAR); 224 buffer.write(CR); 225 buffer.write(LF); 226 } 227 for (int i = bytesLength - 2; i < bytesLength; i++) { 228 b = getUnsignedOctet(i, bytes); 229 // rule #3: trailing whitespace shall be encoded 230 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b); 231 encodeByte(b, encode, buffer); 232 } 233 } else { 234 for (final byte c : bytes) { 235 int b = c; 236 if (b < 0) { 237 b = 256 + b; 238 } 239 if (printable.get(b)) { 240 buffer.write(b); 241 } else { 242 encodeQuotedPrintable(b, buffer); 243 } 244 } 245 } 246 return buffer.toByteArray(); 247 } 248 249 /** 250 * Encodes byte into its quoted-printable representation. 251 * 252 * @param b byte to encode. 253 * @param buffer the buffer to write to. 254 * @return The number of bytes written to the {@code buffer}. 255 */ 256 private static int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) { 257 buffer.write(ESCAPE_CHAR); 258 final char hex1 = Utils.hexChar(b >> 4); 259 final char hex2 = Utils.hexChar(b); 260 buffer.write(hex1); 261 buffer.write(hex2); 262 return 3; 263 } 264 265 /** 266 * Gets the byte at position {@code index} of the byte array and make sure it is unsigned. 267 * 268 * @param index position in the array. 269 * @param bytes the byte array. 270 * @return the unsigned octet at position {@code index} from the array. 271 */ 272 private static int getUnsignedOctet(final int index, final byte[] bytes) { 273 int b = bytes[index]; 274 if (b < 0) { 275 b = 256 + b; 276 } 277 return b; 278 } 279 280 /** 281 * Checks whether the given byte is whitespace. 282 * 283 * @param b byte to be checked. 284 * @return {@code true} if the byte is either a space or tab character. 285 */ 286 private static boolean isWhitespace(final int b) { 287 return b == SPACE || b == TAB; 288 } 289 290 /** 291 * The default Charset used for string decoding and encoding. 292 */ 293 private final Charset charset; 294 295 /** 296 * Indicates whether soft line breaks shall be used during encoding (rule #3-5). 297 */ 298 private final boolean strict; 299 300 /** 301 * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8} 302 */ 303 public QuotedPrintableCodec() { 304 this(StandardCharsets.UTF_8, false); 305 } 306 307 /** 308 * Constructor which allows for the selection of the strict mode. 309 * 310 * @param strict if {@code true}, soft line breaks will be used. 311 * @since 1.10 312 */ 313 public QuotedPrintableCodec(final boolean strict) { 314 this(StandardCharsets.UTF_8, strict); 315 } 316 317 /** 318 * Constructor which allows for the selection of a default Charset. 319 * 320 * @param charset the default string Charset to use. 321 * @since 1.7 322 */ 323 public QuotedPrintableCodec(final Charset charset) { 324 this(charset, false); 325 } 326 327 /** 328 * Constructor which allows for the selection of a default Charset and strict mode. 329 * 330 * @param charset the default string Charset to use. 331 * @param strict if {@code true}, soft line breaks will be used. 332 * @since 1.10 333 */ 334 public QuotedPrintableCodec(final Charset charset, final boolean strict) { 335 this.charset = charset; 336 this.strict = strict; 337 } 338 339 /** 340 * Constructor which allows for the selection of a default Charset. 341 * 342 * @param charsetName the default string Charset to use. 343 * @throws UnsupportedCharsetException If no support for the named Charset is available in this instance of the Java virtual machine. 344 * @throws IllegalArgumentException If the given charsetName is null. 345 * @throws IllegalCharsetNameException If the given Charset name is illegal. 346 * 347 * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable 348 */ 349 public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException { 350 this(Charset.forName(charsetName), false); 351 } 352 353 /** 354 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted back to their original representation. 355 * <p> 356 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as defined in RFC 1521. 357 * </p> 358 * 359 * @param bytes array of quoted-printable characters. 360 * @return array of original bytes. 361 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. 362 */ 363 @Override 364 public byte[] decode(final byte[] bytes) throws DecoderException { 365 return decodeQuotedPrintable(bytes); 366 } 367 368 /** 369 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original representation. 370 * 371 * @param obj quoted-printable object to convert into its original form. 372 * @return original object. 373 * @throws DecoderException Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure condition is encountered during the decode 374 * process. 375 */ 376 @Override 377 public Object decode(final Object obj) throws DecoderException { 378 if (obj == null) { 379 return null; 380 } 381 if (obj instanceof byte[]) { 382 return decode((byte[]) obj); 383 } 384 if (obj instanceof String) { 385 return decode((String) obj); 386 } 387 throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded"); 388 } 389 390 /** 391 * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are converted back to their original 392 * representation. 393 * 394 * @param sourceStr quoted-printable string to convert into its original form. 395 * @return original string. 396 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported. 397 * @see #getCharset() 398 */ 399 @Override 400 public String decode(final String sourceStr) throws DecoderException { 401 return this.decode(sourceStr, getCharset()); 402 } 403 404 /** 405 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters are converted back to their original 406 * representation. 407 * 408 * @param sourceStr quoted-printable string to convert into its original form. 409 * @param sourceCharset the original string Charset. 410 * @return original string. 411 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. 412 * @since 1.7 413 */ 414 public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException { 415 if (sourceStr == null) { 416 return null; 417 } 418 return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset); 419 } 420 421 /** 422 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters are converted back to their original 423 * representation. 424 * 425 * @param sourceStr quoted-printable string to convert into its original form. 426 * @param sourceCharset the original string Charset. 427 * @return original string. 428 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. 429 * @throws UnsupportedEncodingException Thrown if Charset is not supported. 430 */ 431 public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException { 432 if (sourceStr == null) { 433 return null; 434 } 435 return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset); 436 } 437 438 /** 439 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 440 * <p> 441 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable 442 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text. 443 * </p> 444 * 445 * @param bytes array of bytes to be encoded. 446 * @return array of bytes containing quoted-printable data. 447 */ 448 @Override 449 public byte[] encode(final byte[] bytes) { 450 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict); 451 } 452 453 /** 454 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped. 455 * 456 * @param obj string to convert to a quoted-printable form. 457 * @return quoted-printable object. 458 * @throws EncoderException Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is unsuccessful. 459 */ 460 @Override 461 public Object encode(final Object obj) throws EncoderException { 462 if (obj == null) { 463 return null; 464 } 465 if (obj instanceof byte[]) { 466 return encode((byte[]) obj); 467 } 468 if (obj instanceof String) { 469 return encode((String) obj); 470 } 471 throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded"); 472 } 473 474 /** 475 * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped. 476 * <p> 477 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable 478 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text. 479 * </p> 480 * 481 * @param sourceStr string to convert to quoted-printable form. 482 * @return quoted-printable string. 483 * @throws EncoderException Thrown if quoted-printable encoding is unsuccessful. 484 * 485 * @see #getCharset() 486 */ 487 @Override 488 public String encode(final String sourceStr) throws EncoderException { 489 return encode(sourceStr, getCharset()); 490 } 491 492 /** 493 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped. 494 * <p> 495 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable 496 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text. 497 * </p> 498 * 499 * @param sourceStr string to convert to quoted-printable form. 500 * @param sourceCharset the Charset for sourceStr. 501 * @return quoted-printable string. 502 * @since 1.7 503 */ 504 public String encode(final String sourceStr, final Charset sourceCharset) { 505 if (sourceStr == null) { 506 return null; 507 } 508 return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset))); 509 } 510 511 /** 512 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped. 513 * <p> 514 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable 515 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text. 516 * </p> 517 * 518 * @param sourceStr string to convert to quoted-printable form. 519 * @param sourceCharset the Charset for sourceStr. 520 * @return quoted-printable string. 521 * @throws UnsupportedEncodingException Thrown if the Charset is not supported. 522 */ 523 public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException { 524 if (sourceStr == null) { 525 return null; 526 } 527 return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset))); 528 } 529 530 /** 531 * Gets the default Charset name used for string decoding and encoding. 532 * 533 * @return the default Charset name. 534 * @since 1.7 535 */ 536 public Charset getCharset() { 537 return this.charset; 538 } 539 540 /** 541 * Gets the default Charset name used for string decoding and encoding. 542 * 543 * @return the default Charset name. 544 */ 545 public String getDefaultCharset() { 546 return this.charset.name(); 547 } 548}