/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.net;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.BitSet;

import org.apache.commons.codec.BinaryDecoder;
import org.apache.commons.codec.BinaryEncoder;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringDecoder;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.binary.StringUtils;

/**
 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
 * <p>
 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
 * to ensure the integrity of the data should the message pass through a character-translating, and/or line-wrapping
 * gateway.
 * </p>
 * <p>
 * Note:
 * </p>
 * <p>
 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
 * quoted-printable spec:
 * </p>
 * <ul>
 *   <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
 *   <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
 * </ul>
 * <p>
 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
 * Q codec. The strict mode has been added in 1.10.
 * </p>
 * <p>
 * This class is immutable and thread-safe.
 * </p>
 *
 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
 *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
 *
 * @since 1.3
 */
public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
    /**
     * BitSet of printable characters as defined in RFC 1521.
     */
    private static final BitSet PRINTABLE_CHARS = new BitSet(256);

    private static final byte ESCAPE_CHAR = '=';

    private static final byte TAB = 9;

    private static final byte SPACE = 32;

    private static final byte CR = 13;

    private static final byte LF = 10;

    /**
     * Minimum length required for the byte arrays used by encodeQuotedPrintable method
     */
    private static final int MIN_BYTES = 3;

    /**
     * Safe line length for quoted printable encoded text.
     */
    private static final int SAFE_LENGTH = 73;

    /**
     * Message used whenever an escape sequence is cut short by the end of the input.
     */
    private static final String INVALID_ENCODING = "Invalid quoted-printable encoding";

    // Static initializer for printable chars collection
    static {
        // ASCII 33 ('!') through 126 ('~'), skipping 61 ('=') because it is the escape character
        for (int i = 33; i <= 60; i++) {
            PRINTABLE_CHARS.set(i);
        }
        for (int i = 62; i <= 126; i++) {
            PRINTABLE_CHARS.set(i);
        }
        // whitespace is printable as well (rule #3 handles it at line ends in strict mode)
        PRINTABLE_CHARS.set(TAB);
        PRINTABLE_CHARS.set(SPACE);
    }

    /**
     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are
     * converted back to their original representation.
     * <p>
     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
     * defined in RFC 1521.
     * </p>
     * <p>
     * An escape character directly followed by CR is treated as a soft line break and dropped; unescaped CR and LF
     * octets are dropped as well.
     * </p>
     *
     * @param bytes
     *            array of quoted-printable characters
     * @return array of original bytes, or {@code null} if {@code bytes} is {@code null}
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful, e.g. when an escape sequence is truncated
     *             by the end of the input or contains a character that is not a hexadecimal digit
     */
    public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
        if (bytes == null) {
            return null;
        }
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        for (int i = 0; i < bytes.length; i++) {
            final int b = bytes[i];
            if (b == ESCAPE_CHAR) {
                // an escape character must be followed by at least one more octet
                if (++i >= bytes.length) {
                    throw new DecoderException(INVALID_ENCODING);
                }
                // if the next octet is a CR we have found a soft line break
                if (bytes[i] == CR) {
                    continue;
                }
                final int u = Utils.digit16(bytes[i]);
                // the second hex digit must be present as well
                if (++i >= bytes.length) {
                    throw new DecoderException(INVALID_ENCODING);
                }
                final int l = Utils.digit16(bytes[i]);
                buffer.write((char) ((u << 4) + l));
            } else if (b != CR && b != LF) {
                // every other octet is appended except for CR & LF
                buffer.write(b);
            }
        }
        return buffer.toByteArray();
    }

    /**
     * Write a byte to the buffer.
     *
     * @param b
     *            byte to write
     * @param encode
     *            indicates whether the octet shall be encoded
     * @param buffer
     *            the buffer to write to
     * @return the number of bytes that have been written to the buffer
     */
    private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) {
        if (encode) {
            return encodeQuotedPrintable(b, buffer);
        }
        buffer.write(b);
        return 1;
    }

    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     *
     * @param printable
     *            bitset of characters deemed quoted-printable; if {@code null} the default set is used
     * @param bytes
     *            array of bytes to be encoded
     * @return array of bytes containing quoted-printable data, or {@code null} if {@code bytes} is {@code null}
     */
    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
        return encodeQuotedPrintable(printable, bytes, false);
    }

    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     *
     * @param printable
     *            bitset of characters deemed quoted-printable; if {@code null} the default set is used
     * @param bytes
     *            array of bytes to be encoded
     * @param strict
     *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
     * @return array of bytes containing quoted-printable data; {@code null} if {@code bytes} is {@code null}, and
     *         also {@code null} in strict mode when {@code bytes} holds fewer than three octets
     * @since 1.10
     */
    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes,
            final boolean strict) {
        if (bytes == null) {
            return null;
        }
        final BitSet safeChars = printable != null ? printable : PRINTABLE_CHARS;
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        final int bytesLength = bytes.length;

        if (!strict) {
            // rules #1 and #2 only: every octet is either copied or escaped, no line handling
            for (int i = 0; i < bytesLength; i++) {
                final int b = getUnsignedOctet(i, bytes);
                encodeByte(b, !safeChars.get(b), buffer);
            }
            return buffer.toByteArray();
        }

        // the strict algorithm treats the last three octets separately and needs at least that many
        if (bytesLength < MIN_BYTES) {
            return null;
        }

        int pos = 1;
        // encode up to buffer.length - 3, the last three octets will be treated
        // separately for simplification of note #3
        // NOTE(review): a byte written while pos == SAFE_LENGTH - 1 can advance pos by 3, and the following
        // byte is then written unconditionally before the soft break, so a line can apparently grow to 78
        // characters - confirm against the 76 character limit of rule #5.
        for (int i = 0; i < bytesLength - 3; i++) {
            final int b = getUnsignedOctet(i, bytes);
            if (pos < SAFE_LENGTH) {
                // up to this length it is safe to add any byte, encoded or not
                pos += encodeByte(b, !safeChars.get(b), buffer);
            } else {
                // rule #3: whitespace at the end of a line *must* be encoded
                encodeByte(b, !safeChars.get(b) || isWhitespace(b), buffer);

                // rule #5: soft line break
                writeSoftLineBreak(buffer);
                pos = 1;
            }
        }

        // rule #3: whitespace at the end of a line *must* be encoded
        // if we would do a soft break line after this octet, encode whitespace
        int b = getUnsignedOctet(bytesLength - 3, bytes);
        boolean encode = !safeChars.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
        pos += encodeByte(b, encode, buffer);

        // note #3: '=' *must not* be the ultimate or penultimate character
        // simplification: if < 6 bytes left, do a soft line break as we may need
        // exactly 6 bytes space for the last 2 bytes
        if (pos > SAFE_LENGTH - 2) {
            writeSoftLineBreak(buffer);
        }
        for (int i = bytesLength - 2; i < bytesLength; i++) {
            b = getUnsignedOctet(i, bytes);
            // rule #3: trailing whitespace shall be encoded (only the very last octet qualifies)
            encode = !safeChars.get(b) || (i > bytesLength - 2 && isWhitespace(b));
            encodeByte(b, encode, buffer);
        }
        return buffer.toByteArray();
    }

    /**
     * Encodes byte into its quoted-printable representation.
     *
     * @param b
     *            byte to encode
     * @param buffer
     *            the buffer to write to
     * @return The number of bytes written to the {@code buffer}
     */
    private static int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
        buffer.write(ESCAPE_CHAR);
        final char hex1 = Utils.hexDigit(b >> 4);
        final char hex2 = Utils.hexDigit(b);
        buffer.write(hex1);
        buffer.write(hex2);
        return 3;
    }

    /**
     * Gets the byte at position {@code index} of the byte array and
     * make sure it is unsigned.
     *
     * @param index
     *            position in the array
     * @param bytes
     *            the byte array
     * @return the unsigned octet at position {@code index} from the array
     */
    private static int getUnsignedOctet(final int index, final byte[] bytes) {
        // masking maps the signed range -128..-1 onto 128..255
        return bytes[index] & 0xFF;
    }

    /**
     * Checks whether the given byte is whitespace.
     *
     * @param b
     *            byte to be checked
     * @return {@code true} if the byte is either a space or tab character
     */
    private static boolean isWhitespace(final int b) {
        return b == SPACE || b == TAB;
    }

    /**
     * Writes a soft line break (rule #5), i.e. the escape character followed by CR and LF.
     *
     * @param buffer
     *            the buffer to write to
     */
    private static void writeSoftLineBreak(final ByteArrayOutputStream buffer) {
        buffer.write(ESCAPE_CHAR);
        buffer.write(CR);
        buffer.write(LF);
    }

    /**
     * The default Charset used for string decoding and encoding.
     */
    private final Charset charset;

    /**
     * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
     */
    private final boolean strict;

    /**
     * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
     */
    public QuotedPrintableCodec() {
        this(StandardCharsets.UTF_8, false);
    }

    /**
     * Constructor which allows for the selection of the strict mode.
     *
     * @param strict
     *            if {@code true}, soft line breaks will be used
     * @since 1.10
     */
    public QuotedPrintableCodec(final boolean strict) {
        this(StandardCharsets.UTF_8, strict);
    }

    /**
     * Constructor which allows for the selection of a default Charset.
     *
     * @param charset
     *            the default string Charset to use.
     * @since 1.7
     */
    public QuotedPrintableCodec(final Charset charset) {
        this(charset, false);
    }

    /**
     * Constructor which allows for the selection of a default Charset and strict mode.
     *
     * @param charset
     *            the default string Charset to use.
     * @param strict
     *            if {@code true}, soft line breaks will be used
     * @since 1.10
     */
    public QuotedPrintableCodec(final Charset charset, final boolean strict) {
        this.charset = charset;
        this.strict = strict;
    }

    /**
     * Constructor which allows for the selection of a default Charset.
     *
     * @param charsetName
     *            the default string Charset to use.
     * @throws UnsupportedCharsetException
     *             If no support for the named Charset is available
     *             in this instance of the Java virtual machine
     * @throws IllegalArgumentException
     *             If the given charsetName is null
     * @throws IllegalCharsetNameException
     *             If the given Charset name is illegal
     *
     * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
     */
    public QuotedPrintableCodec(final String charsetName)
            throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
        this(Charset.forName(charsetName), false);
    }

    /**
     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are
     * converted back to their original representation.
     * <p>
     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
     * defined in RFC 1521.
     * </p>
     *
     * @param bytes
     *            array of quoted-printable characters
     * @return array of original bytes, or {@code null} if {@code bytes} is {@code null}
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     */
    @Override
    public byte[] decode(final byte[] bytes) throws DecoderException {
        return decodeQuotedPrintable(bytes);
    }

    /**
     * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their
     * original representation.
     *
     * @param obj
     *            quoted-printable object to convert into its original form
     * @return original object, or {@code null} if {@code obj} is {@code null}
     * @throws DecoderException
     *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
     *             condition is encountered during the decode process.
     */
    @Override
    public Object decode(final Object obj) throws DecoderException {
        if (obj == null) {
            return null;
        }
        if (obj instanceof byte[]) {
            return decode((byte[]) obj);
        }
        if (obj instanceof String) {
            return decode((String) obj);
        }
        throw new DecoderException("Objects of type " +
              obj.getClass().getName() +
              " cannot be quoted-printable decoded");
    }

    /**
     * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters
     * are converted back to their original representation.
     *
     * @param sourceStr
     *            quoted-printable string to convert into its original form
     * @return original string, or {@code null} if {@code sourceStr} is {@code null}
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
     * @see #getCharset()
     */
    @Override
    public String decode(final String sourceStr) throws DecoderException {
        return this.decode(sourceStr, this.getCharset());
    }

    /**
     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
     * are converted back to their original representation.
     *
     * @param sourceStr
     *            quoted-printable string to convert into its original form
     * @param sourceCharset
     *            the original string Charset
     * @return original string, or {@code null} if {@code sourceStr} is {@code null}
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     * @since 1.7
     */
    public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
        if (sourceStr == null) {
            return null;
        }
        return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
    }

    /**
     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
     * are converted back to their original representation.
     *
     * @param sourceStr
     *            quoted-printable string to convert into its original form
     * @param sourceCharset
     *            the name of the original string Charset
     * @return original string, or {@code null} if {@code sourceStr} is {@code null}
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     * @throws UnsupportedEncodingException
     *             Thrown if Charset is not supported
     */
    public String decode(final String sourceStr, final String sourceCharset)
            throws DecoderException, UnsupportedEncodingException {
        if (sourceStr == null) {
            return null;
        }
        return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
    }

    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     *
     * @param bytes
     *            array of bytes to be encoded
     * @return array of bytes containing quoted-printable data; {@code null} if {@code bytes} is {@code null}, and
     *         also {@code null} in strict mode when {@code bytes} holds fewer than three octets
     */
    @Override
    public byte[] encode(final byte[] bytes) {
        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
    }

    /**
     * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
     *
     * @param obj
     *            string to convert to a quoted-printable form
     * @return quoted-printable object, or {@code null} if {@code obj} is {@code null}
     * @throws EncoderException
     *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
     *             unsuccessful
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (obj == null) {
            return null;
        }
        if (obj instanceof byte[]) {
            return encode((byte[]) obj);
        }
        if (obj instanceof String) {
            return encode((String) obj);
        }
        throw new EncoderException("Objects of type " +
              obj.getClass().getName() +
              " cannot be quoted-printable encoded");
    }

    /**
     * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     *
     * @param sourceStr
     *            string to convert to quoted-printable form
     * @return quoted-printable string, or {@code null} if {@code sourceStr} is {@code null}
     * @throws EncoderException
     *             Thrown if quoted-printable encoding is unsuccessful
     *
     * @see #getCharset()
     */
    @Override
    public String encode(final String sourceStr) throws EncoderException {
        return this.encode(sourceStr, getCharset());
    }

    /**
     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     *
     * @param sourceStr
     *            string to convert to quoted-printable form
     * @param sourceCharset
     *            the Charset for sourceStr
     * @return quoted-printable string, or {@code null} if {@code sourceStr} is {@code null}
     * @since 1.7
     */
    public String encode(final String sourceStr, final Charset sourceCharset) {
        if (sourceStr == null) {
            return null;
        }
        return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
    }

    /**
     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     *
     * @param sourceStr
     *            string to convert to quoted-printable form
     * @param sourceCharset
     *            the name of the Charset for sourceStr
     * @return quoted-printable string, or {@code null} if {@code sourceStr} is {@code null}
     * @throws UnsupportedEncodingException
     *             Thrown if the Charset is not supported
     */
    public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
        if (sourceStr == null) {
            return null;
        }
        return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
    }

    /**
     * Gets the default Charset used for string decoding and encoding.
     *
     * @return the default Charset
     * @since 1.7
     */
    public Charset getCharset() {
        return this.charset;
    }

    /**
     * Gets the default Charset name used for string decoding and encoding.
     *
     * @return the default Charset name
     */
    public String getDefaultCharset() {
        return this.charset.name();
    }
}