001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.net; 019 020import java.io.ByteArrayOutputStream; 021import java.io.UnsupportedEncodingException; 022import java.nio.charset.Charset; 023import java.nio.charset.IllegalCharsetNameException; 024import java.nio.charset.StandardCharsets; 025import java.nio.charset.UnsupportedCharsetException; 026import java.util.BitSet; 027 028import org.apache.commons.codec.BinaryDecoder; 029import org.apache.commons.codec.BinaryEncoder; 030import org.apache.commons.codec.DecoderException; 031import org.apache.commons.codec.EncoderException; 032import org.apache.commons.codec.StringDecoder; 033import org.apache.commons.codec.StringEncoder; 034import org.apache.commons.codec.binary.StringUtils; 035 036/** 037 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>. 038 * <p> 039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to 040 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are 041 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the 042 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable 043 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping 044 * gateway. 045 * </p> 046 * <p> 047 * Note: 048 * </p> 049 * <p> 050 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the 051 * quoted-printable spec: 052 * </p> 053 * <ul> 054 * <li>{@code strict=false}: only rules #1 and #2 are implemented</li> 055 * <li>{@code strict=true}: all rules #1 through #5 are implemented</li> 056 * </ul> 057 * <p> 058 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used 059 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance 060 * Q codec. The strict mode has been added in 1.10. 061 * </p> 062 * <p> 063 * This class is immutable and thread-safe. 064 * </p> 065 * 066 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: 067 * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a> 068 * 069 * @since 1.3 070 */ 071public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder { 072 /** 073 * BitSet of printable characters as defined in RFC 1521. 074 */ 075 private static final BitSet PRINTABLE_CHARS = new BitSet(256); 076 077 private static final byte ESCAPE_CHAR = '='; 078 079 private static final byte TAB = 9; 080 081 private static final byte SPACE = 32; 082 083 private static final byte CR = 13; 084 085 private static final byte LF = 10; 086 087 /** 088 * Minimum length required for the byte arrays used by encodeQuotedPrintable method 089 */ 090 private static final int MIN_BYTES = 3; 091 092 /** 093 * Safe line length for quoted printable encoded text. 094 */ 095 private static final int SAFE_LENGTH = 73; 096 097 // Static initializer for printable chars collection 098 static { 099 // alpha characters 100 for (int i = 33; i <= 60; i++) { 101 PRINTABLE_CHARS.set(i); 102 } 103 for (int i = 62; i <= 126; i++) { 104 PRINTABLE_CHARS.set(i); 105 } 106 PRINTABLE_CHARS.set(TAB); 107 PRINTABLE_CHARS.set(SPACE); 108 } 109 110 /** 111 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted 112 * back to their original representation. 113 * <p> 114 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as 115 * defined in RFC 1521. 116 * </p> 117 * 118 * @param bytes 119 * array of quoted-printable characters 120 * @return array of original bytes 121 * @throws DecoderException 122 * Thrown if quoted-printable decoding is unsuccessful 123 */ 124 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException { 125 if (bytes == null) { 126 return null; 127 } 128 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 129 for (int i = 0; i < bytes.length; i++) { 130 final int b = bytes[i]; 131 if (b == ESCAPE_CHAR) { 132 try { 133 // if the next octet is a CR we have found a soft line break 134 if (bytes[++i] == CR) { 135 continue; 136 } 137 final int u = Utils.digit16(bytes[i]); 138 final int l = Utils.digit16(bytes[++i]); 139 buffer.write((char) ((u << 4) + l)); 140 } catch (final ArrayIndexOutOfBoundsException e) { 141 throw new DecoderException("Invalid quoted-printable encoding", e); 142 } 143 } else if (b != CR && b != LF) { 144 // every other octet is appended except for CR & LF 145 buffer.write(b); 146 } 147 } 148 return buffer.toByteArray(); 149 } 150 151 /** 152 * Encodes a byte in the buffer. 153 * 154 * @param b 155 * byte to write 156 * @param encode 157 * indicates whether the octet shall be encoded 158 * @param buffer 159 * the buffer to write to 160 * @return the number of bytes that have been written to the buffer 161 */ 162 private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) { 163 if (encode) { 164 return encodeQuotedPrintable(b, buffer); 165 } 166 buffer.write(b); 167 return 1; 168 } 169 170 /** 171 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 172 * <p> 173 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 174 * RFC 1521 and is suitable for encoding binary data and unformatted text. 175 * </p> 176 * 177 * @param printable 178 * bitset of characters deemed quoted-printable 179 * @param bytes 180 * array of bytes to be encoded 181 * @return array of bytes containing quoted-printable data 182 */ 183 public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) { 184 return encodeQuotedPrintable(printable, bytes, false); 185 } 186 187 /** 188 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 189 * <p> 190 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 191 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 192 * RFC 1521 and is suitable for encoding binary data and unformatted text. 193 * </p> 194 * 195 * @param printable 196 * bitset of characters deemed quoted-printable 197 * @param bytes 198 * array of bytes to be encoded 199 * @param strict 200 * if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2 201 * @return array of bytes containing quoted-printable data 202 * @since 1.10 203 */ 204 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) { 205 if (bytes == null) { 206 return null; 207 } 208 if (printable == null) { 209 printable = PRINTABLE_CHARS; 210 } 211 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 212 final int bytesLength = bytes.length; 213 214 if (strict) { 215 if (bytesLength < MIN_BYTES) { 216 return null; 217 } 218 219 int pos = 1; 220 // encode up to buffer.length - 3, the last three octets will be treated 221 // separately for simplification of note #3 222 for (int i = 0; i < bytesLength - 3; i++) { 223 final int b = getUnsignedOctet(i, bytes); 224 if (pos < SAFE_LENGTH) { 225 // up to this length it is safe to add any byte, encoded or not 226 pos += encodeByte(b, !printable.get(b), buffer); 227 } else { 228 // rule #3: whitespace at the end of a line *must* be encoded 229 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer); 230 231 // rule #5: soft line break 232 buffer.write(ESCAPE_CHAR); 233 buffer.write(CR); 234 buffer.write(LF); 235 pos = 1; 236 } 237 } 238 239 // rule #3: whitespace at the end of a line *must* be encoded 240 // if we would do a soft break line after this octet, encode whitespace 241 int b = getUnsignedOctet(bytesLength - 3, bytes); 242 boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5; 243 pos += encodeByte(b, encode, buffer); 244 245 // note #3: '=' *must not* be the ultimate or penultimate character 246 // simplification: if < 6 bytes left, do a soft line break as we may need 247 // exactly 6 bytes space for the last 2 bytes 248 if (pos > SAFE_LENGTH - 2) { 249 buffer.write(ESCAPE_CHAR); 250 buffer.write(CR); 251 buffer.write(LF); 252 } 253 for (int i = bytesLength - 2; i < bytesLength; i++) { 254 b = getUnsignedOctet(i, bytes); 255 // rule #3: trailing whitespace shall be encoded 256 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b); 257 encodeByte(b, encode, buffer); 258 } 259 } else { 260 for (final byte c : bytes) { 261 int b = c; 262 if (b < 0) { 263 b = 256 + b; 264 } 265 if (printable.get(b)) { 266 buffer.write(b); 267 } else { 268 encodeQuotedPrintable(b, buffer); 269 } 270 } 271 } 272 return buffer.toByteArray(); 273 } 274 275 /** 276 * Encodes byte into its quoted-printable representation. 277 * 278 * @param b 279 * byte to encode 280 * @param buffer 281 * the buffer to write to 282 * @return The number of bytes written to the {@code buffer} 283 */ 284 private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) { 285 buffer.write(ESCAPE_CHAR); 286 final char hex1 = Utils.hexDigit(b >> 4); 287 final char hex2 = Utils.hexDigit(b); 288 buffer.write(hex1); 289 buffer.write(hex2); 290 return 3; 291 } 292 293 /** 294 * Gets the byte at position {@code index} of the byte array and 295 * make sure it is unsigned. 296 * 297 * @param index 298 * position in the array 299 * @param bytes 300 * the byte array 301 * @return the unsigned octet at position {@code index} from the array 302 */ 303 private static int getUnsignedOctet(final int index, final byte[] bytes) { 304 int b = bytes[index]; 305 if (b < 0) { 306 b = 256 + b; 307 } 308 return b; 309 } 310 311 /** 312 * Checks whether the given byte is whitespace. 313 * 314 * @param b 315 * byte to be checked 316 * @return {@code true} if the byte is either a space or tab character 317 */ 318 private static boolean isWhitespace(final int b) { 319 return b == SPACE || b == TAB; 320 } 321 322 /** 323 * The default Charset used for string decoding and encoding. 324 */ 325 private final Charset charset; 326 327 /** 328 * Indicates whether soft line breaks shall be used during encoding (rule #3-5). 329 */ 330 private final boolean strict; 331 332 /** 333 * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8} 334 */ 335 public QuotedPrintableCodec() { 336 this(StandardCharsets.UTF_8, false); 337 } 338 339 /** 340 * Constructor which allows for the selection of the strict mode. 341 * 342 * @param strict 343 * if {@code true}, soft line breaks will be used 344 * @since 1.10 345 */ 346 public QuotedPrintableCodec(final boolean strict) { 347 this(StandardCharsets.UTF_8, strict); 348 } 349 350 /** 351 * Constructor which allows for the selection of a default Charset. 352 * 353 * @param charset 354 * the default string Charset to use. 355 * @since 1.7 356 */ 357 public QuotedPrintableCodec(final Charset charset) { 358 this(charset, false); 359 } 360 361 /** 362 * Constructor which allows for the selection of a default Charset and strict mode. 363 * 364 * @param charset 365 * the default string Charset to use. 366 * @param strict 367 * if {@code true}, soft line breaks will be used 368 * @since 1.10 369 */ 370 public QuotedPrintableCodec(final Charset charset, final boolean strict) { 371 this.charset = charset; 372 this.strict = strict; 373 } 374 375 /** 376 * Constructor which allows for the selection of a default Charset. 377 * 378 * @param charsetName 379 * the default string Charset to use. 380 * @throws UnsupportedCharsetException 381 * If no support for the named Charset is available 382 * in this instance of the Java virtual machine 383 * @throws IllegalArgumentException 384 * If the given charsetName is null 385 * @throws IllegalCharsetNameException 386 * If the given Charset name is illegal 387 * 388 * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable 389 */ 390 public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException { 391 this(Charset.forName(charsetName), false); 392 } 393 394 /** 395 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted 396 * back to their original representation. 397 * <p> 398 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as 399 * defined in RFC 1521. 400 * </p> 401 * 402 * @param bytes 403 * array of quoted-printable characters 404 * @return array of original bytes 405 * @throws DecoderException 406 * Thrown if quoted-printable decoding is unsuccessful 407 */ 408 @Override 409 public byte[] decode(final byte[] bytes) throws DecoderException { 410 return decodeQuotedPrintable(bytes); 411 } 412 413 /** 414 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original 415 * representation. 416 * 417 * @param obj 418 * quoted-printable object to convert into its original form 419 * @return original object 420 * @throws DecoderException 421 * Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure 422 * condition is encountered during the decode process. 423 */ 424 @Override 425 public Object decode(final Object obj) throws DecoderException { 426 if (obj == null) { 427 return null; 428 } 429 if (obj instanceof byte[]) { 430 return decode((byte[]) obj); 431 } 432 if (obj instanceof String) { 433 return decode((String) obj); 434 } 435 throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded"); 436 } 437 438 /** 439 * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are 440 * converted back to their original representation. 441 * 442 * @param sourceStr 443 * quoted-printable string to convert into its original form 444 * @return original string 445 * @throws DecoderException 446 * Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported. 447 * @see #getCharset() 448 */ 449 @Override 450 public String decode(final String sourceStr) throws DecoderException { 451 return this.decode(sourceStr, getCharset()); 452 } 453 454 /** 455 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters 456 * are converted back to their original representation. 457 * 458 * @param sourceStr 459 * quoted-printable string to convert into its original form 460 * @param sourceCharset 461 * the original string Charset 462 * @return original string 463 * @throws DecoderException 464 * Thrown if quoted-printable decoding is unsuccessful 465 * @since 1.7 466 */ 467 public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException { 468 if (sourceStr == null) { 469 return null; 470 } 471 return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset); 472 } 473 474 /** 475 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters 476 * are converted back to their original representation. 477 * 478 * @param sourceStr 479 * quoted-printable string to convert into its original form 480 * @param sourceCharset 481 * the original string Charset 482 * @return original string 483 * @throws DecoderException 484 * Thrown if quoted-printable decoding is unsuccessful 485 * @throws UnsupportedEncodingException 486 * Thrown if Charset is not supported 487 */ 488 public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException { 489 if (sourceStr == null) { 490 return null; 491 } 492 return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset); 493 } 494 495 /** 496 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 497 * <p> 498 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 499 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 500 * RFC 1521 and is suitable for encoding binary data and unformatted text. 501 * </p> 502 * 503 * @param bytes 504 * array of bytes to be encoded 505 * @return array of bytes containing quoted-printable data 506 */ 507 @Override 508 public byte[] encode(final byte[] bytes) { 509 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict); 510 } 511 512 /** 513 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped. 514 * 515 * @param obj 516 * string to convert to a quoted-printable form 517 * @return quoted-printable object 518 * @throws EncoderException 519 * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is 520 * unsuccessful 521 */ 522 @Override 523 public Object encode(final Object obj) throws EncoderException { 524 if (obj == null) { 525 return null; 526 } 527 if (obj instanceof byte[]) { 528 return encode((byte[]) obj); 529 } 530 if (obj instanceof String) { 531 return encode((String) obj); 532 } 533 throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded"); 534 } 535 536 /** 537 * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped. 538 * <p> 539 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 540 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 541 * RFC 1521 and is suitable for encoding binary data and unformatted text. 542 * </p> 543 * 544 * @param sourceStr 545 * string to convert to quoted-printable form 546 * @return quoted-printable string 547 * @throws EncoderException 548 * Thrown if quoted-printable encoding is unsuccessful 549 * 550 * @see #getCharset() 551 */ 552 @Override 553 public String encode(final String sourceStr) throws EncoderException { 554 return encode(sourceStr, getCharset()); 555 } 556 557 /** 558 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped. 559 * <p> 560 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 561 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 562 * RFC 1521 and is suitable for encoding binary data and unformatted text. 563 * </p> 564 * 565 * @param sourceStr 566 * string to convert to quoted-printable form 567 * @param sourceCharset 568 * the Charset for sourceStr 569 * @return quoted-printable string 570 * @since 1.7 571 */ 572 public String encode(final String sourceStr, final Charset sourceCharset) { 573 if (sourceStr == null) { 574 return null; 575 } 576 return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset))); 577 } 578 579 /** 580 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped. 581 * <p> 582 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 583 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 584 * RFC 1521 and is suitable for encoding binary data and unformatted text. 585 * </p> 586 * 587 * @param sourceStr 588 * string to convert to quoted-printable form 589 * @param sourceCharset 590 * the Charset for sourceStr 591 * @return quoted-printable string 592 * @throws UnsupportedEncodingException 593 * Thrown if the Charset is not supported 594 */ 595 public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException { 596 if (sourceStr == null) { 597 return null; 598 } 599 return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset))); 600 } 601 602 /** 603 * Gets the default Charset name used for string decoding and encoding. 604 * 605 * @return the default Charset name 606 * @since 1.7 607 */ 608 public Charset getCharset() { 609 return this.charset; 610 } 611 612 /** 613 * Gets the default Charset name used for string decoding and encoding. 614 * 615 * @return the default Charset name 616 */ 617 public String getDefaultCharset() { 618 return this.charset.name(); 619 } 620}