001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.net; 019 020import java.io.ByteArrayOutputStream; 021import java.io.UnsupportedEncodingException; 022import java.nio.charset.Charset; 023import java.nio.charset.IllegalCharsetNameException; 024import java.nio.charset.UnsupportedCharsetException; 025import java.util.BitSet; 026 027import org.apache.commons.codec.BinaryDecoder; 028import org.apache.commons.codec.BinaryEncoder; 029import org.apache.commons.codec.Charsets; 030import org.apache.commons.codec.DecoderException; 031import org.apache.commons.codec.EncoderException; 032import org.apache.commons.codec.StringDecoder; 033import org.apache.commons.codec.StringEncoder; 034import org.apache.commons.codec.binary.StringUtils; 035 036/** 037 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>. 038 * <p> 039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to 040 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are 041 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the 042 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable 043 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping 044 * gateway. 045 * <p> 046 * Note: 047 * <p> 048 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the 049 * quoted-printable spec: 050 * <ul> 051 * <li>{@code strict=false}: only rules #1 and #2 are implemented 052 * <li>{@code strict=true}: all rules #1 through #5 are implemented 053 * </ul> 054 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used 055 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance 056 * Q codec. The strict mode has been added in 1.10. 057 * <p> 058 * This class is immutable and thread-safe. 059 * 060 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: 061 * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a> 062 * 063 * @since 1.3 064 * @version $Id: QuotedPrintableCodec.java 1788792 2017-03-26 23:57:00Z sebb $ 065 */ 066public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder { 067 /** 068 * The default charset used for string decoding and encoding. 069 */ 070 private final Charset charset; 071 072 /** 073 * Indicates whether soft line breaks shall be used during encoding (rule #3-5). 074 */ 075 private final boolean strict; 076 077 /** 078 * BitSet of printable characters as defined in RFC 1521. 079 */ 080 private static final BitSet PRINTABLE_CHARS = new BitSet(256); 081 082 private static final byte ESCAPE_CHAR = '='; 083 084 private static final byte TAB = 9; 085 086 private static final byte SPACE = 32; 087 088 private static final byte CR = 13; 089 090 private static final byte LF = 10; 091 092 /** 093 * Safe line length for quoted printable encoded text. 094 */ 095 private static final int SAFE_LENGTH = 73; 096 097 // Static initializer for printable chars collection 098 static { 099 // alpha characters 100 for (int i = 33; i <= 60; i++) { 101 PRINTABLE_CHARS.set(i); 102 } 103 for (int i = 62; i <= 126; i++) { 104 PRINTABLE_CHARS.set(i); 105 } 106 PRINTABLE_CHARS.set(TAB); 107 PRINTABLE_CHARS.set(SPACE); 108 } 109 110 /** 111 * Default constructor, assumes default charset of {@link Charsets#UTF_8} 112 */ 113 public QuotedPrintableCodec() { 114 this(Charsets.UTF_8, false); 115 } 116 117 /** 118 * Constructor which allows for the selection of the strict mode. 119 * 120 * @param strict 121 * if {@code true}, soft line breaks will be used 122 * @since 1.10 123 */ 124 public QuotedPrintableCodec(final boolean strict) { 125 this(Charsets.UTF_8, strict); 126 } 127 128 /** 129 * Constructor which allows for the selection of a default charset. 130 * 131 * @param charset 132 * the default string charset to use. 133 * @since 1.7 134 */ 135 public QuotedPrintableCodec(final Charset charset) { 136 this(charset, false); 137 } 138 139 /** 140 * Constructor which allows for the selection of a default charset and strict mode. 141 * 142 * @param charset 143 * the default string charset to use. 144 * @param strict 145 * if {@code true}, soft line breaks will be used 146 * @since 1.10 147 */ 148 public QuotedPrintableCodec(final Charset charset, final boolean strict) { 149 this.charset = charset; 150 this.strict = strict; 151 } 152 153 /** 154 * Constructor which allows for the selection of a default charset. 155 * 156 * @param charsetName 157 * the default string charset to use. 158 * @throws UnsupportedCharsetException 159 * If no support for the named charset is available 160 * in this instance of the Java virtual machine 161 * @throws IllegalArgumentException 162 * If the given charsetName is null 163 * @throws IllegalCharsetNameException 164 * If the given charset name is illegal 165 * 166 * @since 1.7 throws UnsupportedCharsetException if the named charset is unavailable 167 */ 168 public QuotedPrintableCodec(final String charsetName) 169 throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException { 170 this(Charset.forName(charsetName), false); 171 } 172 173 /** 174 * Encodes byte into its quoted-printable representation. 175 * 176 * @param b 177 * byte to encode 178 * @param buffer 179 * the buffer to write to 180 * @return The number of bytes written to the <code>buffer</code> 181 */ 182 private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) { 183 buffer.write(ESCAPE_CHAR); 184 final char hex1 = Utils.hexDigit(b >> 4); 185 final char hex2 = Utils.hexDigit(b); 186 buffer.write(hex1); 187 buffer.write(hex2); 188 return 3; 189 } 190 191 /** 192 * Return the byte at position <code>index</code> of the byte array and 193 * make sure it is unsigned. 194 * 195 * @param index 196 * position in the array 197 * @param bytes 198 * the byte array 199 * @return the unsigned octet at position <code>index</code> from the array 200 */ 201 private static int getUnsignedOctet(final int index, final byte[] bytes) { 202 int b = bytes[index]; 203 if (b < 0) { 204 b = 256 + b; 205 } 206 return b; 207 } 208 209 /** 210 * Write a byte to the buffer. 211 * 212 * @param b 213 * byte to write 214 * @param encode 215 * indicates whether the octet shall be encoded 216 * @param buffer 217 * the buffer to write to 218 * @return the number of bytes that have been written to the buffer 219 */ 220 private static int encodeByte(final int b, final boolean encode, 221 final ByteArrayOutputStream buffer) { 222 if (encode) { 223 return encodeQuotedPrintable(b, buffer); 224 } 225 buffer.write(b); 226 return 1; 227 } 228 229 /** 230 * Checks whether the given byte is whitespace. 231 * 232 * @param b 233 * byte to be checked 234 * @return <code>true</code> if the byte is either a space or tab character 235 */ 236 private static boolean isWhitespace(final int b) { 237 return b == SPACE || b == TAB; 238 } 239 240 /** 241 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 242 * <p> 243 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 244 * RFC 1521 and is suitable for encoding binary data and unformatted text. 245 * 246 * @param printable 247 * bitset of characters deemed quoted-printable 248 * @param bytes 249 * array of bytes to be encoded 250 * @return array of bytes containing quoted-printable data 251 */ 252 public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) { 253 return encodeQuotedPrintable(printable, bytes, false); 254 } 255 256 /** 257 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 258 * <p> 259 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 260 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 261 * RFC 1521 and is suitable for encoding binary data and unformatted text. 262 * 263 * @param printable 264 * bitset of characters deemed quoted-printable 265 * @param bytes 266 * array of bytes to be encoded 267 * @param strict 268 * if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2 269 * @return array of bytes containing quoted-printable data 270 * @since 1.10 271 */ 272 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) { 273 if (bytes == null) { 274 return null; 275 } 276 if (printable == null) { 277 printable = PRINTABLE_CHARS; 278 } 279 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 280 281 if (strict) { 282 int pos = 1; 283 // encode up to buffer.length - 3, the last three octets will be treated 284 // separately for simplification of note #3 285 for (int i = 0; i < bytes.length - 3; i++) { 286 final int b = getUnsignedOctet(i, bytes); 287 if (pos < SAFE_LENGTH) { 288 // up to this length it is safe to add any byte, encoded or not 289 pos += encodeByte(b, !printable.get(b), buffer); 290 } else { 291 // rule #3: whitespace at the end of a line *must* be encoded 292 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer); 293 294 // rule #5: soft line break 295 buffer.write(ESCAPE_CHAR); 296 buffer.write(CR); 297 buffer.write(LF); 298 pos = 1; 299 } 300 } 301 302 // rule #3: whitespace at the end of a line *must* be encoded 303 // if we would do a soft break line after this octet, encode whitespace 304 int b = getUnsignedOctet(bytes.length - 3, bytes); 305 boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5); 306 pos += encodeByte(b, encode, buffer); 307 308 // note #3: '=' *must not* be the ultimate or penultimate character 309 // simplification: if < 6 bytes left, do a soft line break as we may need 310 // exactly 6 bytes space for the last 2 bytes 311 if (pos > SAFE_LENGTH - 2) { 312 buffer.write(ESCAPE_CHAR); 313 buffer.write(CR); 314 buffer.write(LF); 315 } 316 for (int i = bytes.length - 2; i < bytes.length; i++) { 317 b = getUnsignedOctet(i, bytes); 318 // rule #3: trailing whitespace shall be encoded 319 encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b)); 320 encodeByte(b, encode, buffer); 321 } 322 } else { 323 for (final byte c : bytes) { 324 int b = c; 325 if (b < 0) { 326 b = 256 + b; 327 } 328 if (printable.get(b)) { 329 buffer.write(b); 330 } else { 331 encodeQuotedPrintable(b, buffer); 332 } 333 } 334 } 335 return buffer.toByteArray(); 336 } 337 338 /** 339 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted 340 * back to their original representation. 341 * <p> 342 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as 343 * defined in RFC 1521. 344 * 345 * @param bytes 346 * array of quoted-printable characters 347 * @return array of original bytes 348 * @throws DecoderException 349 * Thrown if quoted-printable decoding is unsuccessful 350 */ 351 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException { 352 if (bytes == null) { 353 return null; 354 } 355 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 356 for (int i = 0; i < bytes.length; i++) { 357 final int b = bytes[i]; 358 if (b == ESCAPE_CHAR) { 359 try { 360 // if the next octet is a CR we have found a soft line break 361 if (bytes[++i] == CR) { 362 continue; 363 } 364 final int u = Utils.digit16(bytes[i]); 365 final int l = Utils.digit16(bytes[++i]); 366 buffer.write((char) ((u << 4) + l)); 367 } catch (final ArrayIndexOutOfBoundsException e) { 368 throw new DecoderException("Invalid quoted-printable encoding", e); 369 } 370 } else if (b != CR && b != LF) { 371 // every other octet is appended except for CR & LF 372 buffer.write(b); 373 } 374 } 375 return buffer.toByteArray(); 376 } 377 378 /** 379 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 380 * <p> 381 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 382 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 383 * RFC 1521 and is suitable for encoding binary data and unformatted text. 384 * 385 * @param bytes 386 * array of bytes to be encoded 387 * @return array of bytes containing quoted-printable data 388 */ 389 @Override 390 public byte[] encode(final byte[] bytes) { 391 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict); 392 } 393 394 /** 395 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted 396 * back to their original representation. 397 * <p> 398 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as 399 * defined in RFC 1521. 400 * 401 * @param bytes 402 * array of quoted-printable characters 403 * @return array of original bytes 404 * @throws DecoderException 405 * Thrown if quoted-printable decoding is unsuccessful 406 */ 407 @Override 408 public byte[] decode(final byte[] bytes) throws DecoderException { 409 return decodeQuotedPrintable(bytes); 410 } 411 412 /** 413 * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped. 414 * <p> 415 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 416 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 417 * RFC 1521 and is suitable for encoding binary data and unformatted text. 418 * 419 * @param str 420 * string to convert to quoted-printable form 421 * @return quoted-printable string 422 * @throws EncoderException 423 * Thrown if quoted-printable encoding is unsuccessful 424 * 425 * @see #getCharset() 426 */ 427 @Override 428 public String encode(final String str) throws EncoderException { 429 return this.encode(str, getCharset()); 430 } 431 432 /** 433 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters 434 * are converted back to their original representation. 435 * 436 * @param str 437 * quoted-printable string to convert into its original form 438 * @param charset 439 * the original string charset 440 * @return original string 441 * @throws DecoderException 442 * Thrown if quoted-printable decoding is unsuccessful 443 * @since 1.7 444 */ 445 public String decode(final String str, final Charset charset) throws DecoderException { 446 if (str == null) { 447 return null; 448 } 449 return new String(this.decode(StringUtils.getBytesUsAscii(str)), charset); 450 } 451 452 /** 453 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters 454 * are converted back to their original representation. 455 * 456 * @param str 457 * quoted-printable string to convert into its original form 458 * @param charset 459 * the original string charset 460 * @return original string 461 * @throws DecoderException 462 * Thrown if quoted-printable decoding is unsuccessful 463 * @throws UnsupportedEncodingException 464 * Thrown if charset is not supported 465 */ 466 public String decode(final String str, final String charset) throws DecoderException, UnsupportedEncodingException { 467 if (str == null) { 468 return null; 469 } 470 return new String(decode(StringUtils.getBytesUsAscii(str)), charset); 471 } 472 473 /** 474 * Decodes a quoted-printable string into its original form using the default string charset. Escaped characters are 475 * converted back to their original representation. 476 * 477 * @param str 478 * quoted-printable string to convert into its original form 479 * @return original string 480 * @throws DecoderException 481 * Thrown if quoted-printable decoding is unsuccessful. Thrown if charset is not supported. 482 * @see #getCharset() 483 */ 484 @Override 485 public String decode(final String str) throws DecoderException { 486 return this.decode(str, this.getCharset()); 487 } 488 489 /** 490 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped. 491 * 492 * @param obj 493 * string to convert to a quoted-printable form 494 * @return quoted-printable object 495 * @throws EncoderException 496 * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is 497 * unsuccessful 498 */ 499 @Override 500 public Object encode(final Object obj) throws EncoderException { 501 if (obj == null) { 502 return null; 503 } else if (obj instanceof byte[]) { 504 return encode((byte[]) obj); 505 } else if (obj instanceof String) { 506 return encode((String) obj); 507 } else { 508 throw new EncoderException("Objects of type " + 509 obj.getClass().getName() + 510 " cannot be quoted-printable encoded"); 511 } 512 } 513 514 /** 515 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original 516 * representation. 517 * 518 * @param obj 519 * quoted-printable object to convert into its original form 520 * @return original object 521 * @throws DecoderException 522 * Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure 523 * condition is encountered during the decode process. 524 */ 525 @Override 526 public Object decode(final Object obj) throws DecoderException { 527 if (obj == null) { 528 return null; 529 } else if (obj instanceof byte[]) { 530 return decode((byte[]) obj); 531 } else if (obj instanceof String) { 532 return decode((String) obj); 533 } else { 534 throw new DecoderException("Objects of type " + 535 obj.getClass().getName() + 536 " cannot be quoted-printable decoded"); 537 } 538 } 539 540 /** 541 * Gets the default charset name used for string decoding and encoding. 542 * 543 * @return the default charset name 544 * @since 1.7 545 */ 546 public Charset getCharset() { 547 return this.charset; 548 } 549 550 /** 551 * Gets the default charset name used for string decoding and encoding. 552 * 553 * @return the default charset name 554 */ 555 public String getDefaultCharset() { 556 return this.charset.name(); 557 } 558 559 /** 560 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped. 561 * <p> 562 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 563 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 564 * RFC 1521 and is suitable for encoding binary data and unformatted text. 565 * 566 * @param str 567 * string to convert to quoted-printable form 568 * @param charset 569 * the charset for str 570 * @return quoted-printable string 571 * @since 1.7 572 */ 573 public String encode(final String str, final Charset charset) { 574 if (str == null) { 575 return null; 576 } 577 return StringUtils.newStringUsAscii(this.encode(str.getBytes(charset))); 578 } 579 580 /** 581 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped. 582 * <p> 583 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 584 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 585 * RFC 1521 and is suitable for encoding binary data and unformatted text. 586 * 587 * @param str 588 * string to convert to quoted-printable form 589 * @param charset 590 * the charset for str 591 * @return quoted-printable string 592 * @throws UnsupportedEncodingException 593 * Thrown if the charset is not supported 594 */ 595 public String encode(final String str, final String charset) throws UnsupportedEncodingException { 596 if (str == null) { 597 return null; 598 } 599 return StringUtils.newStringUsAscii(encode(str.getBytes(charset))); 600 } 601}