001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.net; 019 020import java.io.ByteArrayOutputStream; 021import java.io.UnsupportedEncodingException; 022import java.nio.charset.Charset; 023import java.nio.charset.IllegalCharsetNameException; 024import java.nio.charset.UnsupportedCharsetException; 025import java.util.BitSet; 026 027import org.apache.commons.codec.BinaryDecoder; 028import org.apache.commons.codec.BinaryEncoder; 029import org.apache.commons.codec.Charsets; 030import org.apache.commons.codec.DecoderException; 031import org.apache.commons.codec.EncoderException; 032import org.apache.commons.codec.StringDecoder; 033import org.apache.commons.codec.StringEncoder; 034import org.apache.commons.codec.binary.StringUtils; 035 036/** 037 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>. 038 * <p> 039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to 040 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are 041 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the 042 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable 043 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping 044 * gateway. 045 * <p> 046 * Note: 047 * <p> 048 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the 049 * quoted-printable spec: 050 * <ul> 051 * <li>{@code strict=false}: only rules #1 and #2 are implemented 052 * <li>{@code strict=true}: all rules #1 through #5 are implemented 053 * </ul> 054 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used 055 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance 056 * Q codec. The strict mode has been added in 1.10. 057 * <p> 058 * This class is immutable and thread-safe. 059 * 060 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: 061 * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a> 062 * 063 * @since 1.3 064 * @version $Id: QuotedPrintableCodec.html 928559 2014-11-10 02:53:54Z ggregory $ 065 */ 066public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder { 067 /** 068 * The default charset used for string decoding and encoding. 069 */ 070 private final Charset charset; 071 072 /** 073 * Indicates whether soft line breaks shall be used during encoding (rule #3-5). 074 */ 075 private final boolean strict; 076 077 /** 078 * BitSet of printable characters as defined in RFC 1521. 079 */ 080 private static final BitSet PRINTABLE_CHARS = new BitSet(256); 081 082 private static final byte ESCAPE_CHAR = '='; 083 084 private static final byte TAB = 9; 085 086 private static final byte SPACE = 32; 087 088 private static final byte CR = 13; 089 090 private static final byte LF = 10; 091 092 /** 093 * Safe line length for quoted printable encoded text. 094 */ 095 private static final int SAFE_LENGTH = 73; 096 097 // Static initializer for printable chars collection 098 static { 099 // alpha characters 100 for (int i = 33; i <= 60; i++) { 101 PRINTABLE_CHARS.set(i); 102 } 103 for (int i = 62; i <= 126; i++) { 104 PRINTABLE_CHARS.set(i); 105 } 106 PRINTABLE_CHARS.set(TAB); 107 PRINTABLE_CHARS.set(SPACE); 108 } 109 110 /** 111 * Default constructor, assumes default charset of {@link Charsets#UTF_8} 112 */ 113 public QuotedPrintableCodec() { 114 this(Charsets.UTF_8, false); 115 } 116 117 /** 118 * Constructor which allows for the selection of the strict mode. 119 * 120 * @param strict 121 * if {@code true}, soft line breaks will be used 122 * @since 1.10 123 */ 124 public QuotedPrintableCodec(final boolean strict) { 125 this(Charsets.UTF_8, strict); 126 } 127 128 /** 129 * Constructor which allows for the selection of a default charset. 130 * 131 * @param charset 132 * the default string charset to use. 133 * @since 1.7 134 */ 135 public QuotedPrintableCodec(final Charset charset) { 136 this(charset, false); 137 } 138 139 /** 140 * Constructor which allows for the selection of a default charset and strict mode. 141 * 142 * @param charset 143 * the default string charset to use. 144 * @param strict 145 * if {@code true}, soft line breaks will be used 146 * @since 1.10 147 */ 148 public QuotedPrintableCodec(final Charset charset, final boolean strict) { 149 this.charset = charset; 150 this.strict = strict; 151 } 152 153 /** 154 * Constructor which allows for the selection of a default charset. 155 * 156 * @param charsetName 157 * the default string charset to use. 158 * @throws UnsupportedCharsetException 159 * If no support for the named charset is available 160 * in this instance of the Java virtual machine 161 * @throws IllegalArgumentException 162 * If the given charsetName is null 163 * @throws IllegalCharsetNameException 164 * If the given charset name is illegal 165 * 166 * @since 1.7 throws UnsupportedCharsetException if the named charset is unavailable 167 */ 168 public QuotedPrintableCodec(final String charsetName) 169 throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException { 170 this(Charset.forName(charsetName), false); 171 } 172 173 /** 174 * Encodes byte into its quoted-printable representation. 175 * 176 * @param b 177 * byte to encode 178 * @param buffer 179 * the buffer to write to 180 * @return The number of bytes written to the <code>buffer</code> 181 */ 182 private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) { 183 buffer.write(ESCAPE_CHAR); 184 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)); 185 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16)); 186 buffer.write(hex1); 187 buffer.write(hex2); 188 return 3; 189 } 190 191 /** 192 * Return the byte at position <code>index</code> of the byte array and 193 * make sure it is unsigned. 194 * 195 * @param index 196 * position in the array 197 * @param bytes 198 * the byte array 199 * @return the unsigned octet at position <code>index</code> from the array 200 */ 201 private static int getUnsignedOctet(final int index, final byte[] bytes) { 202 int b = bytes[index]; 203 if (b < 0) { 204 b = 256 + b; 205 } 206 return b; 207 } 208 209 /** 210 * Write a byte to the buffer. 211 * 212 * @param b 213 * byte to write 214 * @param encode 215 * indicates whether the octet shall be encoded 216 * @param buffer 217 * the buffer to write to 218 * @return the number of bytes that have been written to the buffer 219 */ 220 private static int encodeByte(final int b, final boolean encode, 221 final ByteArrayOutputStream buffer) { 222 if (encode) { 223 return encodeQuotedPrintable(b, buffer); 224 } else { 225 buffer.write(b); 226 return 1; 227 } 228 } 229 230 /** 231 * Checks whether the given byte is whitespace. 232 * 233 * @param b 234 * byte to be checked 235 * @return <code>true</code> if the byte is either a space or tab character 236 */ 237 private static boolean isWhitespace(final int b) { 238 return b == SPACE || b == TAB; 239 } 240 241 /** 242 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 243 * <p> 244 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 245 * RFC 1521 and is suitable for encoding binary data and unformatted text. 246 * 247 * @param printable 248 * bitset of characters deemed quoted-printable 249 * @param bytes 250 * array of bytes to be encoded 251 * @return array of bytes containing quoted-printable data 252 */ 253 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes) { 254 return encodeQuotedPrintable(printable, bytes, false); 255 } 256 257 /** 258 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 259 * <p> 260 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 261 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 262 * RFC 1521 and is suitable for encoding binary data and unformatted text. 263 * 264 * @param printable 265 * bitset of characters deemed quoted-printable 266 * @param bytes 267 * array of bytes to be encoded 268 * @param strict 269 * if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2 270 * @return array of bytes containing quoted-printable data 271 * @since 1.10 272 */ 273 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, boolean strict) { 274 if (bytes == null) { 275 return null; 276 } 277 if (printable == null) { 278 printable = PRINTABLE_CHARS; 279 } 280 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 281 282 if (strict) { 283 int pos = 1; 284 // encode up to buffer.length - 3, the last three octets will be treated 285 // separately for simplification of note #3 286 for (int i = 0; i < bytes.length - 3; i++) { 287 int b = getUnsignedOctet(i, bytes); 288 if (pos < SAFE_LENGTH) { 289 // up to this length it is safe to add any byte, encoded or not 290 pos += encodeByte(b, !printable.get(b), buffer); 291 } else { 292 // rule #3: whitespace at the end of a line *must* be encoded 293 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer); 294 295 // rule #5: soft line break 296 buffer.write(ESCAPE_CHAR); 297 buffer.write(CR); 298 buffer.write(LF); 299 pos = 1; 300 } 301 } 302 303 // rule #3: whitespace at the end of a line *must* be encoded 304 // if we would do a soft break line after this octet, encode whitespace 305 int b = getUnsignedOctet(bytes.length - 3, bytes); 306 boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5); 307 pos += encodeByte(b, encode, buffer); 308 309 // note #3: '=' *must not* be the ultimate or penultimate character 310 // simplification: if < 6 bytes left, do a soft line break as we may need 311 // exactly 6 bytes space for the last 2 bytes 312 if (pos > SAFE_LENGTH - 2) { 313 buffer.write(ESCAPE_CHAR); 314 buffer.write(CR); 315 buffer.write(LF); 316 } 317 for (int i = bytes.length - 2; i < bytes.length; i++) { 318 b = getUnsignedOctet(i, bytes); 319 // rule #3: trailing whitespace shall be encoded 320 encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b)); 321 encodeByte(b, encode, buffer); 322 } 323 } else { 324 for (final byte c : bytes) { 325 int b = c; 326 if (b < 0) { 327 b = 256 + b; 328 } 329 if (printable.get(b)) { 330 buffer.write(b); 331 } else { 332 encodeQuotedPrintable(b, buffer); 333 } 334 } 335 } 336 return buffer.toByteArray(); 337 } 338 339 /** 340 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted 341 * back to their original representation. 342 * <p> 343 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as 344 * defined in RFC 1521. 345 * 346 * @param bytes 347 * array of quoted-printable characters 348 * @return array of original bytes 349 * @throws DecoderException 350 * Thrown if quoted-printable decoding is unsuccessful 351 */ 352 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException { 353 if (bytes == null) { 354 return null; 355 } 356 final ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 357 for (int i = 0; i < bytes.length; i++) { 358 final int b = bytes[i]; 359 if (b == ESCAPE_CHAR) { 360 try { 361 // if the next octet is a CR we have found a soft line break 362 if (bytes[++i] == CR) { 363 continue; 364 } 365 final int u = Utils.digit16(bytes[i]); 366 final int l = Utils.digit16(bytes[++i]); 367 buffer.write((char) ((u << 4) + l)); 368 } catch (final ArrayIndexOutOfBoundsException e) { 369 throw new DecoderException("Invalid quoted-printable encoding", e); 370 } 371 } else if (b != CR && b != LF) { 372 // every other octet is appended except for CR & LF 373 buffer.write(b); 374 } 375 } 376 return buffer.toByteArray(); 377 } 378 379 /** 380 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped. 381 * <p> 382 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 383 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 384 * RFC 1521 and is suitable for encoding binary data and unformatted text. 385 * 386 * @param bytes 387 * array of bytes to be encoded 388 * @return array of bytes containing quoted-printable data 389 */ 390 @Override 391 public byte[] encode(final byte[] bytes) { 392 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict); 393 } 394 395 /** 396 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted 397 * back to their original representation. 398 * <p> 399 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as 400 * defined in RFC 1521. 401 * 402 * @param bytes 403 * array of quoted-printable characters 404 * @return array of original bytes 405 * @throws DecoderException 406 * Thrown if quoted-printable decoding is unsuccessful 407 */ 408 @Override 409 public byte[] decode(final byte[] bytes) throws DecoderException { 410 return decodeQuotedPrintable(bytes); 411 } 412 413 /** 414 * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped. 415 * <p> 416 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 417 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 418 * RFC 1521 and is suitable for encoding binary data and unformatted text. 419 * 420 * @param str 421 * string to convert to quoted-printable form 422 * @return quoted-printable string 423 * @throws EncoderException 424 * Thrown if quoted-printable encoding is unsuccessful 425 * 426 * @see #getCharset() 427 */ 428 @Override 429 public String encode(final String str) throws EncoderException { 430 return this.encode(str, getCharset()); 431 } 432 433 /** 434 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters 435 * are converted back to their original representation. 436 * 437 * @param str 438 * quoted-printable string to convert into its original form 439 * @param charset 440 * the original string charset 441 * @return original string 442 * @throws DecoderException 443 * Thrown if quoted-printable decoding is unsuccessful 444 * @since 1.7 445 */ 446 public String decode(final String str, final Charset charset) throws DecoderException { 447 if (str == null) { 448 return null; 449 } 450 return new String(this.decode(StringUtils.getBytesUsAscii(str)), charset); 451 } 452 453 /** 454 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters 455 * are converted back to their original representation. 456 * 457 * @param str 458 * quoted-printable string to convert into its original form 459 * @param charset 460 * the original string charset 461 * @return original string 462 * @throws DecoderException 463 * Thrown if quoted-printable decoding is unsuccessful 464 * @throws UnsupportedEncodingException 465 * Thrown if charset is not supported 466 */ 467 public String decode(final String str, final String charset) throws DecoderException, UnsupportedEncodingException { 468 if (str == null) { 469 return null; 470 } 471 return new String(decode(StringUtils.getBytesUsAscii(str)), charset); 472 } 473 474 /** 475 * Decodes a quoted-printable string into its original form using the default string charset. Escaped characters are 476 * converted back to their original representation. 477 * 478 * @param str 479 * quoted-printable string to convert into its original form 480 * @return original string 481 * @throws DecoderException 482 * Thrown if quoted-printable decoding is unsuccessful. Thrown if charset is not supported. 483 * @see #getCharset() 484 */ 485 @Override 486 public String decode(final String str) throws DecoderException { 487 return this.decode(str, this.getCharset()); 488 } 489 490 /** 491 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped. 492 * 493 * @param obj 494 * string to convert to a quoted-printable form 495 * @return quoted-printable object 496 * @throws EncoderException 497 * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is 498 * unsuccessful 499 */ 500 @Override 501 public Object encode(final Object obj) throws EncoderException { 502 if (obj == null) { 503 return null; 504 } else if (obj instanceof byte[]) { 505 return encode((byte[]) obj); 506 } else if (obj instanceof String) { 507 return encode((String) obj); 508 } else { 509 throw new EncoderException("Objects of type " + 510 obj.getClass().getName() + 511 " cannot be quoted-printable encoded"); 512 } 513 } 514 515 /** 516 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original 517 * representation. 518 * 519 * @param obj 520 * quoted-printable object to convert into its original form 521 * @return original object 522 * @throws DecoderException 523 * Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure 524 * condition is encountered during the decode process. 525 */ 526 @Override 527 public Object decode(final Object obj) throws DecoderException { 528 if (obj == null) { 529 return null; 530 } else if (obj instanceof byte[]) { 531 return decode((byte[]) obj); 532 } else if (obj instanceof String) { 533 return decode((String) obj); 534 } else { 535 throw new DecoderException("Objects of type " + 536 obj.getClass().getName() + 537 " cannot be quoted-printable decoded"); 538 } 539 } 540 541 /** 542 * Gets the default charset name used for string decoding and encoding. 543 * 544 * @return the default charset name 545 * @since 1.7 546 */ 547 public Charset getCharset() { 548 return this.charset; 549 } 550 551 /** 552 * Gets the default charset name used for string decoding and encoding. 553 * 554 * @return the default charset name 555 */ 556 public String getDefaultCharset() { 557 return this.charset.name(); 558 } 559 560 /** 561 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped. 562 * <p> 563 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 564 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 565 * RFC 1521 and is suitable for encoding binary data and unformatted text. 566 * 567 * @param str 568 * string to convert to quoted-printable form 569 * @param charset 570 * the charset for str 571 * @return quoted-printable string 572 * @since 1.7 573 */ 574 public String encode(final String str, final Charset charset) { 575 if (str == null) { 576 return null; 577 } 578 return StringUtils.newStringUsAscii(this.encode(str.getBytes(charset))); 579 } 580 581 /** 582 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped. 583 * <p> 584 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset 585 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in 586 * RFC 1521 and is suitable for encoding binary data and unformatted text. 587 * 588 * @param str 589 * string to convert to quoted-printable form 590 * @param charset 591 * the charset for str 592 * @return quoted-printable string 593 * @throws UnsupportedEncodingException 594 * Thrown if the charset is not supported 595 */ 596 public String encode(final String str, final String charset) throws UnsupportedEncodingException { 597 if (str == null) { 598 return null; 599 } 600 return StringUtils.newStringUsAscii(encode(str.getBytes(charset))); 601 } 602}