/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.net;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.BitSet;

import org.apache.commons.codec.BinaryDecoder;
import org.apache.commons.codec.BinaryEncoder;
import org.apache.commons.codec.Charsets;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringDecoder;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.binary.StringUtils;

/**
 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
 * <p>
 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
 * to ensure the integrity of the data should the message pass through a character-translating, and/or line-wrapping
 * gateway.
 * <p>
 * Note:
 * <p>
 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
 * quoted-printable spec:
 * <ul>
 *   <li>{@code strict=false}: only rules #1 and #2 are implemented
 *   <li>{@code strict=true}: all rules #1 through #5 are implemented
 * </ul>
 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
 * Q codec. The strict mode has been added in 1.10.
 * <p>
 * This class is immutable and thread-safe.
 *
 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
 *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
 *
 * @since 1.3
 */
public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
    /**
     * The default Charset used for string decoding and encoding.
     */
    private final Charset charset;

    /**
     * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
     */
    private final boolean strict;

    /**
     * BitSet of printable characters as defined in RFC 1521.
     */
    private static final BitSet PRINTABLE_CHARS = new BitSet(256);

    private static final byte ESCAPE_CHAR = '=';

    private static final byte TAB = 9;

    private static final byte SPACE = 32;

    private static final byte CR = 13;

    private static final byte LF = 10;

    /**
     * Safe line length for quoted printable encoded text.
     */
    private static final int SAFE_LENGTH = 73;

    /**
     * Number of trailing octets that the strict encoder handles separately from its main loop. Inputs shorter than
     * this cannot be processed by that tail handling and take a dedicated short-input path instead.
     */
    private static final int STRICT_TAIL_LENGTH = 3;

    // Static initializer for printable chars collection
    static {
        // printable ASCII characters '!' (33) through '~' (126), leaving out '=' (61), the escape character
        for (int i = 33; i <= 60; i++) {
            PRINTABLE_CHARS.set(i);
        }
        for (int i = 62; i <= 126; i++) {
            PRINTABLE_CHARS.set(i);
        }
        PRINTABLE_CHARS.set(TAB);
        PRINTABLE_CHARS.set(SPACE);
    }

    /**
     * Default constructor, assumes default Charset of {@link Charsets#UTF_8}
     */
    public QuotedPrintableCodec() {
        this(Charsets.UTF_8, false);
    }

    /**
     * Constructor which allows for the selection of the strict mode.
     *
     * @param strict
     *            if {@code true}, soft line breaks will be used
     * @since 1.10
     */
    public QuotedPrintableCodec(final boolean strict) {
        this(Charsets.UTF_8, strict);
    }

    /**
     * Constructor which allows for the selection of a default Charset.
     *
     * @param charset
     *            the default string Charset to use.
     * @since 1.7
     */
    public QuotedPrintableCodec(final Charset charset) {
        this(charset, false);
    }

    /**
     * Constructor which allows for the selection of a default Charset and strict mode.
     *
     * @param charset
     *            the default string Charset to use.
     * @param strict
     *            if {@code true}, soft line breaks will be used
     * @since 1.10
     */
    public QuotedPrintableCodec(final Charset charset, final boolean strict) {
        this.charset = charset;
        this.strict = strict;
    }

    /**
     * Constructor which allows for the selection of a default Charset.
     *
     * @param charsetName
     *            the default string Charset to use.
     * @throws UnsupportedCharsetException
     *             If no support for the named Charset is available
     *             in this instance of the Java virtual machine
     * @throws IllegalArgumentException
     *             If the given charsetName is null
     * @throws IllegalCharsetNameException
     *             If the given Charset name is illegal
     *
     * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
     */
    public QuotedPrintableCodec(final String charsetName)
            throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
        this(Charset.forName(charsetName), false);
    }

    /**
     * Encodes byte into its quoted-printable representation: the escape character followed by two hex digits.
     *
     * @param b
     *            byte to encode
     * @param buffer
     *            the buffer to write to
     * @return The number of bytes written to the <code>buffer</code>
     */
    private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
        buffer.write(ESCAPE_CHAR);
        final char hex1 = Utils.hexDigit(b >> 4);
        final char hex2 = Utils.hexDigit(b);
        buffer.write(hex1);
        buffer.write(hex2);
        return 3;
    }

    /**
     * Return the byte at position <code>index</code> of the byte array and
     * make sure it is unsigned.
     *
     * @param index
     *            position in the array
     * @param bytes
     *            the byte array
     * @return the unsigned octet at position <code>index</code> from the array
     */
    private static int getUnsignedOctet(final int index, final byte[] bytes) {
        int b = bytes[index];
        if (b < 0) {
            b = 256 + b;
        }
        return b;
    }

    /**
     * Write a byte to the buffer, either verbatim or as an escape sequence.
     *
     * @param b
     *            byte to write
     * @param encode
     *            indicates whether the octet shall be encoded
     * @param buffer
     *            the buffer to write to
     * @return the number of bytes that have been written to the buffer
     */
    private static int encodeByte(final int b, final boolean encode,
                                  final ByteArrayOutputStream buffer) {
        if (encode) {
            return encodeQuotedPrintable(b, buffer);
        }
        buffer.write(b);
        return 1;
    }

    /**
     * Checks whether the given byte is whitespace.
     *
     * @param b
     *            byte to be checked
     * @return <code>true</code> if the byte is either a space or tab character
     */
    private static boolean isWhitespace(final int b) {
        return b == SPACE || b == TAB;
    }

    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     *
     * @param printable
     *            bitset of characters deemed quoted-printable
     * @param bytes
     *            array of bytes to be encoded
     * @return array of bytes containing quoted-printable data
     */
    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
        return encodeQuotedPrintable(printable, bytes, false);
    }

    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     * <p>
     * In strict mode, inputs of fewer than three octets (including the empty array) are encoded without any soft
     * line break; only a trailing space or tab is additionally escaped (rule #3).
     *
     * @param printable
     *            bitset of characters deemed quoted-printable; if {@code null} the default set of printable
     *            characters is used
     * @param bytes
     *            array of bytes to be encoded
     * @param strict
     *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
     * @return array of bytes containing quoted-printable data, or {@code null} if {@code bytes} is {@code null}
     * @since 1.10
     */
    public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
        if (bytes == null) {
            return null;
        }
        if (printable == null) {
            printable = PRINTABLE_CHARS;
        }
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();

        if (strict) {
            if (bytes.length < STRICT_TAIL_LENGTH) {
                // The tail handling below indexes bytes[bytes.length - 3] and would fail on such short input.
                // A line this short never needs a soft line break, so only rule #3 (trailing whitespace)
                // has to be applied on top of the plain escaping.
                final int last = bytes.length - 1;
                for (int i = 0; i <= last; i++) {
                    final int b = getUnsignedOctet(i, bytes);
                    encodeByte(b, !printable.get(b) || (i == last && isWhitespace(b)), buffer);
                }
                return buffer.toByteArray();
            }

            int pos = 1;
            // encode up to buffer.length - 3, the last three octets will be treated
            // separately for simplification of note #3
            for (int i = 0; i < bytes.length - 3; i++) {
                final int b = getUnsignedOctet(i, bytes);
                if (pos < SAFE_LENGTH) {
                    // up to this length it is safe to add any byte, encoded or not
                    pos += encodeByte(b, !printable.get(b), buffer);
                } else {
                    // rule #3: whitespace at the end of a line *must* be encoded
                    encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);

                    // rule #5: soft line break
                    buffer.write(ESCAPE_CHAR);
                    buffer.write(CR);
                    buffer.write(LF);
                    pos = 1;
                }
            }

            // rule #3: whitespace at the end of a line *must* be encoded
            // if we would do a soft break line after this octet, encode whitespace
            int b = getUnsignedOctet(bytes.length - 3, bytes);
            boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
            pos += encodeByte(b, encode, buffer);

            // note #3: '=' *must not* be the ultimate or penultimate character
            // simplification: if < 6 bytes left, do a soft line break as we may need
            // exactly 6 bytes space for the last 2 bytes
            if (pos > SAFE_LENGTH - 2) {
                buffer.write(ESCAPE_CHAR);
                buffer.write(CR);
                buffer.write(LF);
            }
            for (int i = bytes.length - 2; i < bytes.length; i++) {
                b = getUnsignedOctet(i, bytes);
                // rule #3: trailing whitespace shall be encoded
                encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
                encodeByte(b, encode, buffer);
            }
        } else {
            for (final byte c : bytes) {
                int b = c;
                if (b < 0) {
                    b = 256 + b;
                }
                if (printable.get(b)) {
                    buffer.write(b);
                } else {
                    encodeQuotedPrintable(b, buffer);
                }
            }
        }
        return buffer.toByteArray();
    }

    /**
     * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
     * back to their original representation.
     * <p>
     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
     * defined in RFC 1521.
     *
     * @param bytes
     *            array of quoted-printable characters
     * @return array of original bytes, or {@code null} if {@code bytes} is {@code null}
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     */
    public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
        if (bytes == null) {
            return null;
        }
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        for (int i = 0; i < bytes.length; i++) {
            final int b = bytes[i];
            if (b == ESCAPE_CHAR) {
                try {
                    // if the next octet is a CR we have found a soft line break;
                    // the LF that follows is dropped by the CR/LF filter below
                    if (bytes[++i] == CR) {
                        continue;
                    }
                    final int u = Utils.digit16(bytes[i]);
                    final int l = Utils.digit16(bytes[++i]);
                    buffer.write((char) ((u << 4) + l));
                } catch (final ArrayIndexOutOfBoundsException e) {
                    // the escape sequence was cut off by the end of the input
                    throw new DecoderException("Invalid quoted-printable encoding", e);
                }
            } else if (b != CR && b != LF) {
                // every other octet is appended except for CR & LF
                buffer.write(b);
            }
        }
        return buffer.toByteArray();
    }

    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     *
     * @param bytes
     *            array of bytes to be encoded
     * @return array of bytes containing quoted-printable data
     */
    @Override
    public byte[] encode(final byte[] bytes) {
        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
    }

    /**
     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
     * back to their original representation.
     * <p>
     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
     * defined in RFC 1521.
     *
     * @param bytes
     *            array of quoted-printable characters
     * @return array of original bytes
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     */
    @Override
    public byte[] decode(final byte[] bytes) throws DecoderException {
        return decodeQuotedPrintable(bytes);
    }

    /**
     * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     *
     * @param sourceStr
     *            string to convert to quoted-printable form
     * @return quoted-printable string
     * @throws EncoderException
     *             Thrown if quoted-printable encoding is unsuccessful
     *
     * @see #getCharset()
     */
    @Override
    public String encode(final String sourceStr) throws EncoderException {
        return this.encode(sourceStr, getCharset());
    }

    /**
     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
     * are converted back to their original representation.
     *
     * @param sourceStr
     *            quoted-printable string to convert into its original form
     * @param sourceCharset
     *            the original string Charset
     * @return original string
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     * @since 1.7
     */
    public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
        if (sourceStr == null) {
            return null;
        }
        return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
    }

    /**
     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
     * are converted back to their original representation.
     *
     * @param sourceStr
     *            quoted-printable string to convert into its original form
     * @param sourceCharset
     *            the original string Charset
     * @return original string
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful
     * @throws UnsupportedEncodingException
     *             Thrown if Charset is not supported
     */
    public String decode(final String sourceStr, final String sourceCharset)
            throws DecoderException, UnsupportedEncodingException {
        if (sourceStr == null) {
            return null;
        }
        return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
    }

    /**
     * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
     * converted back to their original representation.
     *
     * @param sourceStr
     *            quoted-printable string to convert into its original form
     * @return original string
     * @throws DecoderException
     *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
     * @see #getCharset()
     */
    @Override
    public String decode(final String sourceStr) throws DecoderException {
        return this.decode(sourceStr, this.getCharset());
    }

    /**
     * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
     *
     * @param obj
     *            string to convert to a quoted-printable form
     * @return quoted-printable object
     * @throws EncoderException
     *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
     *             unsuccessful
     */
    @Override
    public Object encode(final Object obj) throws EncoderException {
        if (obj == null) {
            return null;
        } else if (obj instanceof byte[]) {
            return encode((byte[]) obj);
        } else if (obj instanceof String) {
            return encode((String) obj);
        } else {
            throw new EncoderException("Objects of type " +
                  obj.getClass().getName() +
                  " cannot be quoted-printable encoded");
        }
    }

    /**
     * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
     * representation.
     *
     * @param obj
     *            quoted-printable object to convert into its original form
     * @return original object
     * @throws DecoderException
     *             Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure
     *             condition is encountered during the decode process.
     */
    @Override
    public Object decode(final Object obj) throws DecoderException {
        if (obj == null) {
            return null;
        } else if (obj instanceof byte[]) {
            return decode((byte[]) obj);
        } else if (obj instanceof String) {
            return decode((String) obj);
        } else {
            throw new DecoderException("Objects of type " +
                  obj.getClass().getName() +
                  " cannot be quoted-printable decoded");
        }
    }

    /**
     * Gets the default Charset used for string decoding and encoding.
     *
     * @return the default Charset
     * @since 1.7
     */
    public Charset getCharset() {
        return this.charset;
    }

    /**
     * Gets the default Charset name used for string decoding and encoding.
     *
     * @return the default Charset name
     */
    public String getDefaultCharset() {
        return this.charset.name();
    }

    /**
     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     *
     * @param sourceStr
     *            string to convert to quoted-printable form
     * @param sourceCharset
     *            the Charset for sourceStr
     * @return quoted-printable string
     * @since 1.7
     */
    public String encode(final String sourceStr, final Charset sourceCharset) {
        if (sourceStr == null) {
            return null;
        }
        return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
    }

    /**
     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
     * RFC 1521 and is suitable for encoding binary data and unformatted text.
     *
     * @param sourceStr
     *            string to convert to quoted-printable form
     * @param sourceCharset
     *            the Charset for sourceStr
     * @return quoted-printable string
     * @throws UnsupportedEncodingException
     *             Thrown if the Charset is not supported
     */
    public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
        if (sourceStr == null) {
            return null;
        }
        return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
    }
}