1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.net;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.UnsupportedEncodingException;
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.StandardCharsets;
25 import java.nio.charset.UnsupportedCharsetException;
26 import java.util.BitSet;
27
28 import org.apache.commons.codec.BinaryDecoder;
29 import org.apache.commons.codec.BinaryEncoder;
30 import org.apache.commons.codec.DecoderException;
31 import org.apache.commons.codec.EncoderException;
32 import org.apache.commons.codec.StringDecoder;
33 import org.apache.commons.codec.StringEncoder;
34 import org.apache.commons.codec.binary.StringUtils;
35
36 /**
37 * Codec for the Quoted-Printable section of <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38 * <p>
39 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
40 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
41 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
42 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
 * to ensure the integrity of the data should the message pass through a character-translating, and/or line-wrapping
44 * gateway.
45 * </p>
46 * <p>
47 * Note:
48 * </p>
49 * <p>
50 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
51 * quoted-printable spec:
52 * </p>
53 * <ul>
54 * <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
55 * <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
56 * </ul>
57 * <p>
58 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
59 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
60 * Q codec. The strict mode has been added in 1.10.
61 * </p>
62 * <p>
63 * This class is immutable and thread-safe.
64 * </p>
65 *
66 * @see <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
67 * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
68 *
69 * @since 1.3
70 */
71 public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
72 /**
73 * BitSet of printable characters as defined in RFC 1521.
74 */
75 private static final BitSet PRINTABLE_CHARS = new BitSet(256);
76
77 private static final byte ESCAPE_CHAR = '=';
78
79 private static final byte TAB = 9;
80
81 private static final byte SPACE = 32;
82
83 private static final byte CR = 13;
84
85 private static final byte LF = 10;
86
87 /**
88 * Minimum length required for the byte arrays used by encodeQuotedPrintable method
89 */
90 private static final int MIN_BYTES = 3;
91
92 /**
93 * Safe line length for quoted printable encoded text.
94 */
95 private static final int SAFE_LENGTH = 73;
96
97 // Static initializer for printable chars collection
98 static {
99 // alpha characters
100 for (int i = 33; i <= 60; i++) {
101 PRINTABLE_CHARS.set(i);
102 }
103 for (int i = 62; i <= 126; i++) {
104 PRINTABLE_CHARS.set(i);
105 }
106 PRINTABLE_CHARS.set(TAB);
107 PRINTABLE_CHARS.set(SPACE);
108 }
109
110 /**
111 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
112 * back to their original representation.
113 * <p>
114 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
115 * defined in RFC 1521.
116 * </p>
117 *
118 * @param bytes
119 * array of quoted-printable characters
120 * @return array of original bytes
121 * @throws DecoderException
122 * Thrown if quoted-printable decoding is unsuccessful
123 */
124 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
125 if (bytes == null) {
126 return null;
127 }
128 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
129 for (int i = 0; i < bytes.length; i++) {
130 final int b = bytes[i];
131 if (b == ESCAPE_CHAR) {
132 try {
133 // if the next octet is a CR we have found a soft line break
134 if (bytes[++i] == CR) {
135 continue;
136 }
137 final int u = Utils.digit16(bytes[i]);
138 final int l = Utils.digit16(bytes[++i]);
139 buffer.write((char) ((u << 4) + l));
140 } catch (final ArrayIndexOutOfBoundsException e) {
141 throw new DecoderException("Invalid quoted-printable encoding", e);
142 }
143 } else if (b != CR && b != LF) {
144 // every other octet is appended except for CR & LF
145 buffer.write(b);
146 }
147 }
148 return buffer.toByteArray();
149 }
150
151 /**
152 * Encodes a byte in the buffer.
153 *
154 * @param b
155 * byte to write
156 * @param encode
157 * indicates whether the octet shall be encoded
158 * @param buffer
159 * the buffer to write to
160 * @return the number of bytes that have been written to the buffer
161 */
162 private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) {
163 if (encode) {
164 return encodeQuotedPrintable(b, buffer);
165 }
166 buffer.write(b);
167 return 1;
168 }
169
170 /**
171 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
172 * <p>
173 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
174 * RFC 1521 and is suitable for encoding binary data and unformatted text.
175 * </p>
176 *
177 * @param printable
178 * bitset of characters deemed quoted-printable
179 * @param bytes
180 * array of bytes to be encoded
181 * @return array of bytes containing quoted-printable data
182 */
183 public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
184 return encodeQuotedPrintable(printable, bytes, false);
185 }
186
187 /**
188 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
189 * <p>
190 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
191 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
192 * RFC 1521 and is suitable for encoding binary data and unformatted text.
193 * </p>
194 *
195 * @param printable
196 * bitset of characters deemed quoted-printable
197 * @param bytes
198 * array of bytes to be encoded
199 * @param strict
200 * if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
201 * @return array of bytes containing quoted-printable data
202 * @since 1.10
203 */
204 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
205 if (bytes == null) {
206 return null;
207 }
208 if (printable == null) {
209 printable = PRINTABLE_CHARS;
210 }
211 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
212 final int bytesLength = bytes.length;
213
214 if (strict) {
215 if (bytesLength < MIN_BYTES) {
216 return null;
217 }
218
219 int pos = 1;
220 // encode up to buffer.length - 3, the last three octets will be treated
221 // separately for simplification of note #3
222 for (int i = 0; i < bytesLength - 3; i++) {
223 final int b = getUnsignedOctet(i, bytes);
224 if (pos < SAFE_LENGTH) {
225 // up to this length it is safe to add any byte, encoded or not
226 pos += encodeByte(b, !printable.get(b), buffer);
227 } else {
228 // rule #3: whitespace at the end of a line *must* be encoded
229 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
230
231 // rule #5: soft line break
232 buffer.write(ESCAPE_CHAR);
233 buffer.write(CR);
234 buffer.write(LF);
235 pos = 1;
236 }
237 }
238
239 // rule #3: whitespace at the end of a line *must* be encoded
240 // if we would do a soft break line after this octet, encode whitespace
241 int b = getUnsignedOctet(bytesLength - 3, bytes);
242 boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
243 pos += encodeByte(b, encode, buffer);
244
245 // note #3: '=' *must not* be the ultimate or penultimate character
246 // simplification: if < 6 bytes left, do a soft line break as we may need
247 // exactly 6 bytes space for the last 2 bytes
248 if (pos > SAFE_LENGTH - 2) {
249 buffer.write(ESCAPE_CHAR);
250 buffer.write(CR);
251 buffer.write(LF);
252 }
253 for (int i = bytesLength - 2; i < bytesLength; i++) {
254 b = getUnsignedOctet(i, bytes);
255 // rule #3: trailing whitespace shall be encoded
256 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
257 encodeByte(b, encode, buffer);
258 }
259 } else {
260 for (final byte c : bytes) {
261 int b = c;
262 if (b < 0) {
263 b = 256 + b;
264 }
265 if (printable.get(b)) {
266 buffer.write(b);
267 } else {
268 encodeQuotedPrintable(b, buffer);
269 }
270 }
271 }
272 return buffer.toByteArray();
273 }
274
275 /**
276 * Encodes byte into its quoted-printable representation.
277 *
278 * @param b
279 * byte to encode
280 * @param buffer
281 * the buffer to write to
282 * @return The number of bytes written to the {@code buffer}
283 */
284 private static int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
285 buffer.write(ESCAPE_CHAR);
286 final char hex1 = Utils.hexChar(b >> 4);
287 final char hex2 = Utils.hexChar(b);
288 buffer.write(hex1);
289 buffer.write(hex2);
290 return 3;
291 }
292
293 /**
294 * Gets the byte at position {@code index} of the byte array and
295 * make sure it is unsigned.
296 *
297 * @param index
298 * position in the array
299 * @param bytes
300 * the byte array
301 * @return the unsigned octet at position {@code index} from the array
302 */
303 private static int getUnsignedOctet(final int index, final byte[] bytes) {
304 int b = bytes[index];
305 if (b < 0) {
306 b = 256 + b;
307 }
308 return b;
309 }
310
311 /**
312 * Checks whether the given byte is whitespace.
313 *
314 * @param b
315 * byte to be checked
316 * @return {@code true} if the byte is either a space or tab character
317 */
318 private static boolean isWhitespace(final int b) {
319 return b == SPACE || b == TAB;
320 }
321
322 /**
323 * The default Charset used for string decoding and encoding.
324 */
325 private final Charset charset;
326
327 /**
328 * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
329 */
330 private final boolean strict;
331
332 /**
333 * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
334 */
335 public QuotedPrintableCodec() {
336 this(StandardCharsets.UTF_8, false);
337 }
338
339 /**
340 * Constructor which allows for the selection of the strict mode.
341 *
342 * @param strict
343 * if {@code true}, soft line breaks will be used
344 * @since 1.10
345 */
346 public QuotedPrintableCodec(final boolean strict) {
347 this(StandardCharsets.UTF_8, strict);
348 }
349
350 /**
351 * Constructor which allows for the selection of a default Charset.
352 *
353 * @param charset
354 * the default string Charset to use.
355 * @since 1.7
356 */
357 public QuotedPrintableCodec(final Charset charset) {
358 this(charset, false);
359 }
360
361 /**
362 * Constructor which allows for the selection of a default Charset and strict mode.
363 *
364 * @param charset
365 * the default string Charset to use.
366 * @param strict
367 * if {@code true}, soft line breaks will be used
368 * @since 1.10
369 */
370 public QuotedPrintableCodec(final Charset charset, final boolean strict) {
371 this.charset = charset;
372 this.strict = strict;
373 }
374
375 /**
376 * Constructor which allows for the selection of a default Charset.
377 *
378 * @param charsetName
379 * the default string Charset to use.
380 * @throws UnsupportedCharsetException
381 * If no support for the named Charset is available
382 * in this instance of the Java virtual machine
383 * @throws IllegalArgumentException
384 * If the given charsetName is null
385 * @throws IllegalCharsetNameException
386 * If the given Charset name is illegal
387 *
388 * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
389 */
390 public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
391 this(Charset.forName(charsetName), false);
392 }
393
394 /**
395 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
396 * back to their original representation.
397 * <p>
398 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
399 * defined in RFC 1521.
400 * </p>
401 *
402 * @param bytes
403 * array of quoted-printable characters
404 * @return array of original bytes
405 * @throws DecoderException
406 * Thrown if quoted-printable decoding is unsuccessful
407 */
408 @Override
409 public byte[] decode(final byte[] bytes) throws DecoderException {
410 return decodeQuotedPrintable(bytes);
411 }
412
413 /**
414 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
415 * representation.
416 *
417 * @param obj
418 * quoted-printable object to convert into its original form
419 * @return original object
420 * @throws DecoderException
421 * Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
422 * condition is encountered during the decode process.
423 */
424 @Override
425 public Object decode(final Object obj) throws DecoderException {
426 if (obj == null) {
427 return null;
428 }
429 if (obj instanceof byte[]) {
430 return decode((byte[]) obj);
431 }
432 if (obj instanceof String) {
433 return decode((String) obj);
434 }
435 throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded");
436 }
437
438 /**
439 * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
440 * converted back to their original representation.
441 *
442 * @param sourceStr
443 * quoted-printable string to convert into its original form
444 * @return original string
445 * @throws DecoderException
446 * Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
447 * @see #getCharset()
448 */
449 @Override
450 public String decode(final String sourceStr) throws DecoderException {
451 return this.decode(sourceStr, getCharset());
452 }
453
454 /**
455 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
456 * are converted back to their original representation.
457 *
458 * @param sourceStr
459 * quoted-printable string to convert into its original form
460 * @param sourceCharset
461 * the original string Charset
462 * @return original string
463 * @throws DecoderException
464 * Thrown if quoted-printable decoding is unsuccessful
465 * @since 1.7
466 */
467 public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
468 if (sourceStr == null) {
469 return null;
470 }
471 return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
472 }
473
474 /**
475 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
476 * are converted back to their original representation.
477 *
478 * @param sourceStr
479 * quoted-printable string to convert into its original form
480 * @param sourceCharset
481 * the original string Charset
482 * @return original string
483 * @throws DecoderException
484 * Thrown if quoted-printable decoding is unsuccessful
485 * @throws UnsupportedEncodingException
486 * Thrown if Charset is not supported
487 */
488 public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException {
489 if (sourceStr == null) {
490 return null;
491 }
492 return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
493 }
494
495 /**
496 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
497 * <p>
498 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
499 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
500 * RFC 1521 and is suitable for encoding binary data and unformatted text.
501 * </p>
502 *
503 * @param bytes
504 * array of bytes to be encoded
505 * @return array of bytes containing quoted-printable data
506 */
507 @Override
508 public byte[] encode(final byte[] bytes) {
509 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
510 }
511
512 /**
513 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
514 *
515 * @param obj
516 * string to convert to a quoted-printable form
517 * @return quoted-printable object
518 * @throws EncoderException
519 * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
520 * unsuccessful
521 */
522 @Override
523 public Object encode(final Object obj) throws EncoderException {
524 if (obj == null) {
525 return null;
526 }
527 if (obj instanceof byte[]) {
528 return encode((byte[]) obj);
529 }
530 if (obj instanceof String) {
531 return encode((String) obj);
532 }
533 throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded");
534 }
535
536 /**
537 * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
538 * <p>
539 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
540 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
541 * RFC 1521 and is suitable for encoding binary data and unformatted text.
542 * </p>
543 *
544 * @param sourceStr
545 * string to convert to quoted-printable form
546 * @return quoted-printable string
547 * @throws EncoderException
548 * Thrown if quoted-printable encoding is unsuccessful
549 *
550 * @see #getCharset()
551 */
552 @Override
553 public String encode(final String sourceStr) throws EncoderException {
554 return encode(sourceStr, getCharset());
555 }
556
557 /**
558 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
559 * <p>
560 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
561 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
562 * RFC 1521 and is suitable for encoding binary data and unformatted text.
563 * </p>
564 *
565 * @param sourceStr
566 * string to convert to quoted-printable form
567 * @param sourceCharset
568 * the Charset for sourceStr
569 * @return quoted-printable string
570 * @since 1.7
571 */
572 public String encode(final String sourceStr, final Charset sourceCharset) {
573 if (sourceStr == null) {
574 return null;
575 }
576 return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
577 }
578
579 /**
580 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
581 * <p>
582 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
583 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
584 * RFC 1521 and is suitable for encoding binary data and unformatted text.
585 * </p>
586 *
587 * @param sourceStr
588 * string to convert to quoted-printable form
589 * @param sourceCharset
590 * the Charset for sourceStr
591 * @return quoted-printable string
592 * @throws UnsupportedEncodingException
593 * Thrown if the Charset is not supported
594 */
595 public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
596 if (sourceStr == null) {
597 return null;
598 }
599 return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
600 }
601
602 /**
603 * Gets the default Charset name used for string decoding and encoding.
604 *
605 * @return the default Charset name
606 * @since 1.7
607 */
608 public Charset getCharset() {
609 return this.charset;
610 }
611
612 /**
613 * Gets the default Charset name used for string decoding and encoding.
614 *
615 * @return the default Charset name
616 */
617 public String getDefaultCharset() {
618 return this.charset.name();
619 }
620 }