1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.net;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.UnsupportedEncodingException;
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.UnsupportedCharsetException;
25 import java.util.BitSet;
26
27 import org.apache.commons.codec.BinaryDecoder;
28 import org.apache.commons.codec.BinaryEncoder;
29 import org.apache.commons.codec.Charsets;
30 import org.apache.commons.codec.DecoderException;
31 import org.apache.commons.codec.EncoderException;
32 import org.apache.commons.codec.StringDecoder;
33 import org.apache.commons.codec.StringEncoder;
34 import org.apache.commons.codec.binary.StringUtils;
35
36 /**
37 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38 * <p>
39 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
40 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
41 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
42 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
43 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
44 * gateway.
45 * <p>
46 * Note:
47 * <p>
48 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
49 * quoted-printable spec:
50 * <ul>
51 * <li>{@code strict=false}: only rules #1 and #2 are implemented
52 * <li>{@code strict=true}: all rules #1 through #5 are implemented
53 * </ul>
54 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
55 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
56 * Q codec. The strict mode has been added in 1.10.
57 * <p>
58 * This class is immutable and thread-safe.
59 *
60 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
61 * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
62 *
63 * @since 1.3
64 * @version $Id: QuotedPrintableCodec.java 1788792 2017-03-26 23:57:00Z sebb $
65 */
66 public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
67 /**
68 * The default charset used for string decoding and encoding.
69 */
70 private final Charset charset;
71
72 /**
73 * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
74 */
75 private final boolean strict;
76
77 /**
78 * BitSet of printable characters as defined in RFC 1521.
79 */
80 private static final BitSet PRINTABLE_CHARS = new BitSet(256);
81
82 private static final byte ESCAPE_CHAR = '=';
83
84 private static final byte TAB = 9;
85
86 private static final byte SPACE = 32;
87
88 private static final byte CR = 13;
89
90 private static final byte LF = 10;
91
92 /**
93 * Safe line length for quoted printable encoded text.
94 */
95 private static final int SAFE_LENGTH = 73;
96
97 // Static initializer for printable chars collection
98 static {
99 // alpha characters
100 for (int i = 33; i <= 60; i++) {
101 PRINTABLE_CHARS.set(i);
102 }
103 for (int i = 62; i <= 126; i++) {
104 PRINTABLE_CHARS.set(i);
105 }
106 PRINTABLE_CHARS.set(TAB);
107 PRINTABLE_CHARS.set(SPACE);
108 }
109
110 /**
111 * Default constructor, assumes default charset of {@link Charsets#UTF_8}
112 */
113 public QuotedPrintableCodec() {
114 this(Charsets.UTF_8, false);
115 }
116
117 /**
118 * Constructor which allows for the selection of the strict mode.
119 *
120 * @param strict
121 * if {@code true}, soft line breaks will be used
122 * @since 1.10
123 */
124 public QuotedPrintableCodec(final boolean strict) {
125 this(Charsets.UTF_8, strict);
126 }
127
128 /**
129 * Constructor which allows for the selection of a default charset.
130 *
131 * @param charset
132 * the default string charset to use.
133 * @since 1.7
134 */
135 public QuotedPrintableCodec(final Charset charset) {
136 this(charset, false);
137 }
138
139 /**
140 * Constructor which allows for the selection of a default charset and strict mode.
141 *
142 * @param charset
143 * the default string charset to use.
144 * @param strict
145 * if {@code true}, soft line breaks will be used
146 * @since 1.10
147 */
148 public QuotedPrintableCodec(final Charset charset, final boolean strict) {
149 this.charset = charset;
150 this.strict = strict;
151 }
152
153 /**
154 * Constructor which allows for the selection of a default charset.
155 *
156 * @param charsetName
157 * the default string charset to use.
158 * @throws UnsupportedCharsetException
159 * If no support for the named charset is available
160 * in this instance of the Java virtual machine
161 * @throws IllegalArgumentException
162 * If the given charsetName is null
163 * @throws IllegalCharsetNameException
164 * If the given charset name is illegal
165 *
166 * @since 1.7 throws UnsupportedCharsetException if the named charset is unavailable
167 */
168 public QuotedPrintableCodec(final String charsetName)
169 throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
170 this(Charset.forName(charsetName), false);
171 }
172
173 /**
174 * Encodes byte into its quoted-printable representation.
175 *
176 * @param b
177 * byte to encode
178 * @param buffer
179 * the buffer to write to
180 * @return The number of bytes written to the <code>buffer</code>
181 */
182 private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
183 buffer.write(ESCAPE_CHAR);
184 final char hex1 = Utils.hexDigit(b >> 4);
185 final char hex2 = Utils.hexDigit(b);
186 buffer.write(hex1);
187 buffer.write(hex2);
188 return 3;
189 }
190
191 /**
192 * Return the byte at position <code>index</code> of the byte array and
193 * make sure it is unsigned.
194 *
195 * @param index
196 * position in the array
197 * @param bytes
198 * the byte array
199 * @return the unsigned octet at position <code>index</code> from the array
200 */
201 private static int getUnsignedOctet(final int index, final byte[] bytes) {
202 int b = bytes[index];
203 if (b < 0) {
204 b = 256 + b;
205 }
206 return b;
207 }
208
209 /**
210 * Write a byte to the buffer.
211 *
212 * @param b
213 * byte to write
214 * @param encode
215 * indicates whether the octet shall be encoded
216 * @param buffer
217 * the buffer to write to
218 * @return the number of bytes that have been written to the buffer
219 */
220 private static int encodeByte(final int b, final boolean encode,
221 final ByteArrayOutputStream buffer) {
222 if (encode) {
223 return encodeQuotedPrintable(b, buffer);
224 }
225 buffer.write(b);
226 return 1;
227 }
228
229 /**
230 * Checks whether the given byte is whitespace.
231 *
232 * @param b
233 * byte to be checked
234 * @return <code>true</code> if the byte is either a space or tab character
235 */
236 private static boolean isWhitespace(final int b) {
237 return b == SPACE || b == TAB;
238 }
239
240 /**
241 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
242 * <p>
243 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
244 * RFC 1521 and is suitable for encoding binary data and unformatted text.
245 *
246 * @param printable
247 * bitset of characters deemed quoted-printable
248 * @param bytes
249 * array of bytes to be encoded
250 * @return array of bytes containing quoted-printable data
251 */
252 public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
253 return encodeQuotedPrintable(printable, bytes, false);
254 }
255
256 /**
257 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
258 * <p>
259 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
260 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
261 * RFC 1521 and is suitable for encoding binary data and unformatted text.
262 *
263 * @param printable
264 * bitset of characters deemed quoted-printable
265 * @param bytes
266 * array of bytes to be encoded
267 * @param strict
268 * if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
269 * @return array of bytes containing quoted-printable data
270 * @since 1.10
271 */
272 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
273 if (bytes == null) {
274 return null;
275 }
276 if (printable == null) {
277 printable = PRINTABLE_CHARS;
278 }
279 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
280
281 if (strict) {
282 int pos = 1;
283 // encode up to buffer.length - 3, the last three octets will be treated
284 // separately for simplification of note #3
285 for (int i = 0; i < bytes.length - 3; i++) {
286 final int b = getUnsignedOctet(i, bytes);
287 if (pos < SAFE_LENGTH) {
288 // up to this length it is safe to add any byte, encoded or not
289 pos += encodeByte(b, !printable.get(b), buffer);
290 } else {
291 // rule #3: whitespace at the end of a line *must* be encoded
292 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
293
294 // rule #5: soft line break
295 buffer.write(ESCAPE_CHAR);
296 buffer.write(CR);
297 buffer.write(LF);
298 pos = 1;
299 }
300 }
301
302 // rule #3: whitespace at the end of a line *must* be encoded
303 // if we would do a soft break line after this octet, encode whitespace
304 int b = getUnsignedOctet(bytes.length - 3, bytes);
305 boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
306 pos += encodeByte(b, encode, buffer);
307
308 // note #3: '=' *must not* be the ultimate or penultimate character
309 // simplification: if < 6 bytes left, do a soft line break as we may need
310 // exactly 6 bytes space for the last 2 bytes
311 if (pos > SAFE_LENGTH - 2) {
312 buffer.write(ESCAPE_CHAR);
313 buffer.write(CR);
314 buffer.write(LF);
315 }
316 for (int i = bytes.length - 2; i < bytes.length; i++) {
317 b = getUnsignedOctet(i, bytes);
318 // rule #3: trailing whitespace shall be encoded
319 encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
320 encodeByte(b, encode, buffer);
321 }
322 } else {
323 for (final byte c : bytes) {
324 int b = c;
325 if (b < 0) {
326 b = 256 + b;
327 }
328 if (printable.get(b)) {
329 buffer.write(b);
330 } else {
331 encodeQuotedPrintable(b, buffer);
332 }
333 }
334 }
335 return buffer.toByteArray();
336 }
337
338 /**
339 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
340 * back to their original representation.
341 * <p>
342 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
343 * defined in RFC 1521.
344 *
345 * @param bytes
346 * array of quoted-printable characters
347 * @return array of original bytes
348 * @throws DecoderException
349 * Thrown if quoted-printable decoding is unsuccessful
350 */
351 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
352 if (bytes == null) {
353 return null;
354 }
355 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
356 for (int i = 0; i < bytes.length; i++) {
357 final int b = bytes[i];
358 if (b == ESCAPE_CHAR) {
359 try {
360 // if the next octet is a CR we have found a soft line break
361 if (bytes[++i] == CR) {
362 continue;
363 }
364 final int u = Utils.digit16(bytes[i]);
365 final int l = Utils.digit16(bytes[++i]);
366 buffer.write((char) ((u << 4) + l));
367 } catch (final ArrayIndexOutOfBoundsException e) {
368 throw new DecoderException("Invalid quoted-printable encoding", e);
369 }
370 } else if (b != CR && b != LF) {
371 // every other octet is appended except for CR & LF
372 buffer.write(b);
373 }
374 }
375 return buffer.toByteArray();
376 }
377
378 /**
379 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
380 * <p>
381 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
382 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
383 * RFC 1521 and is suitable for encoding binary data and unformatted text.
384 *
385 * @param bytes
386 * array of bytes to be encoded
387 * @return array of bytes containing quoted-printable data
388 */
389 @Override
390 public byte[] encode(final byte[] bytes) {
391 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
392 }
393
394 /**
395 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
396 * back to their original representation.
397 * <p>
398 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
399 * defined in RFC 1521.
400 *
401 * @param bytes
402 * array of quoted-printable characters
403 * @return array of original bytes
404 * @throws DecoderException
405 * Thrown if quoted-printable decoding is unsuccessful
406 */
407 @Override
408 public byte[] decode(final byte[] bytes) throws DecoderException {
409 return decodeQuotedPrintable(bytes);
410 }
411
412 /**
413 * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped.
414 * <p>
415 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
416 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
417 * RFC 1521 and is suitable for encoding binary data and unformatted text.
418 *
419 * @param str
420 * string to convert to quoted-printable form
421 * @return quoted-printable string
422 * @throws EncoderException
423 * Thrown if quoted-printable encoding is unsuccessful
424 *
425 * @see #getCharset()
426 */
427 @Override
428 public String encode(final String str) throws EncoderException {
429 return this.encode(str, getCharset());
430 }
431
432 /**
433 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
434 * are converted back to their original representation.
435 *
436 * @param str
437 * quoted-printable string to convert into its original form
438 * @param charset
439 * the original string charset
440 * @return original string
441 * @throws DecoderException
442 * Thrown if quoted-printable decoding is unsuccessful
443 * @since 1.7
444 */
445 public String decode(final String str, final Charset charset) throws DecoderException {
446 if (str == null) {
447 return null;
448 }
449 return new String(this.decode(StringUtils.getBytesUsAscii(str)), charset);
450 }
451
452 /**
453 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
454 * are converted back to their original representation.
455 *
456 * @param str
457 * quoted-printable string to convert into its original form
458 * @param charset
459 * the original string charset
460 * @return original string
461 * @throws DecoderException
462 * Thrown if quoted-printable decoding is unsuccessful
463 * @throws UnsupportedEncodingException
464 * Thrown if charset is not supported
465 */
466 public String decode(final String str, final String charset) throws DecoderException, UnsupportedEncodingException {
467 if (str == null) {
468 return null;
469 }
470 return new String(decode(StringUtils.getBytesUsAscii(str)), charset);
471 }
472
473 /**
474 * Decodes a quoted-printable string into its original form using the default string charset. Escaped characters are
475 * converted back to their original representation.
476 *
477 * @param str
478 * quoted-printable string to convert into its original form
479 * @return original string
480 * @throws DecoderException
481 * Thrown if quoted-printable decoding is unsuccessful. Thrown if charset is not supported.
482 * @see #getCharset()
483 */
484 @Override
485 public String decode(final String str) throws DecoderException {
486 return this.decode(str, this.getCharset());
487 }
488
489 /**
490 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
491 *
492 * @param obj
493 * string to convert to a quoted-printable form
494 * @return quoted-printable object
495 * @throws EncoderException
496 * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
497 * unsuccessful
498 */
499 @Override
500 public Object encode(final Object obj) throws EncoderException {
501 if (obj == null) {
502 return null;
503 } else if (obj instanceof byte[]) {
504 return encode((byte[]) obj);
505 } else if (obj instanceof String) {
506 return encode((String) obj);
507 } else {
508 throw new EncoderException("Objects of type " +
509 obj.getClass().getName() +
510 " cannot be quoted-printable encoded");
511 }
512 }
513
514 /**
515 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
516 * representation.
517 *
518 * @param obj
519 * quoted-printable object to convert into its original form
520 * @return original object
521 * @throws DecoderException
522 * Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure
523 * condition is encountered during the decode process.
524 */
525 @Override
526 public Object decode(final Object obj) throws DecoderException {
527 if (obj == null) {
528 return null;
529 } else if (obj instanceof byte[]) {
530 return decode((byte[]) obj);
531 } else if (obj instanceof String) {
532 return decode((String) obj);
533 } else {
534 throw new DecoderException("Objects of type " +
535 obj.getClass().getName() +
536 " cannot be quoted-printable decoded");
537 }
538 }
539
540 /**
541 * Gets the default charset name used for string decoding and encoding.
542 *
543 * @return the default charset name
544 * @since 1.7
545 */
546 public Charset getCharset() {
547 return this.charset;
548 }
549
550 /**
551 * Gets the default charset name used for string decoding and encoding.
552 *
553 * @return the default charset name
554 */
555 public String getDefaultCharset() {
556 return this.charset.name();
557 }
558
559 /**
560 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
561 * <p>
562 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
563 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
564 * RFC 1521 and is suitable for encoding binary data and unformatted text.
565 *
566 * @param str
567 * string to convert to quoted-printable form
568 * @param charset
569 * the charset for str
570 * @return quoted-printable string
571 * @since 1.7
572 */
573 public String encode(final String str, final Charset charset) {
574 if (str == null) {
575 return null;
576 }
577 return StringUtils.newStringUsAscii(this.encode(str.getBytes(charset)));
578 }
579
580 /**
581 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
582 * <p>
583 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
584 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
585 * RFC 1521 and is suitable for encoding binary data and unformatted text.
586 *
587 * @param str
588 * string to convert to quoted-printable form
589 * @param charset
590 * the charset for str
591 * @return quoted-printable string
592 * @throws UnsupportedEncodingException
593 * Thrown if the charset is not supported
594 */
595 public String encode(final String str, final String charset) throws UnsupportedEncodingException {
596 if (str == null) {
597 return null;
598 }
599 return StringUtils.newStringUsAscii(encode(str.getBytes(charset)));
600 }
601 }