/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18 package org.apache.commons.codec.net;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.UnsupportedEncodingException;
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.UnsupportedCharsetException;
25 import java.util.BitSet;
26
27 import org.apache.commons.codec.BinaryDecoder;
28 import org.apache.commons.codec.BinaryEncoder;
29 import org.apache.commons.codec.Charsets;
30 import org.apache.commons.codec.DecoderException;
31 import org.apache.commons.codec.EncoderException;
32 import org.apache.commons.codec.StringDecoder;
33 import org.apache.commons.codec.StringEncoder;
34 import org.apache.commons.codec.binary.StringUtils;
35
36 /**
37 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38 * <p>
39 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
40 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
41 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
42 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
43 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
44 * gateway.
45 * <p>
46 * Note:
47 * <p>
48 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
49 * quoted-printable spec:
50 * <ul>
51 * <li>{@code strict=false}: only rules #1 and #2 are implemented
52 * <li>{@code strict=true}: all rules #1 through #5 are implemented
53 * </ul>
54 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
55 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
56 * Q codec. The strict mode has been added in 1.10.
57 * <p>
58 * This class is immutable and thread-safe.
59 *
60 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
61 * Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
62 *
63 * @since 1.3
64 * @version $Id: QuotedPrintableCodec.html 928559 2014-11-10 02:53:54Z ggregory $
65 */
66 public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
67 /**
68 * The default charset used for string decoding and encoding.
69 */
70 private final Charset charset;
71
72 /**
73 * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
74 */
75 private final boolean strict;
76
77 /**
78 * BitSet of printable characters as defined in RFC 1521.
79 */
80 private static final BitSet PRINTABLE_CHARS = new BitSet(256);
81
82 private static final byte ESCAPE_CHAR = '=';
83
84 private static final byte TAB = 9;
85
86 private static final byte SPACE = 32;
87
88 private static final byte CR = 13;
89
90 private static final byte LF = 10;
91
92 /**
93 * Safe line length for quoted printable encoded text.
94 */
95 private static final int SAFE_LENGTH = 73;
96
97 // Static initializer for printable chars collection
98 static {
99 // alpha characters
100 for (int i = 33; i <= 60; i++) {
101 PRINTABLE_CHARS.set(i);
102 }
103 for (int i = 62; i <= 126; i++) {
104 PRINTABLE_CHARS.set(i);
105 }
106 PRINTABLE_CHARS.set(TAB);
107 PRINTABLE_CHARS.set(SPACE);
108 }
109
110 /**
111 * Default constructor, assumes default charset of {@link Charsets#UTF_8}
112 */
113 public QuotedPrintableCodec() {
114 this(Charsets.UTF_8, false);
115 }
116
117 /**
118 * Constructor which allows for the selection of the strict mode.
119 *
120 * @param strict
121 * if {@code true}, soft line breaks will be used
122 * @since 1.10
123 */
124 public QuotedPrintableCodec(final boolean strict) {
125 this(Charsets.UTF_8, strict);
126 }
127
128 /**
129 * Constructor which allows for the selection of a default charset.
130 *
131 * @param charset
132 * the default string charset to use.
133 * @since 1.7
134 */
135 public QuotedPrintableCodec(final Charset charset) {
136 this(charset, false);
137 }
138
139 /**
140 * Constructor which allows for the selection of a default charset and strict mode.
141 *
142 * @param charset
143 * the default string charset to use.
144 * @param strict
145 * if {@code true}, soft line breaks will be used
146 * @since 1.10
147 */
148 public QuotedPrintableCodec(final Charset charset, final boolean strict) {
149 this.charset = charset;
150 this.strict = strict;
151 }
152
153 /**
154 * Constructor which allows for the selection of a default charset.
155 *
156 * @param charsetName
157 * the default string charset to use.
158 * @throws UnsupportedCharsetException
159 * If no support for the named charset is available
160 * in this instance of the Java virtual machine
161 * @throws IllegalArgumentException
162 * If the given charsetName is null
163 * @throws IllegalCharsetNameException
164 * If the given charset name is illegal
165 *
166 * @since 1.7 throws UnsupportedCharsetException if the named charset is unavailable
167 */
168 public QuotedPrintableCodec(final String charsetName)
169 throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
170 this(Charset.forName(charsetName), false);
171 }
172
173 /**
174 * Encodes byte into its quoted-printable representation.
175 *
176 * @param b
177 * byte to encode
178 * @param buffer
179 * the buffer to write to
180 * @return The number of bytes written to the <code>buffer</code>
181 */
182 private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
183 buffer.write(ESCAPE_CHAR);
184 final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
185 final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16));
186 buffer.write(hex1);
187 buffer.write(hex2);
188 return 3;
189 }
190
191 /**
192 * Return the byte at position <code>index</code> of the byte array and
193 * make sure it is unsigned.
194 *
195 * @param index
196 * position in the array
197 * @param bytes
198 * the byte array
199 * @return the unsigned octet at position <code>index</code> from the array
200 */
201 private static int getUnsignedOctet(final int index, final byte[] bytes) {
202 int b = bytes[index];
203 if (b < 0) {
204 b = 256 + b;
205 }
206 return b;
207 }
208
209 /**
210 * Write a byte to the buffer.
211 *
212 * @param b
213 * byte to write
214 * @param encode
215 * indicates whether the octet shall be encoded
216 * @param buffer
217 * the buffer to write to
218 * @return the number of bytes that have been written to the buffer
219 */
220 private static int encodeByte(final int b, final boolean encode,
221 final ByteArrayOutputStream buffer) {
222 if (encode) {
223 return encodeQuotedPrintable(b, buffer);
224 } else {
225 buffer.write(b);
226 return 1;
227 }
228 }
229
230 /**
231 * Checks whether the given byte is whitespace.
232 *
233 * @param b
234 * byte to be checked
235 * @return <code>true</code> if the byte is either a space or tab character
236 */
237 private static boolean isWhitespace(final int b) {
238 return b == SPACE || b == TAB;
239 }
240
241 /**
242 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
243 * <p>
244 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
245 * RFC 1521 and is suitable for encoding binary data and unformatted text.
246 *
247 * @param printable
248 * bitset of characters deemed quoted-printable
249 * @param bytes
250 * array of bytes to be encoded
251 * @return array of bytes containing quoted-printable data
252 */
253 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes) {
254 return encodeQuotedPrintable(printable, bytes, false);
255 }
256
257 /**
258 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
259 * <p>
260 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
261 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
262 * RFC 1521 and is suitable for encoding binary data and unformatted text.
263 *
264 * @param printable
265 * bitset of characters deemed quoted-printable
266 * @param bytes
267 * array of bytes to be encoded
268 * @param strict
269 * if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
270 * @return array of bytes containing quoted-printable data
271 * @since 1.10
272 */
273 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, boolean strict) {
274 if (bytes == null) {
275 return null;
276 }
277 if (printable == null) {
278 printable = PRINTABLE_CHARS;
279 }
280 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
281
282 if (strict) {
283 int pos = 1;
284 // encode up to buffer.length - 3, the last three octets will be treated
285 // separately for simplification of note #3
286 for (int i = 0; i < bytes.length - 3; i++) {
287 int b = getUnsignedOctet(i, bytes);
288 if (pos < SAFE_LENGTH) {
289 // up to this length it is safe to add any byte, encoded or not
290 pos += encodeByte(b, !printable.get(b), buffer);
291 } else {
292 // rule #3: whitespace at the end of a line *must* be encoded
293 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
294
295 // rule #5: soft line break
296 buffer.write(ESCAPE_CHAR);
297 buffer.write(CR);
298 buffer.write(LF);
299 pos = 1;
300 }
301 }
302
303 // rule #3: whitespace at the end of a line *must* be encoded
304 // if we would do a soft break line after this octet, encode whitespace
305 int b = getUnsignedOctet(bytes.length - 3, bytes);
306 boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
307 pos += encodeByte(b, encode, buffer);
308
309 // note #3: '=' *must not* be the ultimate or penultimate character
310 // simplification: if < 6 bytes left, do a soft line break as we may need
311 // exactly 6 bytes space for the last 2 bytes
312 if (pos > SAFE_LENGTH - 2) {
313 buffer.write(ESCAPE_CHAR);
314 buffer.write(CR);
315 buffer.write(LF);
316 }
317 for (int i = bytes.length - 2; i < bytes.length; i++) {
318 b = getUnsignedOctet(i, bytes);
319 // rule #3: trailing whitespace shall be encoded
320 encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
321 encodeByte(b, encode, buffer);
322 }
323 } else {
324 for (final byte c : bytes) {
325 int b = c;
326 if (b < 0) {
327 b = 256 + b;
328 }
329 if (printable.get(b)) {
330 buffer.write(b);
331 } else {
332 encodeQuotedPrintable(b, buffer);
333 }
334 }
335 }
336 return buffer.toByteArray();
337 }
338
339 /**
340 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
341 * back to their original representation.
342 * <p>
343 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
344 * defined in RFC 1521.
345 *
346 * @param bytes
347 * array of quoted-printable characters
348 * @return array of original bytes
349 * @throws DecoderException
350 * Thrown if quoted-printable decoding is unsuccessful
351 */
352 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
353 if (bytes == null) {
354 return null;
355 }
356 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
357 for (int i = 0; i < bytes.length; i++) {
358 final int b = bytes[i];
359 if (b == ESCAPE_CHAR) {
360 try {
361 // if the next octet is a CR we have found a soft line break
362 if (bytes[++i] == CR) {
363 continue;
364 }
365 final int u = Utils.digit16(bytes[i]);
366 final int l = Utils.digit16(bytes[++i]);
367 buffer.write((char) ((u << 4) + l));
368 } catch (final ArrayIndexOutOfBoundsException e) {
369 throw new DecoderException("Invalid quoted-printable encoding", e);
370 }
371 } else if (b != CR && b != LF) {
372 // every other octet is appended except for CR & LF
373 buffer.write(b);
374 }
375 }
376 return buffer.toByteArray();
377 }
378
379 /**
380 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
381 * <p>
382 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
383 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
384 * RFC 1521 and is suitable for encoding binary data and unformatted text.
385 *
386 * @param bytes
387 * array of bytes to be encoded
388 * @return array of bytes containing quoted-printable data
389 */
390 @Override
391 public byte[] encode(final byte[] bytes) {
392 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
393 }
394
395 /**
396 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
397 * back to their original representation.
398 * <p>
399 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
400 * defined in RFC 1521.
401 *
402 * @param bytes
403 * array of quoted-printable characters
404 * @return array of original bytes
405 * @throws DecoderException
406 * Thrown if quoted-printable decoding is unsuccessful
407 */
408 @Override
409 public byte[] decode(final byte[] bytes) throws DecoderException {
410 return decodeQuotedPrintable(bytes);
411 }
412
413 /**
414 * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped.
415 * <p>
416 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
417 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
418 * RFC 1521 and is suitable for encoding binary data and unformatted text.
419 *
420 * @param str
421 * string to convert to quoted-printable form
422 * @return quoted-printable string
423 * @throws EncoderException
424 * Thrown if quoted-printable encoding is unsuccessful
425 *
426 * @see #getCharset()
427 */
428 @Override
429 public String encode(final String str) throws EncoderException {
430 return this.encode(str, getCharset());
431 }
432
433 /**
434 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
435 * are converted back to their original representation.
436 *
437 * @param str
438 * quoted-printable string to convert into its original form
439 * @param charset
440 * the original string charset
441 * @return original string
442 * @throws DecoderException
443 * Thrown if quoted-printable decoding is unsuccessful
444 * @since 1.7
445 */
446 public String decode(final String str, final Charset charset) throws DecoderException {
447 if (str == null) {
448 return null;
449 }
450 return new String(this.decode(StringUtils.getBytesUsAscii(str)), charset);
451 }
452
453 /**
454 * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
455 * are converted back to their original representation.
456 *
457 * @param str
458 * quoted-printable string to convert into its original form
459 * @param charset
460 * the original string charset
461 * @return original string
462 * @throws DecoderException
463 * Thrown if quoted-printable decoding is unsuccessful
464 * @throws UnsupportedEncodingException
465 * Thrown if charset is not supported
466 */
467 public String decode(final String str, final String charset) throws DecoderException, UnsupportedEncodingException {
468 if (str == null) {
469 return null;
470 }
471 return new String(decode(StringUtils.getBytesUsAscii(str)), charset);
472 }
473
474 /**
475 * Decodes a quoted-printable string into its original form using the default string charset. Escaped characters are
476 * converted back to their original representation.
477 *
478 * @param str
479 * quoted-printable string to convert into its original form
480 * @return original string
481 * @throws DecoderException
482 * Thrown if quoted-printable decoding is unsuccessful. Thrown if charset is not supported.
483 * @see #getCharset()
484 */
485 @Override
486 public String decode(final String str) throws DecoderException {
487 return this.decode(str, this.getCharset());
488 }
489
490 /**
491 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
492 *
493 * @param obj
494 * string to convert to a quoted-printable form
495 * @return quoted-printable object
496 * @throws EncoderException
497 * Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
498 * unsuccessful
499 */
500 @Override
501 public Object encode(final Object obj) throws EncoderException {
502 if (obj == null) {
503 return null;
504 } else if (obj instanceof byte[]) {
505 return encode((byte[]) obj);
506 } else if (obj instanceof String) {
507 return encode((String) obj);
508 } else {
509 throw new EncoderException("Objects of type " +
510 obj.getClass().getName() +
511 " cannot be quoted-printable encoded");
512 }
513 }
514
515 /**
516 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
517 * representation.
518 *
519 * @param obj
520 * quoted-printable object to convert into its original form
521 * @return original object
522 * @throws DecoderException
523 * Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure
524 * condition is encountered during the decode process.
525 */
526 @Override
527 public Object decode(final Object obj) throws DecoderException {
528 if (obj == null) {
529 return null;
530 } else if (obj instanceof byte[]) {
531 return decode((byte[]) obj);
532 } else if (obj instanceof String) {
533 return decode((String) obj);
534 } else {
535 throw new DecoderException("Objects of type " +
536 obj.getClass().getName() +
537 " cannot be quoted-printable decoded");
538 }
539 }
540
541 /**
542 * Gets the default charset name used for string decoding and encoding.
543 *
544 * @return the default charset name
545 * @since 1.7
546 */
547 public Charset getCharset() {
548 return this.charset;
549 }
550
551 /**
552 * Gets the default charset name used for string decoding and encoding.
553 *
554 * @return the default charset name
555 */
556 public String getDefaultCharset() {
557 return this.charset.name();
558 }
559
560 /**
561 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
562 * <p>
563 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
564 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
565 * RFC 1521 and is suitable for encoding binary data and unformatted text.
566 *
567 * @param str
568 * string to convert to quoted-printable form
569 * @param charset
570 * the charset for str
571 * @return quoted-printable string
572 * @since 1.7
573 */
574 public String encode(final String str, final Charset charset) {
575 if (str == null) {
576 return null;
577 }
578 return StringUtils.newStringUsAscii(this.encode(str.getBytes(charset)));
579 }
580
581 /**
582 * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
583 * <p>
584 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
585 * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
586 * RFC 1521 and is suitable for encoding binary data and unformatted text.
587 *
588 * @param str
589 * string to convert to quoted-printable form
590 * @param charset
591 * the charset for str
592 * @return quoted-printable string
593 * @throws UnsupportedEncodingException
594 * Thrown if the charset is not supported
595 */
596 public String encode(final String str, final String charset) throws UnsupportedEncodingException {
597 if (str == null) {
598 return null;
599 }
600 return StringUtils.newStringUsAscii(encode(str.getBytes(charset)));
601 }
602 }