View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.net;
19  
20  import java.io.ByteArrayOutputStream;
21  import java.io.UnsupportedEncodingException;
22  import java.nio.charset.Charset;
23  import java.nio.charset.IllegalCharsetNameException;
24  import java.nio.charset.UnsupportedCharsetException;
25  import java.util.BitSet;
26  
27  import org.apache.commons.codec.BinaryDecoder;
28  import org.apache.commons.codec.BinaryEncoder;
29  import org.apache.commons.codec.Charsets;
30  import org.apache.commons.codec.DecoderException;
31  import org.apache.commons.codec.EncoderException;
32  import org.apache.commons.codec.StringDecoder;
33  import org.apache.commons.codec.StringEncoder;
34  import org.apache.commons.codec.binary.StringUtils;
35  
36  /**
37   * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38   * <p>
39   * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
40   * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
41   * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
42   * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
43   * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
44   * gateway.
45   * <p>
46   * Note:
47   * <p>
48   * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
49   * quoted-printable spec:
50   * <ul>
51   *   <li>{@code strict=false}: only rules #1 and #2 are implemented
52   *   <li>{@code strict=true}: all rules #1 through #5 are implemented
53   * </ul>
54   * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
55   * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
56   * Q codec. The strict mode has been added in 1.10.
57   * <p>
58   * This class is immutable and thread-safe.
59   *
60   * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
61   *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
62   *
63   * @since 1.3
64   * @version $Id: QuotedPrintableCodec.html 928559 2014-11-10 02:53:54Z ggregory $
65   */
66  public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
67      /**
68       * The default charset used for string decoding and encoding.
69       */
70      private final Charset charset;
71  
72      /**
73       * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
74       */
75      private final boolean strict;
76  
77      /**
78       * BitSet of printable characters as defined in RFC 1521.
79       */
80      private static final BitSet PRINTABLE_CHARS = new BitSet(256);
81  
82      private static final byte ESCAPE_CHAR = '=';
83  
84      private static final byte TAB = 9;
85  
86      private static final byte SPACE = 32;
87  
88      private static final byte CR = 13;
89  
90      private static final byte LF = 10;
91  
92      /**
93       * Safe line length for quoted printable encoded text.
94       */
95      private static final int SAFE_LENGTH = 73;
96  
97      // Static initializer for printable chars collection
98      static {
99          // alpha characters
100         for (int i = 33; i <= 60; i++) {
101             PRINTABLE_CHARS.set(i);
102         }
103         for (int i = 62; i <= 126; i++) {
104             PRINTABLE_CHARS.set(i);
105         }
106         PRINTABLE_CHARS.set(TAB);
107         PRINTABLE_CHARS.set(SPACE);
108     }
109 
110     /**
111      * Default constructor, assumes default charset of {@link Charsets#UTF_8}
112      */
113     public QuotedPrintableCodec() {
114         this(Charsets.UTF_8, false);
115     }
116 
117     /**
118      * Constructor which allows for the selection of the strict mode.
119      *
120      * @param strict
121      *            if {@code true}, soft line breaks will be used
122      * @since 1.10
123      */
124     public QuotedPrintableCodec(final boolean strict) {
125         this(Charsets.UTF_8, strict);
126     }
127 
128     /**
129      * Constructor which allows for the selection of a default charset.
130      *
131      * @param charset
132      *            the default string charset to use.
133      * @since 1.7
134      */
135     public QuotedPrintableCodec(final Charset charset) {
136         this(charset, false);
137     }
138 
139     /**
140      * Constructor which allows for the selection of a default charset and strict mode.
141      *
142      * @param charset
143      *            the default string charset to use.
144      * @param strict
145      *            if {@code true}, soft line breaks will be used
146      * @since 1.10
147      */
148     public QuotedPrintableCodec(final Charset charset, final boolean strict) {
149         this.charset = charset;
150         this.strict = strict;
151     }
152 
153     /**
154      * Constructor which allows for the selection of a default charset.
155      *
156      * @param charsetName
157      *            the default string charset to use.
158      * @throws UnsupportedCharsetException
159      *             If no support for the named charset is available
160      *             in this instance of the Java virtual machine
161      * @throws IllegalArgumentException
162      *             If the given charsetName is null
163      * @throws IllegalCharsetNameException
164      *             If the given charset name is illegal
165      *
166      * @since 1.7 throws UnsupportedCharsetException if the named charset is unavailable
167      */
168     public QuotedPrintableCodec(final String charsetName)
169             throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
170         this(Charset.forName(charsetName), false);
171     }
172 
173     /**
174      * Encodes byte into its quoted-printable representation.
175      *
176      * @param b
177      *            byte to encode
178      * @param buffer
179      *            the buffer to write to
180      * @return The number of bytes written to the <code>buffer</code>
181      */
182     private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
183         buffer.write(ESCAPE_CHAR);
184         final char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
185         final char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16));
186         buffer.write(hex1);
187         buffer.write(hex2);
188         return 3;
189     }
190 
191     /**
192      * Return the byte at position <code>index</code> of the byte array and
193      * make sure it is unsigned.
194      *
195      * @param index
196      *            position in the array
197      * @param bytes
198      *            the byte array
199      * @return the unsigned octet at position <code>index</code> from the array
200      */
201     private static int getUnsignedOctet(final int index, final byte[] bytes) {
202         int b = bytes[index];
203         if (b < 0) {
204             b = 256 + b;
205         }
206         return b;
207     }
208 
209     /**
210      * Write a byte to the buffer.
211      *
212      * @param b
213      *            byte to write
214      * @param encode
215      *            indicates whether the octet shall be encoded
216      * @param buffer
217      *            the buffer to write to
218      * @return the number of bytes that have been written to the buffer
219      */
220     private static int encodeByte(final int b, final boolean encode,
221                                   final ByteArrayOutputStream buffer) {
222         if (encode) {
223             return encodeQuotedPrintable(b, buffer);
224         } else {
225             buffer.write(b);
226             return 1;
227         }
228     }
229 
230     /**
231      * Checks whether the given byte is whitespace.
232      *
233      * @param b
234      *            byte to be checked
235      * @return <code>true</code> if the byte is either a space or tab character
236      */
237     private static boolean isWhitespace(final int b) {
238         return b == SPACE || b == TAB;
239     }
240 
241     /**
242      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
243      * <p>
244      * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
245      * RFC 1521 and is suitable for encoding binary data and unformatted text.
246      *
247      * @param printable
248      *            bitset of characters deemed quoted-printable
249      * @param bytes
250      *            array of bytes to be encoded
251      * @return array of bytes containing quoted-printable data
252      */
253     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes) {
254         return encodeQuotedPrintable(printable, bytes, false);
255     }
256 
257     /**
258      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
259      * <p>
260      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
261      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
262      * RFC 1521 and is suitable for encoding binary data and unformatted text.
263      *
264      * @param printable
265      *            bitset of characters deemed quoted-printable
266      * @param bytes
267      *            array of bytes to be encoded
268      * @param strict
269      *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
270      * @return array of bytes containing quoted-printable data
271      * @since 1.10
272      */
273     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, boolean strict) {
274         if (bytes == null) {
275             return null;
276         }
277         if (printable == null) {
278             printable = PRINTABLE_CHARS;
279         }
280         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
281 
282         if (strict) {
283             int pos = 1;
284             // encode up to buffer.length - 3, the last three octets will be treated
285             // separately for simplification of note #3
286             for (int i = 0; i < bytes.length - 3; i++) {
287                 int b = getUnsignedOctet(i, bytes);
288                 if (pos < SAFE_LENGTH) {
289                     // up to this length it is safe to add any byte, encoded or not
290                     pos += encodeByte(b, !printable.get(b), buffer);
291                 } else {
292                     // rule #3: whitespace at the end of a line *must* be encoded
293                     encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
294 
295                     // rule #5: soft line break
296                     buffer.write(ESCAPE_CHAR);
297                     buffer.write(CR);
298                     buffer.write(LF);
299                     pos = 1;
300                 }
301             }
302 
303             // rule #3: whitespace at the end of a line *must* be encoded
304             // if we would do a soft break line after this octet, encode whitespace
305             int b = getUnsignedOctet(bytes.length - 3, bytes);
306             boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
307             pos += encodeByte(b, encode, buffer);
308 
309             // note #3: '=' *must not* be the ultimate or penultimate character
310             // simplification: if < 6 bytes left, do a soft line break as we may need
311             //                 exactly 6 bytes space for the last 2 bytes
312             if (pos > SAFE_LENGTH - 2) {
313                 buffer.write(ESCAPE_CHAR);
314                 buffer.write(CR);
315                 buffer.write(LF);
316             }
317             for (int i = bytes.length - 2; i < bytes.length; i++) {
318                 b = getUnsignedOctet(i, bytes);
319                 // rule #3: trailing whitespace shall be encoded
320                 encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
321                 encodeByte(b, encode, buffer);
322             }
323         } else {
324             for (final byte c : bytes) {
325                 int b = c;
326                 if (b < 0) {
327                     b = 256 + b;
328                 }
329                 if (printable.get(b)) {
330                     buffer.write(b);
331                 } else {
332                     encodeQuotedPrintable(b, buffer);
333                 }
334             }
335         }
336         return buffer.toByteArray();
337     }
338 
339     /**
340      * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
341      * back to their original representation.
342      * <p>
343      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
344      * defined in RFC 1521.
345      *
346      * @param bytes
347      *            array of quoted-printable characters
348      * @return array of original bytes
349      * @throws DecoderException
350      *             Thrown if quoted-printable decoding is unsuccessful
351      */
352     public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
353         if (bytes == null) {
354             return null;
355         }
356         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
357         for (int i = 0; i < bytes.length; i++) {
358             final int b = bytes[i];
359             if (b == ESCAPE_CHAR) {
360                 try {
361                     // if the next octet is a CR we have found a soft line break
362                     if (bytes[++i] == CR) {
363                         continue;
364                     }
365                     final int u = Utils.digit16(bytes[i]);
366                     final int l = Utils.digit16(bytes[++i]);
367                     buffer.write((char) ((u << 4) + l));
368                 } catch (final ArrayIndexOutOfBoundsException e) {
369                     throw new DecoderException("Invalid quoted-printable encoding", e);
370                 }
371             } else if (b != CR && b != LF) {
372                 // every other octet is appended except for CR & LF
373                 buffer.write(b);
374             }
375         }
376         return buffer.toByteArray();
377     }
378 
379     /**
380      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
381      * <p>
382      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
383      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
384      * RFC 1521 and is suitable for encoding binary data and unformatted text.
385      *
386      * @param bytes
387      *            array of bytes to be encoded
388      * @return array of bytes containing quoted-printable data
389      */
390     @Override
391     public byte[] encode(final byte[] bytes) {
392         return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
393     }
394 
395     /**
396      * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
397      * back to their original representation.
398      * <p>
399      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
400      * defined in RFC 1521.
401      *
402      * @param bytes
403      *            array of quoted-printable characters
404      * @return array of original bytes
405      * @throws DecoderException
406      *             Thrown if quoted-printable decoding is unsuccessful
407      */
408     @Override
409     public byte[] decode(final byte[] bytes) throws DecoderException {
410         return decodeQuotedPrintable(bytes);
411     }
412 
413     /**
414      * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped.
415      * <p>
416      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
417      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
418      * RFC 1521 and is suitable for encoding binary data and unformatted text.
419      *
420      * @param str
421      *            string to convert to quoted-printable form
422      * @return quoted-printable string
423      * @throws EncoderException
424      *             Thrown if quoted-printable encoding is unsuccessful
425      *
426      * @see #getCharset()
427      */
428     @Override
429     public String encode(final String str) throws EncoderException {
430         return this.encode(str, getCharset());
431     }
432 
433     /**
434      * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
435      * are converted back to their original representation.
436      *
437      * @param str
438      *            quoted-printable string to convert into its original form
439      * @param charset
440      *            the original string charset
441      * @return original string
442      * @throws DecoderException
443      *             Thrown if quoted-printable decoding is unsuccessful
444      * @since 1.7
445      */
446     public String decode(final String str, final Charset charset) throws DecoderException {
447         if (str == null) {
448             return null;
449         }
450         return new String(this.decode(StringUtils.getBytesUsAscii(str)), charset);
451     }
452 
453     /**
454      * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
455      * are converted back to their original representation.
456      *
457      * @param str
458      *            quoted-printable string to convert into its original form
459      * @param charset
460      *            the original string charset
461      * @return original string
462      * @throws DecoderException
463      *             Thrown if quoted-printable decoding is unsuccessful
464      * @throws UnsupportedEncodingException
465      *             Thrown if charset is not supported
466      */
467     public String decode(final String str, final String charset) throws DecoderException, UnsupportedEncodingException {
468         if (str == null) {
469             return null;
470         }
471         return new String(decode(StringUtils.getBytesUsAscii(str)), charset);
472     }
473 
474     /**
475      * Decodes a quoted-printable string into its original form using the default string charset. Escaped characters are
476      * converted back to their original representation.
477      *
478      * @param str
479      *            quoted-printable string to convert into its original form
480      * @return original string
481      * @throws DecoderException
482      *             Thrown if quoted-printable decoding is unsuccessful. Thrown if charset is not supported.
483      * @see #getCharset()
484      */
485     @Override
486     public String decode(final String str) throws DecoderException {
487         return this.decode(str, this.getCharset());
488     }
489 
490     /**
491      * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
492      *
493      * @param obj
494      *            string to convert to a quoted-printable form
495      * @return quoted-printable object
496      * @throws EncoderException
497      *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
498      *             unsuccessful
499      */
500     @Override
501     public Object encode(final Object obj) throws EncoderException {
502         if (obj == null) {
503             return null;
504         } else if (obj instanceof byte[]) {
505             return encode((byte[]) obj);
506         } else if (obj instanceof String) {
507             return encode((String) obj);
508         } else {
509             throw new EncoderException("Objects of type " +
510                   obj.getClass().getName() +
511                   " cannot be quoted-printable encoded");
512         }
513     }
514 
515     /**
516      * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
517      * representation.
518      *
519      * @param obj
520      *            quoted-printable object to convert into its original form
521      * @return original object
522      * @throws DecoderException
523      *             Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure
524      *             condition is encountered during the decode process.
525      */
526     @Override
527     public Object decode(final Object obj) throws DecoderException {
528         if (obj == null) {
529             return null;
530         } else if (obj instanceof byte[]) {
531             return decode((byte[]) obj);
532         } else if (obj instanceof String) {
533             return decode((String) obj);
534         } else {
535             throw new DecoderException("Objects of type " +
536                   obj.getClass().getName() +
537                   " cannot be quoted-printable decoded");
538         }
539     }
540 
541     /**
542      * Gets the default charset name used for string decoding and encoding.
543      *
544      * @return the default charset name
545      * @since 1.7
546      */
547     public Charset getCharset() {
548         return this.charset;
549     }
550 
551     /**
552      * Gets the default charset name used for string decoding and encoding.
553      *
554      * @return the default charset name
555      */
556     public String getDefaultCharset() {
557         return this.charset.name();
558     }
559 
560     /**
561      * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
562      * <p>
563      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
564      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
565      * RFC 1521 and is suitable for encoding binary data and unformatted text.
566      *
567      * @param str
568      *            string to convert to quoted-printable form
569      * @param charset
570      *            the charset for str
571      * @return quoted-printable string
572      * @since 1.7
573      */
574     public String encode(final String str, final Charset charset) {
575         if (str == null) {
576             return null;
577         }
578         return StringUtils.newStringUsAscii(this.encode(str.getBytes(charset)));
579     }
580 
581     /**
582      * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
583      * <p>
584      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
585      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
586      * RFC 1521 and is suitable for encoding binary data and unformatted text.
587      *
588      * @param str
589      *            string to convert to quoted-printable form
590      * @param charset
591      *            the charset for str
592      * @return quoted-printable string
593      * @throws UnsupportedEncodingException
594      *             Thrown if the charset is not supported
595      */
596     public String encode(final String str, final String charset) throws UnsupportedEncodingException {
597         if (str == null) {
598             return null;
599         }
600         return StringUtils.newStringUsAscii(encode(str.getBytes(charset)));
601     }
602 }