View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.net;
19  
20  import java.io.ByteArrayOutputStream;
21  import java.io.UnsupportedEncodingException;
22  import java.nio.charset.Charset;
23  import java.nio.charset.IllegalCharsetNameException;
24  import java.nio.charset.StandardCharsets;
25  import java.nio.charset.UnsupportedCharsetException;
26  import java.util.BitSet;
27  
28  import org.apache.commons.codec.BinaryDecoder;
29  import org.apache.commons.codec.BinaryEncoder;
30  import org.apache.commons.codec.DecoderException;
31  import org.apache.commons.codec.EncoderException;
32  import org.apache.commons.codec.StringDecoder;
33  import org.apache.commons.codec.StringEncoder;
34  import org.apache.commons.codec.binary.StringUtils;
35  
36  /**
37   * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38   * <p>
39   * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
40   * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
41   * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
42   * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
43   * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
44   * gateway.
45   * </p>
46   * <p>
47   * Note:
48   * </p>
49   * <p>
50   * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
51   * quoted-printable spec:
52   * </p>
53   * <ul>
54   *   <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
55   *   <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
56   * </ul>
57   * <p>
58   * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
59   * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
60   * Q codec. The strict mode has been added in 1.10.
61   * </p>
62   * <p>
63   * This class is immutable and thread-safe.
64   * </p>
65   *
66   * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
67   *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
68   *
69   * @since 1.3
70   */
71  public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
72      /**
73       * BitSet of printable characters as defined in RFC 1521.
74       */
75      private static final BitSet PRINTABLE_CHARS = new BitSet(256);
76  
77      private static final byte ESCAPE_CHAR = '=';
78  
79      private static final byte TAB = 9;
80  
81      private static final byte SPACE = 32;
82  
83      private static final byte CR = 13;
84  
85      private static final byte LF = 10;
86  
87      /**
88       * Minimum length required for the byte arrays used by encodeQuotedPrintable method
89       */
90      private static final int MIN_BYTES = 3;
91  
92      /**
93       * Safe line length for quoted printable encoded text.
94       */
95      private static final int SAFE_LENGTH = 73;
96  
97      // Static initializer for printable chars collection
98      static {
99          // alpha characters
100         for (int i = 33; i <= 60; i++) {
101             PRINTABLE_CHARS.set(i);
102         }
103         for (int i = 62; i <= 126; i++) {
104             PRINTABLE_CHARS.set(i);
105         }
106         PRINTABLE_CHARS.set(TAB);
107         PRINTABLE_CHARS.set(SPACE);
108     }
109 
110     /**
111      * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
112      * back to their original representation.
113      * <p>
114      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
115      * defined in RFC 1521.
116      * </p>
117      *
118      * @param bytes
119      *            array of quoted-printable characters
120      * @return array of original bytes
121      * @throws DecoderException
122      *             Thrown if quoted-printable decoding is unsuccessful
123      */
124     public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
125         if (bytes == null) {
126             return null;
127         }
128         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
129         for (int i = 0; i < bytes.length; i++) {
130             final int b = bytes[i];
131             if (b == ESCAPE_CHAR) {
132                 try {
133                     // if the next octet is a CR we have found a soft line break
134                     if (bytes[++i] == CR) {
135                         continue;
136                     }
137                     final int u = Utils.digit16(bytes[i]);
138                     final int l = Utils.digit16(bytes[++i]);
139                     buffer.write((char) ((u << 4) + l));
140                 } catch (final ArrayIndexOutOfBoundsException e) {
141                     throw new DecoderException("Invalid quoted-printable encoding", e);
142                 }
143             } else if (b != CR && b != LF) {
144                 // every other octet is appended except for CR & LF
145                 buffer.write(b);
146             }
147         }
148         return buffer.toByteArray();
149     }
150 
151     /**
152      * Write a byte to the buffer.
153      *
154      * @param b
155      *            byte to write
156      * @param encode
157      *            indicates whether the octet shall be encoded
158      * @param buffer
159      *            the buffer to write to
160      * @return the number of bytes that have been written to the buffer
161      */
162     private static int encodeByte(final int b, final boolean encode,
163                                   final ByteArrayOutputStream buffer) {
164         if (encode) {
165             return encodeQuotedPrintable(b, buffer);
166         }
167         buffer.write(b);
168         return 1;
169     }
170 
171     /**
172      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
173      * <p>
174      * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
175      * RFC 1521 and is suitable for encoding binary data and unformatted text.
176      * </p>
177      *
178      * @param printable
179      *            bitset of characters deemed quoted-printable
180      * @param bytes
181      *            array of bytes to be encoded
182      * @return array of bytes containing quoted-printable data
183      */
184     public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
185         return encodeQuotedPrintable(printable, bytes, false);
186     }
187 
188     /**
189      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
190      * <p>
191      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
192      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
193      * RFC 1521 and is suitable for encoding binary data and unformatted text.
194      * </p>
195      *
196      * @param printable
197      *            bitset of characters deemed quoted-printable
198      * @param bytes
199      *            array of bytes to be encoded
200      * @param strict
201      *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
202      * @return array of bytes containing quoted-printable data
203      * @since 1.10
204      */
205     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
206         if (bytes == null) {
207             return null;
208         }
209         if (printable == null) {
210             printable = PRINTABLE_CHARS;
211         }
212         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
213         final int bytesLength = bytes.length;
214 
215         if (strict) {
216             if (bytesLength < MIN_BYTES) {
217                 return null;
218             }
219 
220             int pos = 1;
221             // encode up to buffer.length - 3, the last three octets will be treated
222             // separately for simplification of note #3
223             for (int i = 0; i < bytesLength - 3; i++) {
224                 final int b = getUnsignedOctet(i, bytes);
225                 if (pos < SAFE_LENGTH) {
226                     // up to this length it is safe to add any byte, encoded or not
227                     pos += encodeByte(b, !printable.get(b), buffer);
228                 } else {
229                     // rule #3: whitespace at the end of a line *must* be encoded
230                     encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
231 
232                     // rule #5: soft line break
233                     buffer.write(ESCAPE_CHAR);
234                     buffer.write(CR);
235                     buffer.write(LF);
236                     pos = 1;
237                 }
238             }
239 
240             // rule #3: whitespace at the end of a line *must* be encoded
241             // if we would do a soft break line after this octet, encode whitespace
242             int b = getUnsignedOctet(bytesLength - 3, bytes);
243             boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
244             pos += encodeByte(b, encode, buffer);
245 
246             // note #3: '=' *must not* be the ultimate or penultimate character
247             // simplification: if < 6 bytes left, do a soft line break as we may need
248             //                 exactly 6 bytes space for the last 2 bytes
249             if (pos > SAFE_LENGTH - 2) {
250                 buffer.write(ESCAPE_CHAR);
251                 buffer.write(CR);
252                 buffer.write(LF);
253             }
254             for (int i = bytesLength - 2; i < bytesLength; i++) {
255                 b = getUnsignedOctet(i, bytes);
256                 // rule #3: trailing whitespace shall be encoded
257                 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
258                 encodeByte(b, encode, buffer);
259             }
260         } else {
261             for (final byte c : bytes) {
262                 int b = c;
263                 if (b < 0) {
264                     b = 256 + b;
265                 }
266                 if (printable.get(b)) {
267                     buffer.write(b);
268                 } else {
269                     encodeQuotedPrintable(b, buffer);
270                 }
271             }
272         }
273         return buffer.toByteArray();
274     }
275 
276     /**
277      * Encodes byte into its quoted-printable representation.
278      *
279      * @param b
280      *            byte to encode
281      * @param buffer
282      *            the buffer to write to
283      * @return The number of bytes written to the {@code buffer}
284      */
285     private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
286         buffer.write(ESCAPE_CHAR);
287         final char hex1 = Utils.hexDigit(b >> 4);
288         final char hex2 = Utils.hexDigit(b);
289         buffer.write(hex1);
290         buffer.write(hex2);
291         return 3;
292     }
293 
294     /**
295      * Gets the byte at position {@code index} of the byte array and
296      * make sure it is unsigned.
297      *
298      * @param index
299      *            position in the array
300      * @param bytes
301      *            the byte array
302      * @return the unsigned octet at position {@code index} from the array
303      */
304     private static int getUnsignedOctet(final int index, final byte[] bytes) {
305         int b = bytes[index];
306         if (b < 0) {
307             b = 256 + b;
308         }
309         return b;
310     }
311 
312     /**
313      * Checks whether the given byte is whitespace.
314      *
315      * @param b
316      *            byte to be checked
317      * @return {@code true} if the byte is either a space or tab character
318      */
319     private static boolean isWhitespace(final int b) {
320         return b == SPACE || b == TAB;
321     }
322 
323     /**
324      * The default Charset used for string decoding and encoding.
325      */
326     private final Charset charset;
327 
328     /**
329      * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
330      */
331     private final boolean strict;
332 
333     /**
334      * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
335      */
336     public QuotedPrintableCodec() {
337         this(StandardCharsets.UTF_8, false);
338     }
339 
340     /**
341      * Constructor which allows for the selection of the strict mode.
342      *
343      * @param strict
344      *            if {@code true}, soft line breaks will be used
345      * @since 1.10
346      */
347     public QuotedPrintableCodec(final boolean strict) {
348         this(StandardCharsets.UTF_8, strict);
349     }
350 
351     /**
352      * Constructor which allows for the selection of a default Charset.
353      *
354      * @param charset
355      *            the default string Charset to use.
356      * @since 1.7
357      */
358     public QuotedPrintableCodec(final Charset charset) {
359         this(charset, false);
360     }
361 
362     /**
363      * Constructor which allows for the selection of a default Charset and strict mode.
364      *
365      * @param charset
366      *            the default string Charset to use.
367      * @param strict
368      *            if {@code true}, soft line breaks will be used
369      * @since 1.10
370      */
371     public QuotedPrintableCodec(final Charset charset, final boolean strict) {
372         this.charset = charset;
373         this.strict = strict;
374     }
375 
376     /**
377      * Constructor which allows for the selection of a default Charset.
378      *
379      * @param charsetName
380      *            the default string Charset to use.
381      * @throws UnsupportedCharsetException
382      *             If no support for the named Charset is available
383      *             in this instance of the Java virtual machine
384      * @throws IllegalArgumentException
385      *             If the given charsetName is null
386      * @throws IllegalCharsetNameException
387      *             If the given Charset name is illegal
388      *
389      * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
390      */
391     public QuotedPrintableCodec(final String charsetName)
392             throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
393         this(Charset.forName(charsetName), false);
394     }
395 
396     /**
397      * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
398      * back to their original representation.
399      * <p>
400      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
401      * defined in RFC 1521.
402      * </p>
403      *
404      * @param bytes
405      *            array of quoted-printable characters
406      * @return array of original bytes
407      * @throws DecoderException
408      *             Thrown if quoted-printable decoding is unsuccessful
409      */
410     @Override
411     public byte[] decode(final byte[] bytes) throws DecoderException {
412         return decodeQuotedPrintable(bytes);
413     }
414 
415     /**
416      * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
417      * representation.
418      *
419      * @param obj
420      *            quoted-printable object to convert into its original form
421      * @return original object
422      * @throws DecoderException
423      *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
424      *             condition is encountered during the decode process.
425      */
426     @Override
427     public Object decode(final Object obj) throws DecoderException {
428         if (obj == null) {
429             return null;
430         }
431         if (obj instanceof byte[]) {
432             return decode((byte[]) obj);
433         }
434         if (obj instanceof String) {
435             return decode((String) obj);
436         }
437         throw new DecoderException("Objects of type " +
438               obj.getClass().getName() +
439               " cannot be quoted-printable decoded");
440     }
441 
442     /**
443      * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
444      * converted back to their original representation.
445      *
446      * @param sourceStr
447      *            quoted-printable string to convert into its original form
448      * @return original string
449      * @throws DecoderException
450      *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
451      * @see #getCharset()
452      */
453     @Override
454     public String decode(final String sourceStr) throws DecoderException {
455         return this.decode(sourceStr, this.getCharset());
456     }
457 
458     /**
459      * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
460      * are converted back to their original representation.
461      *
462      * @param sourceStr
463      *            quoted-printable string to convert into its original form
464      * @param sourceCharset
465      *            the original string Charset
466      * @return original string
467      * @throws DecoderException
468      *             Thrown if quoted-printable decoding is unsuccessful
469      * @since 1.7
470      */
471     public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
472         if (sourceStr == null) {
473             return null;
474         }
475         return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
476     }
477 
478     /**
479      * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
480      * are converted back to their original representation.
481      *
482      * @param sourceStr
483      *            quoted-printable string to convert into its original form
484      * @param sourceCharset
485      *            the original string Charset
486      * @return original string
487      * @throws DecoderException
488      *             Thrown if quoted-printable decoding is unsuccessful
489      * @throws UnsupportedEncodingException
490      *             Thrown if Charset is not supported
491      */
492     public String decode(final String sourceStr, final String sourceCharset)
493             throws DecoderException, UnsupportedEncodingException {
494         if (sourceStr == null) {
495             return null;
496         }
497         return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
498     }
499 
500     /**
501      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
502      * <p>
503      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
504      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
505      * RFC 1521 and is suitable for encoding binary data and unformatted text.
506      * </p>
507      *
508      * @param bytes
509      *            array of bytes to be encoded
510      * @return array of bytes containing quoted-printable data
511      */
512     @Override
513     public byte[] encode(final byte[] bytes) {
514         return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
515     }
516 
517     /**
518      * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
519      *
520      * @param obj
521      *            string to convert to a quoted-printable form
522      * @return quoted-printable object
523      * @throws EncoderException
524      *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
525      *             unsuccessful
526      */
527     @Override
528     public Object encode(final Object obj) throws EncoderException {
529         if (obj == null) {
530             return null;
531         }
532         if (obj instanceof byte[]) {
533             return encode((byte[]) obj);
534         }
535         if (obj instanceof String) {
536             return encode((String) obj);
537         }
538         throw new EncoderException("Objects of type " +
539               obj.getClass().getName() +
540               " cannot be quoted-printable encoded");
541     }
542 
543     /**
544      * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
545      * <p>
546      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
547      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
548      * RFC 1521 and is suitable for encoding binary data and unformatted text.
549      * </p>
550      *
551      * @param sourceStr
552      *            string to convert to quoted-printable form
553      * @return quoted-printable string
554      * @throws EncoderException
555      *             Thrown if quoted-printable encoding is unsuccessful
556      *
557      * @see #getCharset()
558      */
559     @Override
560     public String encode(final String sourceStr) throws EncoderException {
561         return this.encode(sourceStr, getCharset());
562     }
563 
564     /**
565      * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
566      * <p>
567      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
568      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
569      * RFC 1521 and is suitable for encoding binary data and unformatted text.
570      * </p>
571      *
572      * @param sourceStr
573      *            string to convert to quoted-printable form
574      * @param sourceCharset
575      *            the Charset for sourceStr
576      * @return quoted-printable string
577      * @since 1.7
578      */
579     public String encode(final String sourceStr, final Charset sourceCharset) {
580         if (sourceStr == null) {
581             return null;
582         }
583         return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
584     }
585 
586     /**
587      * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
588      * <p>
589      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
590      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
591      * RFC 1521 and is suitable for encoding binary data and unformatted text.
592      * </p>
593      *
594      * @param sourceStr
595      *            string to convert to quoted-printable form
596      * @param sourceCharset
597      *            the Charset for sourceStr
598      * @return quoted-printable string
599      * @throws UnsupportedEncodingException
600      *             Thrown if the Charset is not supported
601      */
602     public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
603         if (sourceStr == null) {
604             return null;
605         }
606         return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
607     }
608 
609     /**
610      * Gets the default Charset name used for string decoding and encoding.
611      *
612      * @return the default Charset name
613      * @since 1.7
614      */
615     public Charset getCharset() {
616         return this.charset;
617     }
618 
619     /**
620      * Gets the default Charset name used for string decoding and encoding.
621      *
622      * @return the default Charset name
623      */
624     public String getDefaultCharset() {
625         return this.charset.name();
626     }
627 }