/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
017
018package org.apache.commons.codec.net;
019
020import java.io.ByteArrayOutputStream;
021import java.io.UnsupportedEncodingException;
022import java.nio.charset.Charset;
023import java.nio.charset.IllegalCharsetNameException;
024import java.nio.charset.StandardCharsets;
025import java.nio.charset.UnsupportedCharsetException;
026import java.util.BitSet;
027
028import org.apache.commons.codec.BinaryDecoder;
029import org.apache.commons.codec.BinaryEncoder;
030import org.apache.commons.codec.DecoderException;
031import org.apache.commons.codec.EncoderException;
032import org.apache.commons.codec.StringDecoder;
033import org.apache.commons.codec.StringEncoder;
034import org.apache.commons.codec.binary.StringUtils;
035
036/**
037 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
038 * <p>
039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
040 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
041 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
042 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
043 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
044 * gateway.
045 * </p>
046 * <p>
047 * Note:
048 * </p>
049 * <p>
050 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
051 * quoted-printable spec:
052 * </p>
053 * <ul>
054 *   <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
055 *   <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
056 * </ul>
057 * <p>
058 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
059 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
060 * Q codec. The strict mode has been added in 1.10.
061 * </p>
062 * <p>
063 * This class is immutable and thread-safe.
064 * </p>
065 *
066 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
067 *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
068 *
069 * @since 1.3
070 */
071public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
072    /**
073     * BitSet of printable characters as defined in RFC 1521.
074     */
075    private static final BitSet PRINTABLE_CHARS = new BitSet(256);
076
077    private static final byte ESCAPE_CHAR = '=';
078
079    private static final byte TAB = 9;
080
081    private static final byte SPACE = 32;
082
083    private static final byte CR = 13;
084
085    private static final byte LF = 10;
086
087    /**
088     * Minimum length required for the byte arrays used by encodeQuotedPrintable method
089     */
090    private static final int MIN_BYTES = 3;
091
092    /**
093     * Safe line length for quoted printable encoded text.
094     */
095    private static final int SAFE_LENGTH = 73;
096
097    // Static initializer for printable chars collection
098    static {
099        // alpha characters
100        for (int i = 33; i <= 60; i++) {
101            PRINTABLE_CHARS.set(i);
102        }
103        for (int i = 62; i <= 126; i++) {
104            PRINTABLE_CHARS.set(i);
105        }
106        PRINTABLE_CHARS.set(TAB);
107        PRINTABLE_CHARS.set(SPACE);
108    }
109
110    /**
111     * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
112     * back to their original representation.
113     * <p>
114     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
115     * defined in RFC 1521.
116     * </p>
117     *
118     * @param bytes
119     *            array of quoted-printable characters
120     * @return array of original bytes
121     * @throws DecoderException
122     *             Thrown if quoted-printable decoding is unsuccessful
123     */
124    public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
125        if (bytes == null) {
126            return null;
127        }
128        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
129        for (int i = 0; i < bytes.length; i++) {
130            final int b = bytes[i];
131            if (b == ESCAPE_CHAR) {
132                try {
133                    // if the next octet is a CR we have found a soft line break
134                    if (bytes[++i] == CR) {
135                        continue;
136                    }
137                    final int u = Utils.digit16(bytes[i]);
138                    final int l = Utils.digit16(bytes[++i]);
139                    buffer.write((char) ((u << 4) + l));
140                } catch (final ArrayIndexOutOfBoundsException e) {
141                    throw new DecoderException("Invalid quoted-printable encoding", e);
142                }
143            } else if (b != CR && b != LF) {
144                // every other octet is appended except for CR & LF
145                buffer.write(b);
146            }
147        }
148        return buffer.toByteArray();
149    }
150
151    /**
152     * Encodes a byte in the buffer.
153     *
154     * @param b
155     *            byte to write
156     * @param encode
157     *            indicates whether the octet shall be encoded
158     * @param buffer
159     *            the buffer to write to
160     * @return the number of bytes that have been written to the buffer
161     */
162    private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) {
163        if (encode) {
164            return encodeQuotedPrintable(b, buffer);
165        }
166        buffer.write(b);
167        return 1;
168    }
169
170    /**
171     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
172     * <p>
173     * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
174     * RFC 1521 and is suitable for encoding binary data and unformatted text.
175     * </p>
176     *
177     * @param printable
178     *            bitset of characters deemed quoted-printable
179     * @param bytes
180     *            array of bytes to be encoded
181     * @return array of bytes containing quoted-printable data
182     */
183    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
184        return encodeQuotedPrintable(printable, bytes, false);
185    }
186
187    /**
188     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
189     * <p>
190     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
191     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
192     * RFC 1521 and is suitable for encoding binary data and unformatted text.
193     * </p>
194     *
195     * @param printable
196     *            bitset of characters deemed quoted-printable
197     * @param bytes
198     *            array of bytes to be encoded
199     * @param strict
200     *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
201     * @return array of bytes containing quoted-printable data
202     * @since 1.10
203     */
204    public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
205        if (bytes == null) {
206            return null;
207        }
208        if (printable == null) {
209            printable = PRINTABLE_CHARS;
210        }
211        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
212        final int bytesLength = bytes.length;
213
214        if (strict) {
215            if (bytesLength < MIN_BYTES) {
216                return null;
217            }
218
219            int pos = 1;
220            // encode up to buffer.length - 3, the last three octets will be treated
221            // separately for simplification of note #3
222            for (int i = 0; i < bytesLength - 3; i++) {
223                final int b = getUnsignedOctet(i, bytes);
224                if (pos < SAFE_LENGTH) {
225                    // up to this length it is safe to add any byte, encoded or not
226                    pos += encodeByte(b, !printable.get(b), buffer);
227                } else {
228                    // rule #3: whitespace at the end of a line *must* be encoded
229                    encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
230
231                    // rule #5: soft line break
232                    buffer.write(ESCAPE_CHAR);
233                    buffer.write(CR);
234                    buffer.write(LF);
235                    pos = 1;
236                }
237            }
238
239            // rule #3: whitespace at the end of a line *must* be encoded
240            // if we would do a soft break line after this octet, encode whitespace
241            int b = getUnsignedOctet(bytesLength - 3, bytes);
242            boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
243            pos += encodeByte(b, encode, buffer);
244
245            // note #3: '=' *must not* be the ultimate or penultimate character
246            // simplification: if < 6 bytes left, do a soft line break as we may need
247            //                 exactly 6 bytes space for the last 2 bytes
248            if (pos > SAFE_LENGTH - 2) {
249                buffer.write(ESCAPE_CHAR);
250                buffer.write(CR);
251                buffer.write(LF);
252            }
253            for (int i = bytesLength - 2; i < bytesLength; i++) {
254                b = getUnsignedOctet(i, bytes);
255                // rule #3: trailing whitespace shall be encoded
256                encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
257                encodeByte(b, encode, buffer);
258            }
259        } else {
260            for (final byte c : bytes) {
261                int b = c;
262                if (b < 0) {
263                    b = 256 + b;
264                }
265                if (printable.get(b)) {
266                    buffer.write(b);
267                } else {
268                    encodeQuotedPrintable(b, buffer);
269                }
270            }
271        }
272        return buffer.toByteArray();
273    }
274
275    /**
276     * Encodes byte into its quoted-printable representation.
277     *
278     * @param b
279     *            byte to encode
280     * @param buffer
281     *            the buffer to write to
282     * @return The number of bytes written to the {@code buffer}
283     */
284    private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
285        buffer.write(ESCAPE_CHAR);
286        final char hex1 = Utils.hexDigit(b >> 4);
287        final char hex2 = Utils.hexDigit(b);
288        buffer.write(hex1);
289        buffer.write(hex2);
290        return 3;
291    }
292
293    /**
294     * Gets the byte at position {@code index} of the byte array and
295     * make sure it is unsigned.
296     *
297     * @param index
298     *            position in the array
299     * @param bytes
300     *            the byte array
301     * @return the unsigned octet at position {@code index} from the array
302     */
303    private static int getUnsignedOctet(final int index, final byte[] bytes) {
304        int b = bytes[index];
305        if (b < 0) {
306            b = 256 + b;
307        }
308        return b;
309    }
310
311    /**
312     * Checks whether the given byte is whitespace.
313     *
314     * @param b
315     *            byte to be checked
316     * @return {@code true} if the byte is either a space or tab character
317     */
318    private static boolean isWhitespace(final int b) {
319        return b == SPACE || b == TAB;
320    }
321
322    /**
323     * The default Charset used for string decoding and encoding.
324     */
325    private final Charset charset;
326
327    /**
328     * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
329     */
330    private final boolean strict;
331
332    /**
333     * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
334     */
335    public QuotedPrintableCodec() {
336        this(StandardCharsets.UTF_8, false);
337    }
338
339    /**
340     * Constructor which allows for the selection of the strict mode.
341     *
342     * @param strict
343     *            if {@code true}, soft line breaks will be used
344     * @since 1.10
345     */
346    public QuotedPrintableCodec(final boolean strict) {
347        this(StandardCharsets.UTF_8, strict);
348    }
349
350    /**
351     * Constructor which allows for the selection of a default Charset.
352     *
353     * @param charset
354     *            the default string Charset to use.
355     * @since 1.7
356     */
357    public QuotedPrintableCodec(final Charset charset) {
358        this(charset, false);
359    }
360
361    /**
362     * Constructor which allows for the selection of a default Charset and strict mode.
363     *
364     * @param charset
365     *            the default string Charset to use.
366     * @param strict
367     *            if {@code true}, soft line breaks will be used
368     * @since 1.10
369     */
370    public QuotedPrintableCodec(final Charset charset, final boolean strict) {
371        this.charset = charset;
372        this.strict = strict;
373    }
374
375    /**
376     * Constructor which allows for the selection of a default Charset.
377     *
378     * @param charsetName
379     *            the default string Charset to use.
380     * @throws UnsupportedCharsetException
381     *             If no support for the named Charset is available
382     *             in this instance of the Java virtual machine
383     * @throws IllegalArgumentException
384     *             If the given charsetName is null
385     * @throws IllegalCharsetNameException
386     *             If the given Charset name is illegal
387     *
388     * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
389     */
390    public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
391        this(Charset.forName(charsetName), false);
392    }
393
394    /**
395     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
396     * back to their original representation.
397     * <p>
398     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
399     * defined in RFC 1521.
400     * </p>
401     *
402     * @param bytes
403     *            array of quoted-printable characters
404     * @return array of original bytes
405     * @throws DecoderException
406     *             Thrown if quoted-printable decoding is unsuccessful
407     */
408    @Override
409    public byte[] decode(final byte[] bytes) throws DecoderException {
410        return decodeQuotedPrintable(bytes);
411    }
412
413    /**
414     * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
415     * representation.
416     *
417     * @param obj
418     *            quoted-printable object to convert into its original form
419     * @return original object
420     * @throws DecoderException
421     *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
422     *             condition is encountered during the decode process.
423     */
424    @Override
425    public Object decode(final Object obj) throws DecoderException {
426        if (obj == null) {
427            return null;
428        }
429        if (obj instanceof byte[]) {
430            return decode((byte[]) obj);
431        }
432        if (obj instanceof String) {
433            return decode((String) obj);
434        }
435        throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded");
436    }
437
438    /**
439     * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
440     * converted back to their original representation.
441     *
442     * @param sourceStr
443     *            quoted-printable string to convert into its original form
444     * @return original string
445     * @throws DecoderException
446     *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
447     * @see #getCharset()
448     */
449    @Override
450    public String decode(final String sourceStr) throws DecoderException {
451        return this.decode(sourceStr, this.getCharset());
452    }
453
454    /**
455     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
456     * are converted back to their original representation.
457     *
458     * @param sourceStr
459     *            quoted-printable string to convert into its original form
460     * @param sourceCharset
461     *            the original string Charset
462     * @return original string
463     * @throws DecoderException
464     *             Thrown if quoted-printable decoding is unsuccessful
465     * @since 1.7
466     */
467    public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
468        if (sourceStr == null) {
469            return null;
470        }
471        return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
472    }
473
474    /**
475     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
476     * are converted back to their original representation.
477     *
478     * @param sourceStr
479     *            quoted-printable string to convert into its original form
480     * @param sourceCharset
481     *            the original string Charset
482     * @return original string
483     * @throws DecoderException
484     *             Thrown if quoted-printable decoding is unsuccessful
485     * @throws UnsupportedEncodingException
486     *             Thrown if Charset is not supported
487     */
488    public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException {
489        if (sourceStr == null) {
490            return null;
491        }
492        return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
493    }
494
495    /**
496     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
497     * <p>
498     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
499     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
500     * RFC 1521 and is suitable for encoding binary data and unformatted text.
501     * </p>
502     *
503     * @param bytes
504     *            array of bytes to be encoded
505     * @return array of bytes containing quoted-printable data
506     */
507    @Override
508    public byte[] encode(final byte[] bytes) {
509        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
510    }
511
512    /**
513     * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
514     *
515     * @param obj
516     *            string to convert to a quoted-printable form
517     * @return quoted-printable object
518     * @throws EncoderException
519     *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
520     *             unsuccessful
521     */
522    @Override
523    public Object encode(final Object obj) throws EncoderException {
524        if (obj == null) {
525            return null;
526        }
527        if (obj instanceof byte[]) {
528            return encode((byte[]) obj);
529        }
530        if (obj instanceof String) {
531            return encode((String) obj);
532        }
533        throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded");
534    }
535
536    /**
537     * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
538     * <p>
539     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
540     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
541     * RFC 1521 and is suitable for encoding binary data and unformatted text.
542     * </p>
543     *
544     * @param sourceStr
545     *            string to convert to quoted-printable form
546     * @return quoted-printable string
547     * @throws EncoderException
548     *             Thrown if quoted-printable encoding is unsuccessful
549     *
550     * @see #getCharset()
551     */
552    @Override
553    public String encode(final String sourceStr) throws EncoderException {
554        return encode(sourceStr, getCharset());
555    }
556
557    /**
558     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
559     * <p>
560     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
561     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
562     * RFC 1521 and is suitable for encoding binary data and unformatted text.
563     * </p>
564     *
565     * @param sourceStr
566     *            string to convert to quoted-printable form
567     * @param sourceCharset
568     *            the Charset for sourceStr
569     * @return quoted-printable string
570     * @since 1.7
571     */
572    public String encode(final String sourceStr, final Charset sourceCharset) {
573        if (sourceStr == null) {
574            return null;
575        }
576        return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
577    }
578
579    /**
580     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
581     * <p>
582     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
583     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
584     * RFC 1521 and is suitable for encoding binary data and unformatted text.
585     * </p>
586     *
587     * @param sourceStr
588     *            string to convert to quoted-printable form
589     * @param sourceCharset
590     *            the Charset for sourceStr
591     * @return quoted-printable string
592     * @throws UnsupportedEncodingException
593     *             Thrown if the Charset is not supported
594     */
595    public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
596        if (sourceStr == null) {
597            return null;
598        }
599        return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
600    }
601
602    /**
603     * Gets the default Charset name used for string decoding and encoding.
604     *
605     * @return the default Charset name
606     * @since 1.7
607     */
608    public Charset getCharset() {
609        return this.charset;
610    }
611
612    /**
613     * Gets the default Charset name used for string decoding and encoding.
614     *
615     * @return the default Charset name
616     */
617    public String getDefaultCharset() {
618        return this.charset.name();
619    }
620}