001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.net;
019
020import java.io.ByteArrayOutputStream;
021import java.io.UnsupportedEncodingException;
022import java.nio.charset.Charset;
023import java.nio.charset.IllegalCharsetNameException;
024import java.nio.charset.StandardCharsets;
025import java.nio.charset.UnsupportedCharsetException;
026import java.util.BitSet;
027
028import org.apache.commons.codec.BinaryDecoder;
029import org.apache.commons.codec.BinaryEncoder;
030import org.apache.commons.codec.DecoderException;
031import org.apache.commons.codec.EncoderException;
032import org.apache.commons.codec.StringDecoder;
033import org.apache.commons.codec.StringEncoder;
034import org.apache.commons.codec.binary.StringUtils;
035
036/**
037 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
038 * <p>
039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
040 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
041 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
042 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
043 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
044 * gateway.
045 * </p>
046 * <p>
047 * Note:
048 * </p>
049 * <p>
050 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
051 * quoted-printable spec:
052 * </p>
053 * <ul>
054 *   <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
055 *   <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
056 * </ul>
057 * <p>
058 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
059 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
060 * Q codec. The strict mode has been added in 1.10.
061 * </p>
062 * <p>
063 * This class is immutable and thread-safe.
064 * </p>
065 *
066 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
067 *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
068 *
069 * @since 1.3
070 */
071public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
072    /**
073     * BitSet of printable characters as defined in RFC 1521.
074     */
075    private static final BitSet PRINTABLE_CHARS = new BitSet(256);
076
077    private static final byte ESCAPE_CHAR = '=';
078
079    private static final byte TAB = 9;
080
081    private static final byte SPACE = 32;
082
083    private static final byte CR = 13;
084
085    private static final byte LF = 10;
086
087    /**
088     * Minimum length required for the byte arrays used by encodeQuotedPrintable method
089     */
090    private static final int MIN_BYTES = 3;
091
092    /**
093     * Safe line length for quoted printable encoded text.
094     */
095    private static final int SAFE_LENGTH = 73;
096
097    // Static initializer for printable chars collection
098    static {
099        // alpha characters
100        for (int i = 33; i <= 60; i++) {
101            PRINTABLE_CHARS.set(i);
102        }
103        for (int i = 62; i <= 126; i++) {
104            PRINTABLE_CHARS.set(i);
105        }
106        PRINTABLE_CHARS.set(TAB);
107        PRINTABLE_CHARS.set(SPACE);
108    }
109
110    /**
111     * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
112     * back to their original representation.
113     * <p>
114     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
115     * defined in RFC 1521.
116     * </p>
117     *
118     * @param bytes
119     *            array of quoted-printable characters
120     * @return array of original bytes
121     * @throws DecoderException
122     *             Thrown if quoted-printable decoding is unsuccessful
123     */
124    public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
125        if (bytes == null) {
126            return null;
127        }
128        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
129        for (int i = 0; i < bytes.length; i++) {
130            final int b = bytes[i];
131            if (b == ESCAPE_CHAR) {
132                try {
133                    // if the next octet is a CR we have found a soft line break
134                    if (bytes[++i] == CR) {
135                        continue;
136                    }
137                    final int u = Utils.digit16(bytes[i]);
138                    final int l = Utils.digit16(bytes[++i]);
139                    buffer.write((char) ((u << 4) + l));
140                } catch (final ArrayIndexOutOfBoundsException e) {
141                    throw new DecoderException("Invalid quoted-printable encoding", e);
142                }
143            } else if (b != CR && b != LF) {
144                // every other octet is appended except for CR & LF
145                buffer.write(b);
146            }
147        }
148        return buffer.toByteArray();
149    }
150
151    /**
152     * Write a byte to the buffer.
153     *
154     * @param b
155     *            byte to write
156     * @param encode
157     *            indicates whether the octet shall be encoded
158     * @param buffer
159     *            the buffer to write to
160     * @return the number of bytes that have been written to the buffer
161     */
162    private static int encodeByte(final int b, final boolean encode,
163                                  final ByteArrayOutputStream buffer) {
164        if (encode) {
165            return encodeQuotedPrintable(b, buffer);
166        }
167        buffer.write(b);
168        return 1;
169    }
170
171    /**
172     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
173     * <p>
174     * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
175     * RFC 1521 and is suitable for encoding binary data and unformatted text.
176     * </p>
177     *
178     * @param printable
179     *            bitset of characters deemed quoted-printable
180     * @param bytes
181     *            array of bytes to be encoded
182     * @return array of bytes containing quoted-printable data
183     */
184    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
185        return encodeQuotedPrintable(printable, bytes, false);
186    }
187
188    /**
189     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
190     * <p>
191     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
192     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
193     * RFC 1521 and is suitable for encoding binary data and unformatted text.
194     * </p>
195     *
196     * @param printable
197     *            bitset of characters deemed quoted-printable
198     * @param bytes
199     *            array of bytes to be encoded
200     * @param strict
201     *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
202     * @return array of bytes containing quoted-printable data
203     * @since 1.10
204     */
205    public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
206        if (bytes == null) {
207            return null;
208        }
209        if (printable == null) {
210            printable = PRINTABLE_CHARS;
211        }
212        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
213        final int bytesLength = bytes.length;
214
215        if (strict) {
216            if (bytesLength < MIN_BYTES) {
217                return null;
218            }
219
220            int pos = 1;
221            // encode up to buffer.length - 3, the last three octets will be treated
222            // separately for simplification of note #3
223            for (int i = 0; i < bytesLength - 3; i++) {
224                final int b = getUnsignedOctet(i, bytes);
225                if (pos < SAFE_LENGTH) {
226                    // up to this length it is safe to add any byte, encoded or not
227                    pos += encodeByte(b, !printable.get(b), buffer);
228                } else {
229                    // rule #3: whitespace at the end of a line *must* be encoded
230                    encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
231
232                    // rule #5: soft line break
233                    buffer.write(ESCAPE_CHAR);
234                    buffer.write(CR);
235                    buffer.write(LF);
236                    pos = 1;
237                }
238            }
239
240            // rule #3: whitespace at the end of a line *must* be encoded
241            // if we would do a soft break line after this octet, encode whitespace
242            int b = getUnsignedOctet(bytesLength - 3, bytes);
243            boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
244            pos += encodeByte(b, encode, buffer);
245
246            // note #3: '=' *must not* be the ultimate or penultimate character
247            // simplification: if < 6 bytes left, do a soft line break as we may need
248            //                 exactly 6 bytes space for the last 2 bytes
249            if (pos > SAFE_LENGTH - 2) {
250                buffer.write(ESCAPE_CHAR);
251                buffer.write(CR);
252                buffer.write(LF);
253            }
254            for (int i = bytesLength - 2; i < bytesLength; i++) {
255                b = getUnsignedOctet(i, bytes);
256                // rule #3: trailing whitespace shall be encoded
257                encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
258                encodeByte(b, encode, buffer);
259            }
260        } else {
261            for (final byte c : bytes) {
262                int b = c;
263                if (b < 0) {
264                    b = 256 + b;
265                }
266                if (printable.get(b)) {
267                    buffer.write(b);
268                } else {
269                    encodeQuotedPrintable(b, buffer);
270                }
271            }
272        }
273        return buffer.toByteArray();
274    }
275
276    /**
277     * Encodes byte into its quoted-printable representation.
278     *
279     * @param b
280     *            byte to encode
281     * @param buffer
282     *            the buffer to write to
283     * @return The number of bytes written to the {@code buffer}
284     */
285    private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
286        buffer.write(ESCAPE_CHAR);
287        final char hex1 = Utils.hexDigit(b >> 4);
288        final char hex2 = Utils.hexDigit(b);
289        buffer.write(hex1);
290        buffer.write(hex2);
291        return 3;
292    }
293
294    /**
295     * Gets the byte at position {@code index} of the byte array and
296     * make sure it is unsigned.
297     *
298     * @param index
299     *            position in the array
300     * @param bytes
301     *            the byte array
302     * @return the unsigned octet at position {@code index} from the array
303     */
304    private static int getUnsignedOctet(final int index, final byte[] bytes) {
305        int b = bytes[index];
306        if (b < 0) {
307            b = 256 + b;
308        }
309        return b;
310    }
311
312    /**
313     * Checks whether the given byte is whitespace.
314     *
315     * @param b
316     *            byte to be checked
317     * @return {@code true} if the byte is either a space or tab character
318     */
319    private static boolean isWhitespace(final int b) {
320        return b == SPACE || b == TAB;
321    }
322
323    /**
324     * The default Charset used for string decoding and encoding.
325     */
326    private final Charset charset;
327
328    /**
329     * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
330     */
331    private final boolean strict;
332
333    /**
334     * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
335     */
336    public QuotedPrintableCodec() {
337        this(StandardCharsets.UTF_8, false);
338    }
339
340    /**
341     * Constructor which allows for the selection of the strict mode.
342     *
343     * @param strict
344     *            if {@code true}, soft line breaks will be used
345     * @since 1.10
346     */
347    public QuotedPrintableCodec(final boolean strict) {
348        this(StandardCharsets.UTF_8, strict);
349    }
350
351    /**
352     * Constructor which allows for the selection of a default Charset.
353     *
354     * @param charset
355     *            the default string Charset to use.
356     * @since 1.7
357     */
358    public QuotedPrintableCodec(final Charset charset) {
359        this(charset, false);
360    }
361
362    /**
363     * Constructor which allows for the selection of a default Charset and strict mode.
364     *
365     * @param charset
366     *            the default string Charset to use.
367     * @param strict
368     *            if {@code true}, soft line breaks will be used
369     * @since 1.10
370     */
371    public QuotedPrintableCodec(final Charset charset, final boolean strict) {
372        this.charset = charset;
373        this.strict = strict;
374    }
375
376    /**
377     * Constructor which allows for the selection of a default Charset.
378     *
379     * @param charsetName
380     *            the default string Charset to use.
381     * @throws UnsupportedCharsetException
382     *             If no support for the named Charset is available
383     *             in this instance of the Java virtual machine
384     * @throws IllegalArgumentException
385     *             If the given charsetName is null
386     * @throws IllegalCharsetNameException
387     *             If the given Charset name is illegal
388     *
389     * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
390     */
391    public QuotedPrintableCodec(final String charsetName)
392            throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
393        this(Charset.forName(charsetName), false);
394    }
395
396    /**
397     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
398     * back to their original representation.
399     * <p>
400     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
401     * defined in RFC 1521.
402     * </p>
403     *
404     * @param bytes
405     *            array of quoted-printable characters
406     * @return array of original bytes
407     * @throws DecoderException
408     *             Thrown if quoted-printable decoding is unsuccessful
409     */
410    @Override
411    public byte[] decode(final byte[] bytes) throws DecoderException {
412        return decodeQuotedPrintable(bytes);
413    }
414
415    /**
416     * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
417     * representation.
418     *
419     * @param obj
420     *            quoted-printable object to convert into its original form
421     * @return original object
422     * @throws DecoderException
423     *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
424     *             condition is encountered during the decode process.
425     */
426    @Override
427    public Object decode(final Object obj) throws DecoderException {
428        if (obj == null) {
429            return null;
430        }
431        if (obj instanceof byte[]) {
432            return decode((byte[]) obj);
433        }
434        if (obj instanceof String) {
435            return decode((String) obj);
436        }
437        throw new DecoderException("Objects of type " +
438              obj.getClass().getName() +
439              " cannot be quoted-printable decoded");
440    }
441
442    /**
443     * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
444     * converted back to their original representation.
445     *
446     * @param sourceStr
447     *            quoted-printable string to convert into its original form
448     * @return original string
449     * @throws DecoderException
450     *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
451     * @see #getCharset()
452     */
453    @Override
454    public String decode(final String sourceStr) throws DecoderException {
455        return this.decode(sourceStr, this.getCharset());
456    }
457
458    /**
459     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
460     * are converted back to their original representation.
461     *
462     * @param sourceStr
463     *            quoted-printable string to convert into its original form
464     * @param sourceCharset
465     *            the original string Charset
466     * @return original string
467     * @throws DecoderException
468     *             Thrown if quoted-printable decoding is unsuccessful
469     * @since 1.7
470     */
471    public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
472        if (sourceStr == null) {
473            return null;
474        }
475        return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
476    }
477
478    /**
479     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
480     * are converted back to their original representation.
481     *
482     * @param sourceStr
483     *            quoted-printable string to convert into its original form
484     * @param sourceCharset
485     *            the original string Charset
486     * @return original string
487     * @throws DecoderException
488     *             Thrown if quoted-printable decoding is unsuccessful
489     * @throws UnsupportedEncodingException
490     *             Thrown if Charset is not supported
491     */
492    public String decode(final String sourceStr, final String sourceCharset)
493            throws DecoderException, UnsupportedEncodingException {
494        if (sourceStr == null) {
495            return null;
496        }
497        return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
498    }
499
500    /**
501     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
502     * <p>
503     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
504     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
505     * RFC 1521 and is suitable for encoding binary data and unformatted text.
506     * </p>
507     *
508     * @param bytes
509     *            array of bytes to be encoded
510     * @return array of bytes containing quoted-printable data
511     */
512    @Override
513    public byte[] encode(final byte[] bytes) {
514        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
515    }
516
517    /**
518     * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
519     *
520     * @param obj
521     *            string to convert to a quoted-printable form
522     * @return quoted-printable object
523     * @throws EncoderException
524     *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
525     *             unsuccessful
526     */
527    @Override
528    public Object encode(final Object obj) throws EncoderException {
529        if (obj == null) {
530            return null;
531        }
532        if (obj instanceof byte[]) {
533            return encode((byte[]) obj);
534        }
535        if (obj instanceof String) {
536            return encode((String) obj);
537        }
538        throw new EncoderException("Objects of type " +
539              obj.getClass().getName() +
540              " cannot be quoted-printable encoded");
541    }
542
543    /**
544     * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
545     * <p>
546     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
547     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
548     * RFC 1521 and is suitable for encoding binary data and unformatted text.
549     * </p>
550     *
551     * @param sourceStr
552     *            string to convert to quoted-printable form
553     * @return quoted-printable string
554     * @throws EncoderException
555     *             Thrown if quoted-printable encoding is unsuccessful
556     *
557     * @see #getCharset()
558     */
559    @Override
560    public String encode(final String sourceStr) throws EncoderException {
561        return this.encode(sourceStr, getCharset());
562    }
563
564    /**
565     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
566     * <p>
567     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
568     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
569     * RFC 1521 and is suitable for encoding binary data and unformatted text.
570     * </p>
571     *
572     * @param sourceStr
573     *            string to convert to quoted-printable form
574     * @param sourceCharset
575     *            the Charset for sourceStr
576     * @return quoted-printable string
577     * @since 1.7
578     */
579    public String encode(final String sourceStr, final Charset sourceCharset) {
580        if (sourceStr == null) {
581            return null;
582        }
583        return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
584    }
585
586    /**
587     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
588     * <p>
589     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
590     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
591     * RFC 1521 and is suitable for encoding binary data and unformatted text.
592     * </p>
593     *
594     * @param sourceStr
595     *            string to convert to quoted-printable form
596     * @param sourceCharset
597     *            the Charset for sourceStr
598     * @return quoted-printable string
599     * @throws UnsupportedEncodingException
600     *             Thrown if the Charset is not supported
601     */
602    public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
603        if (sourceStr == null) {
604            return null;
605        }
606        return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
607    }
608
609    /**
610     * Gets the default Charset name used for string decoding and encoding.
611     *
612     * @return the default Charset name
613     * @since 1.7
614     */
615    public Charset getCharset() {
616        return this.charset;
617    }
618
619    /**
620     * Gets the default Charset name used for string decoding and encoding.
621     *
622     * @return the default Charset name
623     */
624    public String getDefaultCharset() {
625        return this.charset.name();
626    }
627}