001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.net;
019
020import java.io.ByteArrayOutputStream;
021import java.io.UnsupportedEncodingException;
022import java.nio.charset.Charset;
023import java.nio.charset.IllegalCharsetNameException;
024import java.nio.charset.UnsupportedCharsetException;
025import java.util.BitSet;
026
027import org.apache.commons.codec.BinaryDecoder;
028import org.apache.commons.codec.BinaryEncoder;
029import org.apache.commons.codec.Charsets;
030import org.apache.commons.codec.DecoderException;
031import org.apache.commons.codec.EncoderException;
032import org.apache.commons.codec.StringDecoder;
033import org.apache.commons.codec.StringEncoder;
034import org.apache.commons.codec.binary.StringUtils;
035
036/**
037 * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
038 * <p>
039 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
040 * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
041 * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
042 * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
043 * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
044 * gateway.
045 * <p>
046 * Note:
047 * <p>
048 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
049 * quoted-printable spec:
050 * <ul>
051 *   <li>{@code strict=false}: only rules #1 and #2 are implemented
052 *   <li>{@code strict=true}: all rules #1 through #5 are implemented
053 * </ul>
054 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
055 * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
056 * Q codec. The strict mode has been added in 1.10.
057 * <p>
058 * This class is immutable and thread-safe.
059 *
060 * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
061 *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
062 *
063 * @since 1.3
064 */
065public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
066    /**
067     * The default Charset used for string decoding and encoding.
068     */
069    private final Charset charset;
070
071    /**
072     * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
073     */
074    private final boolean strict;
075
076    /**
077     * BitSet of printable characters as defined in RFC 1521.
078     */
079    private static final BitSet PRINTABLE_CHARS = new BitSet(256);
080
081    private static final byte ESCAPE_CHAR = '=';
082
083    private static final byte TAB = 9;
084
085    private static final byte SPACE = 32;
086
087    private static final byte CR = 13;
088
089    private static final byte LF = 10;
090
091    /**
092     * Safe line length for quoted printable encoded text.
093     */
094    private static final int SAFE_LENGTH = 73;
095
096    // Static initializer for printable chars collection
097    static {
098        // alpha characters
099        for (int i = 33; i <= 60; i++) {
100            PRINTABLE_CHARS.set(i);
101        }
102        for (int i = 62; i <= 126; i++) {
103            PRINTABLE_CHARS.set(i);
104        }
105        PRINTABLE_CHARS.set(TAB);
106        PRINTABLE_CHARS.set(SPACE);
107    }
108
109    /**
110     * Default constructor, assumes default Charset of {@link Charsets#UTF_8}
111     */
112    public QuotedPrintableCodec() {
113        this(Charsets.UTF_8, false);
114    }
115
116    /**
117     * Constructor which allows for the selection of the strict mode.
118     *
119     * @param strict
120     *            if {@code true}, soft line breaks will be used
121     * @since 1.10
122     */
123    public QuotedPrintableCodec(final boolean strict) {
124        this(Charsets.UTF_8, strict);
125    }
126
127    /**
128     * Constructor which allows for the selection of a default Charset.
129     *
130     * @param charset
131     *            the default string Charset to use.
132     * @since 1.7
133     */
134    public QuotedPrintableCodec(final Charset charset) {
135        this(charset, false);
136    }
137
138    /**
139     * Constructor which allows for the selection of a default Charset and strict mode.
140     *
141     * @param charset
142     *            the default string Charset to use.
143     * @param strict
144     *            if {@code true}, soft line breaks will be used
145     * @since 1.10
146     */
147    public QuotedPrintableCodec(final Charset charset, final boolean strict) {
148        this.charset = charset;
149        this.strict = strict;
150    }
151
152    /**
153     * Constructor which allows for the selection of a default Charset.
154     *
155     * @param charsetName
156     *            the default string Charset to use.
157     * @throws UnsupportedCharsetException
158     *             If no support for the named Charset is available
159     *             in this instance of the Java virtual machine
160     * @throws IllegalArgumentException
161     *             If the given charsetName is null
162     * @throws IllegalCharsetNameException
163     *             If the given Charset name is illegal
164     *
165     * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
166     */
167    public QuotedPrintableCodec(final String charsetName)
168            throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
169        this(Charset.forName(charsetName), false);
170    }
171
172    /**
173     * Encodes byte into its quoted-printable representation.
174     *
175     * @param b
176     *            byte to encode
177     * @param buffer
178     *            the buffer to write to
179     * @return The number of bytes written to the <code>buffer</code>
180     */
181    private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
182        buffer.write(ESCAPE_CHAR);
183        final char hex1 = Utils.hexDigit(b >> 4);
184        final char hex2 = Utils.hexDigit(b);
185        buffer.write(hex1);
186        buffer.write(hex2);
187        return 3;
188    }
189
190    /**
191     * Return the byte at position <code>index</code> of the byte array and
192     * make sure it is unsigned.
193     *
194     * @param index
195     *            position in the array
196     * @param bytes
197     *            the byte array
198     * @return the unsigned octet at position <code>index</code> from the array
199     */
200    private static int getUnsignedOctet(final int index, final byte[] bytes) {
201        int b = bytes[index];
202        if (b < 0) {
203            b = 256 + b;
204        }
205        return b;
206    }
207
208    /**
209     * Write a byte to the buffer.
210     *
211     * @param b
212     *            byte to write
213     * @param encode
214     *            indicates whether the octet shall be encoded
215     * @param buffer
216     *            the buffer to write to
217     * @return the number of bytes that have been written to the buffer
218     */
219    private static int encodeByte(final int b, final boolean encode,
220                                  final ByteArrayOutputStream buffer) {
221        if (encode) {
222            return encodeQuotedPrintable(b, buffer);
223        }
224        buffer.write(b);
225        return 1;
226    }
227
228    /**
229     * Checks whether the given byte is whitespace.
230     *
231     * @param b
232     *            byte to be checked
233     * @return <code>true</code> if the byte is either a space or tab character
234     */
235    private static boolean isWhitespace(final int b) {
236        return b == SPACE || b == TAB;
237    }
238
239    /**
240     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
241     * <p>
242     * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
243     * RFC 1521 and is suitable for encoding binary data and unformatted text.
244     *
245     * @param printable
246     *            bitset of characters deemed quoted-printable
247     * @param bytes
248     *            array of bytes to be encoded
249     * @return array of bytes containing quoted-printable data
250     */
251    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
252        return encodeQuotedPrintable(printable, bytes, false);
253    }
254
255    /**
256     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
257     * <p>
258     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
259     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
260     * RFC 1521 and is suitable for encoding binary data and unformatted text.
261     *
262     * @param printable
263     *            bitset of characters deemed quoted-printable
264     * @param bytes
265     *            array of bytes to be encoded
266     * @param strict
267     *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
268     * @return array of bytes containing quoted-printable data
269     * @since 1.10
270     */
271    public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
272        if (bytes == null) {
273            return null;
274        }
275        if (printable == null) {
276            printable = PRINTABLE_CHARS;
277        }
278        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
279
280        if (strict) {
281            int pos = 1;
282            // encode up to buffer.length - 3, the last three octets will be treated
283            // separately for simplification of note #3
284            for (int i = 0; i < bytes.length - 3; i++) {
285                final int b = getUnsignedOctet(i, bytes);
286                if (pos < SAFE_LENGTH) {
287                    // up to this length it is safe to add any byte, encoded or not
288                    pos += encodeByte(b, !printable.get(b), buffer);
289                } else {
290                    // rule #3: whitespace at the end of a line *must* be encoded
291                    encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
292
293                    // rule #5: soft line break
294                    buffer.write(ESCAPE_CHAR);
295                    buffer.write(CR);
296                    buffer.write(LF);
297                    pos = 1;
298                }
299            }
300
301            // rule #3: whitespace at the end of a line *must* be encoded
302            // if we would do a soft break line after this octet, encode whitespace
303            int b = getUnsignedOctet(bytes.length - 3, bytes);
304            boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
305            pos += encodeByte(b, encode, buffer);
306
307            // note #3: '=' *must not* be the ultimate or penultimate character
308            // simplification: if < 6 bytes left, do a soft line break as we may need
309            //                 exactly 6 bytes space for the last 2 bytes
310            if (pos > SAFE_LENGTH - 2) {
311                buffer.write(ESCAPE_CHAR);
312                buffer.write(CR);
313                buffer.write(LF);
314            }
315            for (int i = bytes.length - 2; i < bytes.length; i++) {
316                b = getUnsignedOctet(i, bytes);
317                // rule #3: trailing whitespace shall be encoded
318                encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
319                encodeByte(b, encode, buffer);
320            }
321        } else {
322            for (final byte c : bytes) {
323                int b = c;
324                if (b < 0) {
325                    b = 256 + b;
326                }
327                if (printable.get(b)) {
328                    buffer.write(b);
329                } else {
330                    encodeQuotedPrintable(b, buffer);
331                }
332            }
333        }
334        return buffer.toByteArray();
335    }
336
337    /**
338     * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
339     * back to their original representation.
340     * <p>
341     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
342     * defined in RFC 1521.
343     *
344     * @param bytes
345     *            array of quoted-printable characters
346     * @return array of original bytes
347     * @throws DecoderException
348     *             Thrown if quoted-printable decoding is unsuccessful
349     */
350    public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
351        if (bytes == null) {
352            return null;
353        }
354        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
355        for (int i = 0; i < bytes.length; i++) {
356            final int b = bytes[i];
357            if (b == ESCAPE_CHAR) {
358                try {
359                    // if the next octet is a CR we have found a soft line break
360                    if (bytes[++i] == CR) {
361                        continue;
362                    }
363                    final int u = Utils.digit16(bytes[i]);
364                    final int l = Utils.digit16(bytes[++i]);
365                    buffer.write((char) ((u << 4) + l));
366                } catch (final ArrayIndexOutOfBoundsException e) {
367                    throw new DecoderException("Invalid quoted-printable encoding", e);
368                }
369            } else if (b != CR && b != LF) {
370                // every other octet is appended except for CR & LF
371                buffer.write(b);
372            }
373        }
374        return buffer.toByteArray();
375    }
376
377    /**
378     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
379     * <p>
380     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
381     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
382     * RFC 1521 and is suitable for encoding binary data and unformatted text.
383     *
384     * @param bytes
385     *            array of bytes to be encoded
386     * @return array of bytes containing quoted-printable data
387     */
388    @Override
389    public byte[] encode(final byte[] bytes) {
390        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
391    }
392
393    /**
394     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
395     * back to their original representation.
396     * <p>
397     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
398     * defined in RFC 1521.
399     *
400     * @param bytes
401     *            array of quoted-printable characters
402     * @return array of original bytes
403     * @throws DecoderException
404     *             Thrown if quoted-printable decoding is unsuccessful
405     */
406    @Override
407    public byte[] decode(final byte[] bytes) throws DecoderException {
408        return decodeQuotedPrintable(bytes);
409    }
410
411    /**
412     * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
413     * <p>
414     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
415     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
416     * RFC 1521 and is suitable for encoding binary data and unformatted text.
417     *
418     * @param sourceStr
419     *            string to convert to quoted-printable form
420     * @return quoted-printable string
421     * @throws EncoderException
422     *             Thrown if quoted-printable encoding is unsuccessful
423     *
424     * @see #getCharset()
425     */
426    @Override
427    public String encode(final String sourceStr) throws EncoderException {
428        return this.encode(sourceStr, getCharset());
429    }
430
431    /**
432     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
433     * are converted back to their original representation.
434     *
435     * @param sourceStr
436     *            quoted-printable string to convert into its original form
437     * @param sourceCharset
438     *            the original string Charset
439     * @return original string
440     * @throws DecoderException
441     *             Thrown if quoted-printable decoding is unsuccessful
442     * @since 1.7
443     */
444    public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
445        if (sourceStr == null) {
446            return null;
447        }
448        return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
449    }
450
451    /**
452     * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
453     * are converted back to their original representation.
454     *
455     * @param sourceStr
456     *            quoted-printable string to convert into its original form
457     * @param sourceCharset
458     *            the original string Charset
459     * @return original string
460     * @throws DecoderException
461     *             Thrown if quoted-printable decoding is unsuccessful
462     * @throws UnsupportedEncodingException
463     *             Thrown if Charset is not supported
464     */
465    public String decode(final String sourceStr, final String sourceCharset)
466            throws DecoderException, UnsupportedEncodingException {
467        if (sourceStr == null) {
468            return null;
469        }
470        return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
471    }
472
473    /**
474     * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
475     * converted back to their original representation.
476     *
477     * @param sourceStr
478     *            quoted-printable string to convert into its original form
479     * @return original string
480     * @throws DecoderException
481     *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
482     * @see #getCharset()
483     */
484    @Override
485    public String decode(final String sourceStr) throws DecoderException {
486        return this.decode(sourceStr, this.getCharset());
487    }
488
489    /**
490     * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
491     *
492     * @param obj
493     *            string to convert to a quoted-printable form
494     * @return quoted-printable object
495     * @throws EncoderException
496     *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
497     *             unsuccessful
498     */
499    @Override
500    public Object encode(final Object obj) throws EncoderException {
501        if (obj == null) {
502            return null;
503        } else if (obj instanceof byte[]) {
504            return encode((byte[]) obj);
505        } else if (obj instanceof String) {
506            return encode((String) obj);
507        } else {
508            throw new EncoderException("Objects of type " +
509                  obj.getClass().getName() +
510                  " cannot be quoted-printable encoded");
511        }
512    }
513
514    /**
515     * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
516     * representation.
517     *
518     * @param obj
519     *            quoted-printable object to convert into its original form
520     * @return original object
521     * @throws DecoderException
522     *             Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure
523     *             condition is encountered during the decode process.
524     */
525    @Override
526    public Object decode(final Object obj) throws DecoderException {
527        if (obj == null) {
528            return null;
529        } else if (obj instanceof byte[]) {
530            return decode((byte[]) obj);
531        } else if (obj instanceof String) {
532            return decode((String) obj);
533        } else {
534            throw new DecoderException("Objects of type " +
535                  obj.getClass().getName() +
536                  " cannot be quoted-printable decoded");
537        }
538    }
539
540    /**
541     * Gets the default Charset name used for string decoding and encoding.
542     *
543     * @return the default Charset name
544     * @since 1.7
545     */
546    public Charset getCharset() {
547        return this.charset;
548    }
549
550    /**
551     * Gets the default Charset name used for string decoding and encoding.
552     *
553     * @return the default Charset name
554     */
555    public String getDefaultCharset() {
556        return this.charset.name();
557    }
558
559    /**
560     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
561     * <p>
562     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
563     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
564     * RFC 1521 and is suitable for encoding binary data and unformatted text.
565     *
566     * @param sourceStr
567     *            string to convert to quoted-printable form
568     * @param sourceCharset
569     *            the Charset for sourceStr
570     * @return quoted-printable string
571     * @since 1.7
572     */
573    public String encode(final String sourceStr, final Charset sourceCharset) {
574        if (sourceStr == null) {
575            return null;
576        }
577        return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
578    }
579
580    /**
581     * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
582     * <p>
583     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
584     * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
585     * RFC 1521 and is suitable for encoding binary data and unformatted text.
586     *
587     * @param sourceStr
588     *            string to convert to quoted-printable form
589     * @param sourceCharset
590     *            the Charset for sourceStr
591     * @return quoted-printable string
592     * @throws UnsupportedEncodingException
593     *             Thrown if the Charset is not supported
594     */
595    public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
596        if (sourceStr == null) {
597            return null;
598        }
599        return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
600    }
601}