QuotedPrintableCodec.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.net;

  18. import java.io.ByteArrayOutputStream;
  19. import java.io.UnsupportedEncodingException;
  20. import java.nio.charset.Charset;
  21. import java.nio.charset.IllegalCharsetNameException;
  22. import java.nio.charset.UnsupportedCharsetException;
  23. import java.util.BitSet;

  24. import org.apache.commons.codec.BinaryDecoder;
  25. import org.apache.commons.codec.BinaryEncoder;
  26. import org.apache.commons.codec.Charsets;
  27. import org.apache.commons.codec.DecoderException;
  28. import org.apache.commons.codec.EncoderException;
  29. import org.apache.commons.codec.StringDecoder;
  30. import org.apache.commons.codec.StringEncoder;
  31. import org.apache.commons.codec.binary.StringUtils;

  32. /**
  33.  * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
  34.  * <p>
  35.  * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
  36.  * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
  37.  * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
  38.  * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
  39.  * to ensure the integrity of the data should the message pass through a character-translating, and/or line-wrapping
  40.  * gateway.
  41.  * <p>
  42.  * Note:
  43.  * <p>
  44.  * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
  45.  * quoted-printable spec:
  46.  * <ul>
  47.  *   <li>{@code strict=false}: only rules #1 and #2 are implemented
  48.  *   <li>{@code strict=true}: all rules #1 through #5 are implemented
  49.  * </ul>
  50.  * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
  51.  * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
  52.  * Q codec. The strict mode has been added in 1.10.
  53.  * <p>
  54.  * This class is immutable and thread-safe.
  55.  *
  56.  * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
  57.  *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
  58.  *
  59.  * @since 1.3
  60.  * @version $Id: QuotedPrintableCodec.java 1788792 2017-03-26 23:57:00Z sebb $
  61.  */
  62. public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
  63.     /**
  64.      * The default charset used for string decoding and encoding.
  65.      */
  66.     private final Charset charset;

  67.     /**
  68.      * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
  69.      */
  70.     private final boolean strict;

  71.     /**
  72.      * BitSet of printable characters as defined in RFC 1521.
  73.      */
  74.     private static final BitSet PRINTABLE_CHARS = new BitSet(256);

  75.     private static final byte ESCAPE_CHAR = '=';

  76.     private static final byte TAB = 9;

  77.     private static final byte SPACE = 32;

  78.     private static final byte CR = 13;

  79.     private static final byte LF = 10;

  80.     /**
  81.      * Safe line length for quoted printable encoded text.
  82.      */
  83.     private static final int SAFE_LENGTH = 73;

  84.     // Static initializer for printable chars collection
  85.     static {
  86.         // alpha characters
  87.         for (int i = 33; i <= 60; i++) {
  88.             PRINTABLE_CHARS.set(i);
  89.         }
  90.         for (int i = 62; i <= 126; i++) {
  91.             PRINTABLE_CHARS.set(i);
  92.         }
  93.         PRINTABLE_CHARS.set(TAB);
  94.         PRINTABLE_CHARS.set(SPACE);
  95.     }

  96.     /**
  97.      * Default constructor, assumes default charset of {@link Charsets#UTF_8}
  98.      */
  99.     public QuotedPrintableCodec() {
  100.         this(Charsets.UTF_8, false);
  101.     }

  102.     /**
  103.      * Constructor which allows for the selection of the strict mode.
  104.      *
  105.      * @param strict
  106.      *            if {@code true}, soft line breaks will be used
  107.      * @since 1.10
  108.      */
  109.     public QuotedPrintableCodec(final boolean strict) {
  110.         this(Charsets.UTF_8, strict);
  111.     }

  112.     /**
  113.      * Constructor which allows for the selection of a default charset.
  114.      *
  115.      * @param charset
  116.      *            the default string charset to use.
  117.      * @since 1.7
  118.      */
  119.     public QuotedPrintableCodec(final Charset charset) {
  120.         this(charset, false);
  121.     }

  122.     /**
  123.      * Constructor which allows for the selection of a default charset and strict mode.
  124.      *
  125.      * @param charset
  126.      *            the default string charset to use.
  127.      * @param strict
  128.      *            if {@code true}, soft line breaks will be used
  129.      * @since 1.10
  130.      */
  131.     public QuotedPrintableCodec(final Charset charset, final boolean strict) {
  132.         this.charset = charset;
  133.         this.strict = strict;
  134.     }

  135.     /**
  136.      * Constructor which allows for the selection of a default charset.
  137.      *
  138.      * @param charsetName
  139.      *            the default string charset to use.
  140.      * @throws UnsupportedCharsetException
  141.      *             If no support for the named charset is available
  142.      *             in this instance of the Java virtual machine
  143.      * @throws IllegalArgumentException
  144.      *             If the given charsetName is null
  145.      * @throws IllegalCharsetNameException
  146.      *             If the given charset name is illegal
  147.      *
  148.      * @since 1.7 throws UnsupportedCharsetException if the named charset is unavailable
  149.      */
  150.     public QuotedPrintableCodec(final String charsetName)
  151.             throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
  152.         this(Charset.forName(charsetName), false);
  153.     }

  154.     /**
  155.      * Encodes byte into its quoted-printable representation.
  156.      *
  157.      * @param b
  158.      *            byte to encode
  159.      * @param buffer
  160.      *            the buffer to write to
  161.      * @return The number of bytes written to the <code>buffer</code>
  162.      */
  163.     private static final int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
  164.         buffer.write(ESCAPE_CHAR);
  165.         final char hex1 = Utils.hexDigit(b >> 4);
  166.         final char hex2 = Utils.hexDigit(b);
  167.         buffer.write(hex1);
  168.         buffer.write(hex2);
  169.         return 3;
  170.     }

  171.     /**
  172.      * Return the byte at position <code>index</code> of the byte array and
  173.      * make sure it is unsigned.
  174.      *
  175.      * @param index
  176.      *            position in the array
  177.      * @param bytes
  178.      *            the byte array
  179.      * @return the unsigned octet at position <code>index</code> from the array
  180.      */
  181.     private static int getUnsignedOctet(final int index, final byte[] bytes) {
  182.         int b = bytes[index];
  183.         if (b < 0) {
  184.             b = 256 + b;
  185.         }
  186.         return b;
  187.     }

  188.     /**
  189.      * Write a byte to the buffer.
  190.      *
  191.      * @param b
  192.      *            byte to write
  193.      * @param encode
  194.      *            indicates whether the octet shall be encoded
  195.      * @param buffer
  196.      *            the buffer to write to
  197.      * @return the number of bytes that have been written to the buffer
  198.      */
  199.     private static int encodeByte(final int b, final boolean encode,
  200.                                   final ByteArrayOutputStream buffer) {
  201.         if (encode) {
  202.             return encodeQuotedPrintable(b, buffer);
  203.         }
  204.         buffer.write(b);
  205.         return 1;
  206.     }

  207.     /**
  208.      * Checks whether the given byte is whitespace.
  209.      *
  210.      * @param b
  211.      *            byte to be checked
  212.      * @return <code>true</code> if the byte is either a space or tab character
  213.      */
  214.     private static boolean isWhitespace(final int b) {
  215.         return b == SPACE || b == TAB;
  216.     }

  217.     /**
  218.      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
  219.      * <p>
  220.      * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  221.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  222.      *
  223.      * @param printable
  224.      *            bitset of characters deemed quoted-printable
  225.      * @param bytes
  226.      *            array of bytes to be encoded
  227.      * @return array of bytes containing quoted-printable data
  228.      */
  229.     public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
  230.         return encodeQuotedPrintable(printable, bytes, false);
  231.     }

  232.     /**
  233.      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
  234.      * <p>
  235.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  236.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  237.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  238.      *
  239.      * @param printable
  240.      *            bitset of characters deemed quoted-printable
  241.      * @param bytes
  242.      *            array of bytes to be encoded
  243.      * @param strict
  244.      *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
  245.      * @return array of bytes containing quoted-printable data
  246.      * @since 1.10
  247.      */
  248.     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
  249.         if (bytes == null) {
  250.             return null;
  251.         }
  252.         if (printable == null) {
  253.             printable = PRINTABLE_CHARS;
  254.         }
  255.         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();

  256.         if (strict) {
  257.             int pos = 1;
  258.             // encode up to buffer.length - 3, the last three octets will be treated
  259.             // separately for simplification of note #3
  260.             for (int i = 0; i < bytes.length - 3; i++) {
  261.                 final int b = getUnsignedOctet(i, bytes);
  262.                 if (pos < SAFE_LENGTH) {
  263.                     // up to this length it is safe to add any byte, encoded or not
  264.                     pos += encodeByte(b, !printable.get(b), buffer);
  265.                 } else {
  266.                     // rule #3: whitespace at the end of a line *must* be encoded
  267.                     encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);

  268.                     // rule #5: soft line break
  269.                     buffer.write(ESCAPE_CHAR);
  270.                     buffer.write(CR);
  271.                     buffer.write(LF);
  272.                     pos = 1;
  273.                 }
  274.             }

  275.             // rule #3: whitespace at the end of a line *must* be encoded
  276.             // if we would do a soft break line after this octet, encode whitespace
  277.             int b = getUnsignedOctet(bytes.length - 3, bytes);
  278.             boolean encode = !printable.get(b) || (isWhitespace(b) && pos > SAFE_LENGTH - 5);
  279.             pos += encodeByte(b, encode, buffer);

  280.             // note #3: '=' *must not* be the ultimate or penultimate character
  281.             // simplification: if < 6 bytes left, do a soft line break as we may need
  282.             //                 exactly 6 bytes space for the last 2 bytes
  283.             if (pos > SAFE_LENGTH - 2) {
  284.                 buffer.write(ESCAPE_CHAR);
  285.                 buffer.write(CR);
  286.                 buffer.write(LF);
  287.             }
  288.             for (int i = bytes.length - 2; i < bytes.length; i++) {
  289.                 b = getUnsignedOctet(i, bytes);
  290.                 // rule #3: trailing whitespace shall be encoded
  291.                 encode = !printable.get(b) || (i > bytes.length - 2 && isWhitespace(b));
  292.                 encodeByte(b, encode, buffer);
  293.             }
  294.         } else {
  295.             for (final byte c : bytes) {
  296.                 int b = c;
  297.                 if (b < 0) {
  298.                     b = 256 + b;
  299.                 }
  300.                 if (printable.get(b)) {
  301.                     buffer.write(b);
  302.                 } else {
  303.                     encodeQuotedPrintable(b, buffer);
  304.                 }
  305.             }
  306.         }
  307.         return buffer.toByteArray();
  308.     }

  309.     /**
  310.      * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
  311.      * back to their original representation.
  312.      * <p>
  313.      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
  314.      * defined in RFC 1521.
  315.      *
  316.      * @param bytes
  317.      *            array of quoted-printable characters
  318.      * @return array of original bytes
  319.      * @throws DecoderException
  320.      *             Thrown if quoted-printable decoding is unsuccessful
  321.      */
  322.     public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
  323.         if (bytes == null) {
  324.             return null;
  325.         }
  326.         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  327.         for (int i = 0; i < bytes.length; i++) {
  328.             final int b = bytes[i];
  329.             if (b == ESCAPE_CHAR) {
  330.                 try {
  331.                     // if the next octet is a CR we have found a soft line break
  332.                     if (bytes[++i] == CR) {
  333.                         continue;
  334.                     }
  335.                     final int u = Utils.digit16(bytes[i]);
  336.                     final int l = Utils.digit16(bytes[++i]);
  337.                     buffer.write((char) ((u << 4) + l));
  338.                 } catch (final ArrayIndexOutOfBoundsException e) {
  339.                     throw new DecoderException("Invalid quoted-printable encoding", e);
  340.                 }
  341.             } else if (b != CR && b != LF) {
  342.                 // every other octet is appended except for CR & LF
  343.                 buffer.write(b);
  344.             }
  345.         }
  346.         return buffer.toByteArray();
  347.     }

  348.     /**
  349.      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
  350.      * <p>
  351.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  352.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  353.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  354.      *
  355.      * @param bytes
  356.      *            array of bytes to be encoded
  357.      * @return array of bytes containing quoted-printable data
  358.      */
  359.     @Override
  360.     public byte[] encode(final byte[] bytes) {
  361.         return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
  362.     }

  363.     /**
  364.      * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
  365.      * back to their original representation.
  366.      * <p>
  367.      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
  368.      * defined in RFC 1521.
  369.      *
  370.      * @param bytes
  371.      *            array of quoted-printable characters
  372.      * @return array of original bytes
  373.      * @throws DecoderException
  374.      *             Thrown if quoted-printable decoding is unsuccessful
  375.      */
  376.     @Override
  377.     public byte[] decode(final byte[] bytes) throws DecoderException {
  378.         return decodeQuotedPrintable(bytes);
  379.     }

  380.     /**
  381.      * Encodes a string into its quoted-printable form using the default string charset. Unsafe characters are escaped.
  382.      * <p>
  383.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  384.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  385.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  386.      *
  387.      * @param str
  388.      *            string to convert to quoted-printable form
  389.      * @return quoted-printable string
  390.      * @throws EncoderException
  391.      *             Thrown if quoted-printable encoding is unsuccessful
  392.      *
  393.      * @see #getCharset()
  394.      */
  395.     @Override
  396.     public String encode(final String str) throws EncoderException {
  397.         return this.encode(str, getCharset());
  398.     }

  399.     /**
  400.      * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
  401.      * are converted back to their original representation.
  402.      *
  403.      * @param str
  404.      *            quoted-printable string to convert into its original form
  405.      * @param charset
  406.      *            the original string charset
  407.      * @return original string
  408.      * @throws DecoderException
  409.      *             Thrown if quoted-printable decoding is unsuccessful
  410.      * @since 1.7
  411.      */
  412.     public String decode(final String str, final Charset charset) throws DecoderException {
  413.         if (str == null) {
  414.             return null;
  415.         }
  416.         return new String(this.decode(StringUtils.getBytesUsAscii(str)), charset);
  417.     }

  418.     /**
  419.      * Decodes a quoted-printable string into its original form using the specified string charset. Escaped characters
  420.      * are converted back to their original representation.
  421.      *
  422.      * @param str
  423.      *            quoted-printable string to convert into its original form
  424.      * @param charset
  425.      *            the original string charset
  426.      * @return original string
  427.      * @throws DecoderException
  428.      *             Thrown if quoted-printable decoding is unsuccessful
  429.      * @throws UnsupportedEncodingException
  430.      *             Thrown if charset is not supported
  431.      */
  432.     public String decode(final String str, final String charset) throws DecoderException, UnsupportedEncodingException {
  433.         if (str == null) {
  434.             return null;
  435.         }
  436.         return new String(decode(StringUtils.getBytesUsAscii(str)), charset);
  437.     }

  438.     /**
  439.      * Decodes a quoted-printable string into its original form using the default string charset. Escaped characters are
  440.      * converted back to their original representation.
  441.      *
  442.      * @param str
  443.      *            quoted-printable string to convert into its original form
  444.      * @return original string
  445.      * @throws DecoderException
  446.      *             Thrown if quoted-printable decoding is unsuccessful. Thrown if charset is not supported.
  447.      * @see #getCharset()
  448.      */
  449.     @Override
  450.     public String decode(final String str) throws DecoderException {
  451.         return this.decode(str, this.getCharset());
  452.     }

  453.     /**
  454.      * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
  455.      *
  456.      * @param obj
  457.      *            string to convert to a quoted-printable form
  458.      * @return quoted-printable object
  459.      * @throws EncoderException
  460.      *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
  461.      *             unsuccessful
  462.      */
  463.     @Override
  464.     public Object encode(final Object obj) throws EncoderException {
  465.         if (obj == null) {
  466.             return null;
  467.         } else if (obj instanceof byte[]) {
  468.             return encode((byte[]) obj);
  469.         } else if (obj instanceof String) {
  470.             return encode((String) obj);
  471.         } else {
  472.             throw new EncoderException("Objects of type " +
  473.                   obj.getClass().getName() +
  474.                   " cannot be quoted-printable encoded");
  475.         }
  476.     }

  477.     /**
  478.      * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
  479.      * representation.
  480.      *
  481.      * @param obj
  482.      *            quoted-printable object to convert into its original form
  483.      * @return original object
  484.      * @throws DecoderException
  485.      *             Thrown if the argument is not a <code>String</code> or <code>byte[]</code>. Thrown if a failure
  486.      *             condition is encountered during the decode process.
  487.      */
  488.     @Override
  489.     public Object decode(final Object obj) throws DecoderException {
  490.         if (obj == null) {
  491.             return null;
  492.         } else if (obj instanceof byte[]) {
  493.             return decode((byte[]) obj);
  494.         } else if (obj instanceof String) {
  495.             return decode((String) obj);
  496.         } else {
  497.             throw new DecoderException("Objects of type " +
  498.                   obj.getClass().getName() +
  499.                   " cannot be quoted-printable decoded");
  500.         }
  501.     }

  502.     /**
  503.      * Gets the default charset name used for string decoding and encoding.
  504.      *
  505.      * @return the default charset name
  506.      * @since 1.7
  507.      */
  508.     public Charset getCharset() {
  509.         return this.charset;
  510.     }

  511.     /**
  512.      * Gets the default charset name used for string decoding and encoding.
  513.      *
  514.      * @return the default charset name
  515.      */
  516.     public String getDefaultCharset() {
  517.         return this.charset.name();
  518.     }

  519.     /**
  520.      * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
  521.      * <p>
  522.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  523.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  524.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  525.      *
  526.      * @param str
  527.      *            string to convert to quoted-printable form
  528.      * @param charset
  529.      *            the charset for str
  530.      * @return quoted-printable string
  531.      * @since 1.7
  532.      */
  533.     public String encode(final String str, final Charset charset) {
  534.         if (str == null) {
  535.             return null;
  536.         }
  537.         return StringUtils.newStringUsAscii(this.encode(str.getBytes(charset)));
  538.     }

  539.     /**
  540.      * Encodes a string into its quoted-printable form using the specified charset. Unsafe characters are escaped.
  541.      * <p>
  542.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  543.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  544.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  545.      *
  546.      * @param str
  547.      *            string to convert to quoted-printable form
  548.      * @param charset
  549.      *            the charset for str
  550.      * @return quoted-printable string
  551.      * @throws UnsupportedEncodingException
  552.      *             Thrown if the charset is not supported
  553.      */
  554.     public String encode(final String str, final String charset) throws UnsupportedEncodingException {
  555.         if (str == null) {
  556.             return null;
  557.         }
  558.         return StringUtils.newStringUsAscii(encode(str.getBytes(charset)));
  559.     }
  560. }