QuotedPrintableCodec.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.net;

  18. import java.io.ByteArrayOutputStream;
  19. import java.io.UnsupportedEncodingException;
  20. import java.nio.charset.Charset;
  21. import java.nio.charset.IllegalCharsetNameException;
  22. import java.nio.charset.StandardCharsets;
  23. import java.nio.charset.UnsupportedCharsetException;
  24. import java.util.BitSet;

  25. import org.apache.commons.codec.BinaryDecoder;
  26. import org.apache.commons.codec.BinaryEncoder;
  27. import org.apache.commons.codec.DecoderException;
  28. import org.apache.commons.codec.EncoderException;
  29. import org.apache.commons.codec.StringDecoder;
  30. import org.apache.commons.codec.StringEncoder;
  31. import org.apache.commons.codec.binary.StringUtils;

  32. /**
  33.  * Codec for the Quoted-Printable section of <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
  34.  * <p>
  35.  * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to
  36.  * printable characters in the ASCII character set. It encodes the data in such a way that the resulting octets are
  37.  * unlikely to be modified by mail transport. If the data being encoded are mostly ASCII text, the encoded form of the
  38.  * data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable
  39.  * to ensure the integrity of the data should the message pass through a character- translating, and/or line-wrapping
  40.  * gateway.
  41.  * </p>
  42.  * <p>
  43.  * Note:
  44.  * </p>
  45.  * <p>
  46.  * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the
  47.  * quoted-printable spec:
  48.  * </p>
  49.  * <ul>
  50.  *   <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
  51.  *   <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
  52.  * </ul>
  53.  * <p>
  54.  * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used
  55.  * for certain applications that do not require quoted-printable line formatting (rules #3, #4, #5), for instance
  56.  * Q codec. The strict mode has been added in 1.10.
  57.  * </p>
  58.  * <p>
  59.  * This class is immutable and thread-safe.
  60.  * </p>
  61.  *
  62.  * @see <a href="http://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One:
  63.  *          Mechanisms for Specifying and Describing the Format of Internet Message Bodies </a>
  64.  *
  65.  * @since 1.3
  66.  */
  67. public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
  68.     /**
  69.      * BitSet of printable characters as defined in RFC 1521.
  70.      */
  71.     private static final BitSet PRINTABLE_CHARS = new BitSet(256);

  72.     private static final byte ESCAPE_CHAR = '=';

  73.     private static final byte TAB = 9;

  74.     private static final byte SPACE = 32;

  75.     private static final byte CR = 13;

  76.     private static final byte LF = 10;

  77.     /**
  78.      * Minimum length required for the byte arrays used by encodeQuotedPrintable method
  79.      */
  80.     private static final int MIN_BYTES = 3;

  81.     /**
  82.      * Safe line length for quoted printable encoded text.
  83.      */
  84.     private static final int SAFE_LENGTH = 73;

  85.     // Static initializer for printable chars collection
  86.     static {
  87.         // alpha characters
  88.         for (int i = 33; i <= 60; i++) {
  89.             PRINTABLE_CHARS.set(i);
  90.         }
  91.         for (int i = 62; i <= 126; i++) {
  92.             PRINTABLE_CHARS.set(i);
  93.         }
  94.         PRINTABLE_CHARS.set(TAB);
  95.         PRINTABLE_CHARS.set(SPACE);
  96.     }

  97.     /**
  98.      * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted
  99.      * back to their original representation.
  100.      * <p>
  101.      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
  102.      * defined in RFC 1521.
  103.      * </p>
  104.      *
  105.      * @param bytes
  106.      *            array of quoted-printable characters
  107.      * @return array of original bytes
  108.      * @throws DecoderException
  109.      *             Thrown if quoted-printable decoding is unsuccessful
  110.      */
  111.     public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
  112.         if (bytes == null) {
  113.             return null;
  114.         }
  115.         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  116.         for (int i = 0; i < bytes.length; i++) {
  117.             final int b = bytes[i];
  118.             if (b == ESCAPE_CHAR) {
  119.                 try {
  120.                     // if the next octet is a CR we have found a soft line break
  121.                     if (bytes[++i] == CR) {
  122.                         continue;
  123.                     }
  124.                     final int u = Utils.digit16(bytes[i]);
  125.                     final int l = Utils.digit16(bytes[++i]);
  126.                     buffer.write((char) ((u << 4) + l));
  127.                 } catch (final ArrayIndexOutOfBoundsException e) {
  128.                     throw new DecoderException("Invalid quoted-printable encoding", e);
  129.                 }
  130.             } else if (b != CR && b != LF) {
  131.                 // every other octet is appended except for CR & LF
  132.                 buffer.write(b);
  133.             }
  134.         }
  135.         return buffer.toByteArray();
  136.     }

  137.     /**
  138.      * Encodes a byte in the buffer.
  139.      *
  140.      * @param b
  141.      *            byte to write
  142.      * @param encode
  143.      *            indicates whether the octet shall be encoded
  144.      * @param buffer
  145.      *            the buffer to write to
  146.      * @return the number of bytes that have been written to the buffer
  147.      */
  148.     private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) {
  149.         if (encode) {
  150.             return encodeQuotedPrintable(b, buffer);
  151.         }
  152.         buffer.write(b);
  153.         return 1;
  154.     }

  155.     /**
  156.      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
  157.      * <p>
  158.      * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  159.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  160.      * </p>
  161.      *
  162.      * @param printable
  163.      *            bitset of characters deemed quoted-printable
  164.      * @param bytes
  165.      *            array of bytes to be encoded
  166.      * @return array of bytes containing quoted-printable data
  167.      */
  168.     public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
  169.         return encodeQuotedPrintable(printable, bytes, false);
  170.     }

  171.     /**
  172.      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
  173.      * <p>
  174.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  175.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  176.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  177.      * </p>
  178.      *
  179.      * @param printable
  180.      *            bitset of characters deemed quoted-printable
  181.      * @param bytes
  182.      *            array of bytes to be encoded
  183.      * @param strict
  184.      *            if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2
  185.      * @return array of bytes containing quoted-printable data
  186.      * @since 1.10
  187.      */
  188.     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
  189.         if (bytes == null) {
  190.             return null;
  191.         }
  192.         if (printable == null) {
  193.             printable = PRINTABLE_CHARS;
  194.         }
  195.         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  196.         final int bytesLength = bytes.length;

  197.         if (strict) {
  198.             if (bytesLength < MIN_BYTES) {
  199.                 return null;
  200.             }

  201.             int pos = 1;
  202.             // encode up to buffer.length - 3, the last three octets will be treated
  203.             // separately for simplification of note #3
  204.             for (int i = 0; i < bytesLength - 3; i++) {
  205.                 final int b = getUnsignedOctet(i, bytes);
  206.                 if (pos < SAFE_LENGTH) {
  207.                     // up to this length it is safe to add any byte, encoded or not
  208.                     pos += encodeByte(b, !printable.get(b), buffer);
  209.                 } else {
  210.                     // rule #3: whitespace at the end of a line *must* be encoded
  211.                     encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);

  212.                     // rule #5: soft line break
  213.                     buffer.write(ESCAPE_CHAR);
  214.                     buffer.write(CR);
  215.                     buffer.write(LF);
  216.                     pos = 1;
  217.                 }
  218.             }

  219.             // rule #3: whitespace at the end of a line *must* be encoded
  220.             // if we would do a soft break line after this octet, encode whitespace
  221.             int b = getUnsignedOctet(bytesLength - 3, bytes);
  222.             boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
  223.             pos += encodeByte(b, encode, buffer);

  224.             // note #3: '=' *must not* be the ultimate or penultimate character
  225.             // simplification: if < 6 bytes left, do a soft line break as we may need
  226.             //                 exactly 6 bytes space for the last 2 bytes
  227.             if (pos > SAFE_LENGTH - 2) {
  228.                 buffer.write(ESCAPE_CHAR);
  229.                 buffer.write(CR);
  230.                 buffer.write(LF);
  231.             }
  232.             for (int i = bytesLength - 2; i < bytesLength; i++) {
  233.                 b = getUnsignedOctet(i, bytes);
  234.                 // rule #3: trailing whitespace shall be encoded
  235.                 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
  236.                 encodeByte(b, encode, buffer);
  237.             }
  238.         } else {
  239.             for (final byte c : bytes) {
  240.                 int b = c;
  241.                 if (b < 0) {
  242.                     b = 256 + b;
  243.                 }
  244.                 if (printable.get(b)) {
  245.                     buffer.write(b);
  246.                 } else {
  247.                     encodeQuotedPrintable(b, buffer);
  248.                 }
  249.             }
  250.         }
  251.         return buffer.toByteArray();
  252.     }

  253.     /**
  254.      * Encodes byte into its quoted-printable representation.
  255.      *
  256.      * @param b
  257.      *            byte to encode
  258.      * @param buffer
  259.      *            the buffer to write to
  260.      * @return The number of bytes written to the {@code buffer}
  261.      */
  262.     private static int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
  263.         buffer.write(ESCAPE_CHAR);
  264.         final char hex1 = Utils.hexDigit(b >> 4);
  265.         final char hex2 = Utils.hexDigit(b);
  266.         buffer.write(hex1);
  267.         buffer.write(hex2);
  268.         return 3;
  269.     }

  270.     /**
  271.      * Gets the byte at position {@code index} of the byte array and
  272.      * make sure it is unsigned.
  273.      *
  274.      * @param index
  275.      *            position in the array
  276.      * @param bytes
  277.      *            the byte array
  278.      * @return the unsigned octet at position {@code index} from the array
  279.      */
  280.     private static int getUnsignedOctet(final int index, final byte[] bytes) {
  281.         int b = bytes[index];
  282.         if (b < 0) {
  283.             b = 256 + b;
  284.         }
  285.         return b;
  286.     }

  287.     /**
  288.      * Checks whether the given byte is whitespace.
  289.      *
  290.      * @param b
  291.      *            byte to be checked
  292.      * @return {@code true} if the byte is either a space or tab character
  293.      */
  294.     private static boolean isWhitespace(final int b) {
  295.         return b == SPACE || b == TAB;
  296.     }

  297.     /**
  298.      * The default Charset used for string decoding and encoding.
  299.      */
  300.     private final Charset charset;

  301.     /**
  302.      * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
  303.      */
  304.     private final boolean strict;

  305.     /**
  306.      * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
  307.      */
  308.     public QuotedPrintableCodec() {
  309.         this(StandardCharsets.UTF_8, false);
  310.     }

  311.     /**
  312.      * Constructor which allows for the selection of the strict mode.
  313.      *
  314.      * @param strict
  315.      *            if {@code true}, soft line breaks will be used
  316.      * @since 1.10
  317.      */
  318.     public QuotedPrintableCodec(final boolean strict) {
  319.         this(StandardCharsets.UTF_8, strict);
  320.     }

  321.     /**
  322.      * Constructor which allows for the selection of a default Charset.
  323.      *
  324.      * @param charset
  325.      *            the default string Charset to use.
  326.      * @since 1.7
  327.      */
  328.     public QuotedPrintableCodec(final Charset charset) {
  329.         this(charset, false);
  330.     }

  331.     /**
  332.      * Constructor which allows for the selection of a default Charset and strict mode.
  333.      *
  334.      * @param charset
  335.      *            the default string Charset to use.
  336.      * @param strict
  337.      *            if {@code true}, soft line breaks will be used
  338.      * @since 1.10
  339.      */
  340.     public QuotedPrintableCodec(final Charset charset, final boolean strict) {
  341.         this.charset = charset;
  342.         this.strict = strict;
  343.     }

  344.     /**
  345.      * Constructor which allows for the selection of a default Charset.
  346.      *
  347.      * @param charsetName
  348.      *            the default string Charset to use.
  349.      * @throws UnsupportedCharsetException
  350.      *             If no support for the named Charset is available
  351.      *             in this instance of the Java virtual machine
  352.      * @throws IllegalArgumentException
  353.      *             If the given charsetName is null
  354.      * @throws IllegalCharsetNameException
  355.      *             If the given Charset name is illegal
  356.      *
  357.      * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
  358.      */
  359.     public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
  360.         this(Charset.forName(charsetName), false);
  361.     }

  362.     /**
  363.      * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted
  364.      * back to their original representation.
  365.      * <p>
  366.      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as
  367.      * defined in RFC 1521.
  368.      * </p>
  369.      *
  370.      * @param bytes
  371.      *            array of quoted-printable characters
  372.      * @return array of original bytes
  373.      * @throws DecoderException
  374.      *             Thrown if quoted-printable decoding is unsuccessful
  375.      */
  376.     @Override
  377.     public byte[] decode(final byte[] bytes) throws DecoderException {
  378.         return decodeQuotedPrintable(bytes);
  379.     }

  380.     /**
  381.      * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original
  382.      * representation.
  383.      *
  384.      * @param obj
  385.      *            quoted-printable object to convert into its original form
  386.      * @return original object
  387.      * @throws DecoderException
  388.      *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
  389.      *             condition is encountered during the decode process.
  390.      */
  391.     @Override
  392.     public Object decode(final Object obj) throws DecoderException {
  393.         if (obj == null) {
  394.             return null;
  395.         }
  396.         if (obj instanceof byte[]) {
  397.             return decode((byte[]) obj);
  398.         }
  399.         if (obj instanceof String) {
  400.             return decode((String) obj);
  401.         }
  402.         throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded");
  403.     }

  404.     /**
  405.      * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are
  406.      * converted back to their original representation.
  407.      *
  408.      * @param sourceStr
  409.      *            quoted-printable string to convert into its original form
  410.      * @return original string
  411.      * @throws DecoderException
  412.      *             Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
  413.      * @see #getCharset()
  414.      */
  415.     @Override
  416.     public String decode(final String sourceStr) throws DecoderException {
  417.         return this.decode(sourceStr, getCharset());
  418.     }

  419.     /**
  420.      * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
  421.      * are converted back to their original representation.
  422.      *
  423.      * @param sourceStr
  424.      *            quoted-printable string to convert into its original form
  425.      * @param sourceCharset
  426.      *            the original string Charset
  427.      * @return original string
  428.      * @throws DecoderException
  429.      *             Thrown if quoted-printable decoding is unsuccessful
  430.      * @since 1.7
  431.      */
  432.     public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
  433.         if (sourceStr == null) {
  434.             return null;
  435.         }
  436.         return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
  437.     }

  438.     /**
  439.      * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters
  440.      * are converted back to their original representation.
  441.      *
  442.      * @param sourceStr
  443.      *            quoted-printable string to convert into its original form
  444.      * @param sourceCharset
  445.      *            the original string Charset
  446.      * @return original string
  447.      * @throws DecoderException
  448.      *             Thrown if quoted-printable decoding is unsuccessful
  449.      * @throws UnsupportedEncodingException
  450.      *             Thrown if Charset is not supported
  451.      */
  452.     public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException {
  453.         if (sourceStr == null) {
  454.             return null;
  455.         }
  456.         return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
  457.     }

  458.     /**
  459.      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
  460.      * <p>
  461.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  462.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  463.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  464.      * </p>
  465.      *
  466.      * @param bytes
  467.      *            array of bytes to be encoded
  468.      * @return array of bytes containing quoted-printable data
  469.      */
  470.     @Override
  471.     public byte[] encode(final byte[] bytes) {
  472.         return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
  473.     }

  474.     /**
  475.      * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
  476.      *
  477.      * @param obj
  478.      *            string to convert to a quoted-printable form
  479.      * @return quoted-printable object
  480.      * @throws EncoderException
  481.      *             Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is
  482.      *             unsuccessful
  483.      */
  484.     @Override
  485.     public Object encode(final Object obj) throws EncoderException {
  486.         if (obj == null) {
  487.             return null;
  488.         }
  489.         if (obj instanceof byte[]) {
  490.             return encode((byte[]) obj);
  491.         }
  492.         if (obj instanceof String) {
  493.             return encode((String) obj);
  494.         }
  495.         throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded");
  496.     }

  497.     /**
  498.      * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
  499.      * <p>
  500.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  501.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  502.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  503.      * </p>
  504.      *
  505.      * @param sourceStr
  506.      *            string to convert to quoted-printable form
  507.      * @return quoted-printable string
  508.      * @throws EncoderException
  509.      *             Thrown if quoted-printable encoding is unsuccessful
  510.      *
  511.      * @see #getCharset()
  512.      */
  513.     @Override
  514.     public String encode(final String sourceStr) throws EncoderException {
  515.         return encode(sourceStr, getCharset());
  516.     }

  517.     /**
  518.      * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
  519.      * <p>
  520.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  521.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  522.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  523.      * </p>
  524.      *
  525.      * @param sourceStr
  526.      *            string to convert to quoted-printable form
  527.      * @param sourceCharset
  528.      *            the Charset for sourceStr
  529.      * @return quoted-printable string
  530.      * @since 1.7
  531.      */
  532.     public String encode(final String sourceStr, final Charset sourceCharset) {
  533.         if (sourceStr == null) {
  534.             return null;
  535.         }
  536.         return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
  537.     }

  538.     /**
  539.      * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
  540.      * <p>
  541.      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset
  542.      * or only a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in
  543.      * RFC 1521 and is suitable for encoding binary data and unformatted text.
  544.      * </p>
  545.      *
  546.      * @param sourceStr
  547.      *            string to convert to quoted-printable form
  548.      * @param sourceCharset
  549.      *            the Charset for sourceStr
  550.      * @return quoted-printable string
  551.      * @throws UnsupportedEncodingException
  552.      *             Thrown if the Charset is not supported
  553.      */
  554.     public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
  555.         if (sourceStr == null) {
  556.             return null;
  557.         }
  558.         return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
  559.     }

  560.     /**
  561.      * Gets the default Charset name used for string decoding and encoding.
  562.      *
  563.      * @return the default Charset name
  564.      * @since 1.7
  565.      */
  566.     public Charset getCharset() {
  567.         return this.charset;
  568.     }

  569.     /**
  570.      * Gets the default Charset name used for string decoding and encoding.
  571.      *
  572.      * @return the default Charset name
  573.      */
  574.     public String getDefaultCharset() {
  575.         return this.charset.name();
  576.     }
  577. }