MimeUtils.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.apache.commons.fileupload2.core;

  18. import java.io.ByteArrayOutputStream;
  19. import java.io.IOException;
  20. import java.io.UnsupportedEncodingException;
  21. import java.nio.charset.StandardCharsets;
  22. import java.text.ParseException;
  23. import java.util.Base64;
  24. import java.util.HashMap;
  25. import java.util.Locale;
  26. import java.util.Map;

  27. /**
  28.  * Utility class to decode MIME texts.
  29.  */
  30. final class MimeUtils {

  31.     /**
  32.      * The marker to indicate text is encoded with BASE64 algorithm.
  33.      */
  34.     private static final String BASE64_ENCODING_MARKER = "B";

  35.     /**
  36.      * The marker to indicate text is encoded with QuotedPrintable algorithm.
  37.      */
  38.     private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";

  39.     /**
  40.      * If the text contains any encoded tokens, those tokens will be marked with "=?".
  41.      */
  42.     private static final String ENCODED_TOKEN_MARKER = "=?";

  43.     /**
  44.      * If the text contains any encoded tokens, those tokens will terminate with "=?".
  45.      */
  46.     private static final String ENCODED_TOKEN_FINISHER = "?=";

  47.     /**
  48.      * The linear whitespace chars sequence.
  49.      */
  50.     private static final String LINEAR_WHITESPACE = " \t\r\n";

  51.     /**
  52.      * Mappings between MIME and Java charset.
  53.      */
  54.     private static final Map<String, String> MIME2JAVA = new HashMap<>();

  55.     static {
  56.         MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
  57.         MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
  58.         MIME2JAVA.put("utf-8", "UTF8");
  59.         MIME2JAVA.put("utf8", "UTF8");
  60.         MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
  61.         MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
  62.         MIME2JAVA.put("euc-kr", "KSC5601");
  63.         MIME2JAVA.put("euckr", "KSC5601");
  64.         MIME2JAVA.put("us-ascii", StandardCharsets.ISO_8859_1.name());
  65.         MIME2JAVA.put("x-us-ascii", StandardCharsets.ISO_8859_1.name());
  66.     }

  67.     /**
  68.      * Decodes a string of text obtained from a mail header into its proper form. The text generally will consist of a string of tokens, some of which may be
  69.      * encoded using base64 encoding.
  70.      *
  71.      * @param text The text to decode.
  72.      * @return The decoded text string.
  73.      * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
  74.      */
  75.     static String decodeText(final String text) throws UnsupportedEncodingException {
  76.         // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
  77.         // source string doesn't contain that sequent, no decoding is required.
  78.         if (!text.contains(ENCODED_TOKEN_MARKER)) {
  79.             return text;
  80.         }

  81.         var offset = 0;
  82.         final var endOffset = text.length();

  83.         var startWhiteSpace = -1;
  84.         var endWhiteSpace = -1;

  85.         final var decodedText = new StringBuilder(text.length());

  86.         var previousTokenEncoded = false;

  87.         while (offset < endOffset) {
  88.             var ch = text.charAt(offset);

  89.             // is this a whitespace character?
  90.             if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
  91.                 startWhiteSpace = offset;
  92.                 while (offset < endOffset) {
  93.                     // step over the white space characters.
  94.                     ch = text.charAt(offset);
  95.                     if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
  96.                         // record the location of the first non lwsp and drop down to process the
  97.                         // token characters.
  98.                         endWhiteSpace = offset;
  99.                         break;
  100.                     }
  101.                     offset++;
  102.                 }
  103.             } else {
  104.                 // we have a word token. We need to scan over the word and then try to parse it.
  105.                 final var wordStart = offset;

  106.                 while (offset < endOffset) {
  107.                     // step over the non white space characters.
  108.                     ch = text.charAt(offset);
  109.                     if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
  110.                         break;
  111.                     }
  112.                     offset++;

  113.                     // NB: Trailing whitespace on these header strings will just be discarded.
  114.                 }
  115.                 // pull out the word token.
  116.                 final var word = text.substring(wordStart, offset);
  117.                 // is the token encoded? decode the word
  118.                 if (word.startsWith(ENCODED_TOKEN_MARKER)) {
  119.                     try {
  120.                         // if this gives a parsing failure, treat it like a non-encoded word.
  121.                         final var decodedWord = decodeWord(word);

  122.                         // are any whitespace characters significant? Append 'em if we've got 'em.
  123.                         if (!previousTokenEncoded && startWhiteSpace != -1) {
  124.                             decodedText.append(text, startWhiteSpace, endWhiteSpace);
  125.                             startWhiteSpace = -1;
  126.                         }
  127.                         // this is definitely a decoded token.
  128.                         previousTokenEncoded = true;
  129.                         // and add this to the text.
  130.                         decodedText.append(decodedWord);
  131.                         // we continue parsing from here...we allow parsing errors to fall through
  132.                         // and get handled as normal text.
  133.                         continue;

  134.                     } catch (final ParseException ignored) {
  135.                         // just ignore it, skip to next word
  136.                     }
  137.                 }
  138.                 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
  139.                 // if we have it.
  140.                 if (startWhiteSpace != -1) {
  141.                     decodedText.append(text, startWhiteSpace, endWhiteSpace);
  142.                     startWhiteSpace = -1;
  143.                 }
  144.                 // this is not a decoded token.
  145.                 previousTokenEncoded = false;
  146.                 decodedText.append(word);
  147.             }
  148.         }

  149.         return decodedText.toString();
  150.     }

  151.     /**
  152.      * Decodes a string using the RFC 2047 rules for an "encoded-word" type. This encoding has the syntax:
  153.      *
  154.      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
  155.      *
  156.      * @param word The possibly encoded word value.
  157.      * @return The decoded word.
  158.      * @throws ParseException               in case of a parse error of the RFC 2047.
  159.      * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found.
  160.      */
  161.     private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
  162.         // encoded words start with the characters "=?". If this not an encoded word, we throw a
  163.         // ParseException for the caller.

  164.         final var etmPos = word.indexOf(ENCODED_TOKEN_MARKER);
  165.         if (etmPos != 0) {
  166.             throw new ParseException("Invalid RFC 2047 encoded-word: " + word, etmPos);
  167.         }

  168.         final var charsetPos = word.indexOf('?', 2);
  169.         if (charsetPos == -1) {
  170.             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word, charsetPos);
  171.         }

  172.         // pull out the character set information (this is the MIME name at this point).
  173.         final var charset = word.substring(2, charsetPos).toLowerCase(Locale.ROOT);

  174.         // now pull out the encoding token the same way.
  175.         final var encodingPos = word.indexOf('?', charsetPos + 1);
  176.         if (encodingPos == -1) {
  177.             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word, encodingPos);
  178.         }

  179.         final var encoding = word.substring(charsetPos + 1, encodingPos);

  180.         // and finally the encoded text.
  181.         final var encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
  182.         if (encodedTextPos == -1) {
  183.             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word, encodedTextPos);
  184.         }

  185.         final var encodedText = word.substring(encodingPos + 1, encodedTextPos);

  186.         // seems a bit silly to encode a null string, but easy to deal with.
  187.         if (encodedText.isEmpty()) {
  188.             return "";
  189.         }

  190.         try {
  191.             // the decoder writes directly to an output stream.
  192.             final var out = new ByteArrayOutputStream(encodedText.length());

  193.             final var encodedData = encodedText.getBytes(StandardCharsets.US_ASCII);

  194.             // Base64 encoded?
  195.             if (encoding.equals(BASE64_ENCODING_MARKER)) {
  196.                 out.write(Base64.getMimeDecoder().decode(encodedData));
  197.             } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
  198.                 QuotedPrintableDecoder.decode(encodedData, out);
  199.             } else {
  200.                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
  201.             }
  202.             // get the decoded byte data and convert into a string.
  203.             final var decodedData = out.toByteArray();
  204.             return new String(decodedData, javaCharset(charset));
  205.         } catch (final IOException e) {
  206.             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
  207.         }
  208.     }

  209.     /**
  210.      * Translate a MIME standard character set name into the Java equivalent.
  211.      *
  212.      * @param charset The MIME standard name.
  213.      * @return The Java equivalent for this name.
  214.      */
  215.     private static String javaCharset(final String charset) {
  216.         // nothing in, nothing out.
  217.         if (charset == null) {
  218.             return null;
  219.         }
  220.         final var mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ROOT));
  221.         // if there is no mapping, then the original name is used. Many of the MIME character set
  222.         // names map directly back into Java. The reverse isn't necessarily true.
  223.         return mappedCharset == null ? charset : mappedCharset;
  224.     }

  225.     /**
  226.      * Hidden constructor, this class must not be instantiated.
  227.      */
  228.     private MimeUtils() {
  229.         // do nothing
  230.     }

  231. }