View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.fileupload2.core;
18  
19  import java.io.ByteArrayOutputStream;
20  import java.io.IOException;
21  import java.io.UnsupportedEncodingException;
22  import java.nio.charset.StandardCharsets;
23  import java.text.ParseException;
24  import java.util.Base64;
25  import java.util.HashMap;
26  import java.util.Locale;
27  import java.util.Map;
28  
29  /**
30   * Utility class to decode MIME texts.
31   */
32  final class MimeUtils {
33  
34      /**
35       * The marker to indicate text is encoded with BASE64 algorithm.
36       */
37      private static final String BASE64_ENCODING_MARKER = "B";
38  
39      /**
40       * The marker to indicate text is encoded with QuotedPrintable algorithm.
41       */
42      private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
43  
44      /**
45       * If the text contains any encoded tokens, those tokens will be marked with "=?".
46       */
47      private static final String ENCODED_TOKEN_MARKER = "=?";
48  
49      /**
50       * If the text contains any encoded tokens, those tokens will terminate with "=?".
51       */
52      private static final String ENCODED_TOKEN_FINISHER = "?=";
53  
54      /**
55       * The linear whitespace chars sequence.
56       */
57      private static final String LINEAR_WHITESPACE = " \t\r\n";
58  
59      /**
60       * Mappings between MIME and Java charset.
61       */
62      private static final Map<String, String> MIME2JAVA = new HashMap<>();
63  
64      static {
65          MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
66          MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
67          MIME2JAVA.put("utf-8", "UTF8");
68          MIME2JAVA.put("utf8", "UTF8");
69          MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
70          MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
71          MIME2JAVA.put("euc-kr", "KSC5601");
72          MIME2JAVA.put("euckr", "KSC5601");
73          MIME2JAVA.put("us-ascii", StandardCharsets.ISO_8859_1.name());
74          MIME2JAVA.put("x-us-ascii", StandardCharsets.ISO_8859_1.name());
75      }
76  
77      /**
78       * Decodes a string of text obtained from a mail header into its proper form. The text generally will consist of a string of tokens, some of which may be
79       * encoded using base64 encoding.
80       *
81       * @param text The text to decode.
82       *
83       * @return The decoded text string.
84       * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
85       */
86      static String decodeText(final String text) throws UnsupportedEncodingException {
87          // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
88          // source string doesn't contain that sequent, no decoding is required.
89          if (!text.contains(ENCODED_TOKEN_MARKER)) {
90              return text;
91          }
92  
93          var offset = 0;
94          final var endOffset = text.length();
95  
96          var startWhiteSpace = -1;
97          var endWhiteSpace = -1;
98  
99          final var decodedText = new StringBuilder(text.length());
100 
101         var previousTokenEncoded = false;
102 
103         while (offset < endOffset) {
104             var ch = text.charAt(offset);
105 
106             // is this a whitespace character?
107             if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
108                 startWhiteSpace = offset;
109                 while (offset < endOffset) {
110                     // step over the white space characters.
111                     ch = text.charAt(offset);
112                     if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
113                         // record the location of the first non lwsp and drop down to process the
114                         // token characters.
115                         endWhiteSpace = offset;
116                         break;
117                     }
118                     offset++;
119                 }
120             } else {
121                 // we have a word token. We need to scan over the word and then try to parse it.
122                 final var wordStart = offset;
123 
124                 while (offset < endOffset) {
125                     // step over the non white space characters.
126                     ch = text.charAt(offset);
127                     if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
128                         break;
129                     }
130                     offset++;
131 
132                     // NB: Trailing whitespace on these header strings will just be discarded.
133                 }
134                 // pull out the word token.
135                 final var word = text.substring(wordStart, offset);
136                 // is the token encoded? decode the word
137                 if (word.startsWith(ENCODED_TOKEN_MARKER)) {
138                     try {
139                         // if this gives a parsing failure, treat it like a non-encoded word.
140                         final var decodedWord = decodeWord(word);
141 
142                         // are any whitespace characters significant? Append 'em if we've got 'em.
143                         if (!previousTokenEncoded && startWhiteSpace != -1) {
144                             decodedText.append(text, startWhiteSpace, endWhiteSpace);
145                             startWhiteSpace = -1;
146                         }
147                         // this is definitely a decoded token.
148                         previousTokenEncoded = true;
149                         // and add this to the text.
150                         decodedText.append(decodedWord);
151                         // we continue parsing from here...we allow parsing errors to fall through
152                         // and get handled as normal text.
153                         continue;
154 
155                     } catch (final ParseException ignored) {
156                         // just ignore it, skip to next word
157                     }
158                 }
159                 // this is a normal token, so it doesn't matter what the previous token was. Add the white space
160                 // if we have it.
161                 if (startWhiteSpace != -1) {
162                     decodedText.append(text, startWhiteSpace, endWhiteSpace);
163                     startWhiteSpace = -1;
164                 }
165                 // this is not a decoded token.
166                 previousTokenEncoded = false;
167                 decodedText.append(word);
168             }
169         }
170 
171         return decodedText.toString();
172     }
173 
174     /**
175      * Decodes a string using the RFC 2047 rules for an "encoded-word" type. This encoding has the syntax:
176      *
177      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
178      *
179      * @param word The possibly encoded word value.
180      *
181      * @return The decoded word.
182      * @throws ParseException               in case of a parse error of the RFC 2047.
183      * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found.
184      */
185     private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
186         // encoded words start with the characters "=?". If this not an encoded word, we throw a
187         // ParseException for the caller.
188 
189         final var etmPos = word.indexOf(ENCODED_TOKEN_MARKER);
190         if (etmPos != 0) {
191             throw new ParseException("Invalid RFC 2047 encoded-word: " + word, etmPos);
192         }
193 
194         final var charsetPos = word.indexOf('?', 2);
195         if (charsetPos == -1) {
196             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word, charsetPos);
197         }
198 
199         // pull out the character set information (this is the MIME name at this point).
200         final var charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH);
201 
202         // now pull out the encoding token the same way.
203         final var encodingPos = word.indexOf('?', charsetPos + 1);
204         if (encodingPos == -1) {
205             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word, encodingPos);
206         }
207 
208         final var encoding = word.substring(charsetPos + 1, encodingPos);
209 
210         // and finally the encoded text.
211         final var encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
212         if (encodedTextPos == -1) {
213             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word, encodedTextPos);
214         }
215 
216         final var encodedText = word.substring(encodingPos + 1, encodedTextPos);
217 
218         // seems a bit silly to encode a null string, but easy to deal with.
219         if (encodedText.isEmpty()) {
220             return "";
221         }
222 
223         try {
224             // the decoder writes directly to an output stream.
225             final var out = new ByteArrayOutputStream(encodedText.length());
226 
227             final var encodedData = encodedText.getBytes(StandardCharsets.US_ASCII);
228 
229             // Base64 encoded?
230             if (encoding.equals(BASE64_ENCODING_MARKER)) {
231                 out.write(Base64.getMimeDecoder().decode(encodedData));
232             } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
233                 QuotedPrintableDecoder.decode(encodedData, out);
234             } else {
235                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
236             }
237             // get the decoded byte data and convert into a string.
238             final var decodedData = out.toByteArray();
239             return new String(decodedData, javaCharset(charset));
240         } catch (final IOException e) {
241             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
242         }
243     }
244 
245     /**
246      * Translate a MIME standard character set name into the Java equivalent.
247      *
248      * @param charset The MIME standard name.
249      *
250      * @return The Java equivalent for this name.
251      */
252     private static String javaCharset(final String charset) {
253         // nothing in, nothing out.
254         if (charset == null) {
255             return null;
256         }
257         final var mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
258         // if there is no mapping, then the original name is used. Many of the MIME character set
259         // names map directly back into Java. The reverse isn't necessarily true.
260         return mappedCharset == null ? charset : mappedCharset;
261     }
262 
263     /**
264      * Hidden constructor, this class must not be instantiated.
265      */
266     private MimeUtils() {
267         // do nothing
268     }
269 
270 }