1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.io.input;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.charset.Charset;
31 import java.nio.charset.StandardCharsets;
32 import java.nio.file.Files;
33 import java.nio.file.Path;
34 import java.text.MessageFormat;
35 import java.util.Locale;
36 import java.util.Objects;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.io.ByteOrderMark;
41 import org.apache.commons.io.Charsets;
42 import org.apache.commons.io.IOUtils;
43 import org.apache.commons.io.build.AbstractStreamBuilder;
44 import org.apache.commons.io.function.IOConsumer;
45
46 /**
47 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
48 * <p>
49 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
50 * </p>
51 * <p>
 * All this has to be done without consuming characters from the stream, otherwise the XML parser will not recognize the document as valid XML. This is not 100%
53 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
54 * </p>
55 * <p>
56 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
57 * </p>
58 * <p>
 * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for strict detection (following the HTTP MIME and XML
 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
61 * Determining the character encoding of a feed</a>.
62 * </p>
63 * <p>
64 * To build an instance, use {@link Builder}.
65 * </p>
66 * <p>
67 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
68 * </p>
69 *
70 * @see Builder
71 * @see org.apache.commons.io.output.XmlStreamWriter
72 * @since 2.0
73 */
74 public class XmlStreamReader extends Reader {
75
76 // @formatter:off
77 /**
78 * Builds a new {@link XmlStreamReader}.
79 *
80 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
81 * <p>
82 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
83 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
84 * </p>
85 * <p>
86 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
87 * </p>
88 * <p>
89 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
90 * </p>
91 * <p>
92 * Else if the XML prolog had a charset encoding that encoding is used.
93 * </p>
94 * <p>
95 * Else if the content type had a charset encoding that encoding is used.
96 * </p>
97 * <p>
98 * Else 'UTF-8' is used.
99 * </p>
100 * <p>
101 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
102 * </p>
103 * <p>
104 * For example:
105 * </p>
106 *
107 * <pre>{@code
108 * XmlStreamReader r = XmlStreamReader.builder()
109 * .setPath(path)
110 * .setCharset(StandardCharsets.UTF_8)
111 * .get();
112 * }
113 * </pre>
114 *
115 * @see #get()
116 * @since 2.12.0
117 */
118 // @formatter:on
119 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
120
121 private boolean nullCharset = true;
122 private boolean lenient = true;
123 private String httpContentType;
124
125 /**
126 * Constructs a new builder of {@link XmlStreamReader}.
127 */
128 public Builder() {
129 // empty
130 }
131
132 /**
133 * Builds a new {@link XmlStreamReader}.
134 * <p>
135 * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
136 * </p>
137 * <p>
138 * This builder uses the following aspects:
139 * </p>
140 * <ul>
141 * <li>{@link #getInputStream()}</li>
142 * <li>{@link #getCharset()}</li>
143 * <li>lenient</li>
144 * <li>httpContentType</li>
145 * </ul>
146 *
147 * @return a new instance.
148 * @throws IllegalStateException if the {@code origin} is {@code null}.
149 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
150 * @throws IOException if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
151 * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
152 * @see #getInputStream()
153 * @see #getUnchecked()
154 */
155 @Override
156 public XmlStreamReader get() throws IOException {
157 final String defaultEncoding = nullCharset ? null : getCharset().name();
158 // @formatter:off
159 return httpContentType == null
160 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
161 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
162 // @formatter:on
163 }
164
165 @Override
166 public Builder setCharset(final Charset charset) {
167 nullCharset = charset == null;
168 return super.setCharset(charset);
169 }
170
171 @Override
172 public Builder setCharset(final String charset) {
173 nullCharset = charset == null;
174 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
175 }
176
177 /**
178 * Sets the HTTP content type.
179 *
180 * @param httpContentType the HTTP content type.
181 * @return {@code this} instance.
182 */
183 public Builder setHttpContentType(final String httpContentType) {
184 this.httpContentType = httpContentType;
185 return this;
186 }
187
188 /**
189 * Sets the lenient toggle.
190 *
191 * @param lenient the lenient toggle.
192 * @return {@code this} instance.
193 */
194 public Builder setLenient(final boolean lenient) {
195 this.lenient = lenient;
196 return this;
197 }
198
199 }
200
/** Name of the UTF-8 charset, taken from {@link StandardCharsets#UTF_8}. */
private static final String UTF_8 = StandardCharsets.UTF_8.name();

/** Name of the US-ASCII charset, taken from {@link StandardCharsets#US_ASCII}. */
private static final String US_ASCII = StandardCharsets.US_ASCII.name();

/** Name of the UTF-16BE charset, taken from {@link StandardCharsets#UTF_16BE}. */
private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

/** Name of the UTF-16LE charset, taken from {@link StandardCharsets#UTF_16LE}. */
private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

/** Name of the UTF-32BE charset, spelled as a literal. */
private static final String UTF_32BE = "UTF-32BE";

/** Name of the UTF-32LE charset, spelled as a literal. */
private static final String UTF_32LE = "UTF-32LE";

/** Name of the endian-neutral UTF-16 charset, taken from {@link StandardCharsets#UTF_16}. */
private static final String UTF_16 = StandardCharsets.UTF_16.name();

/** Name of the endian-neutral UTF-32 charset, spelled as a literal. */
private static final String UTF_32 = "UTF-32";

/** Charset name paired with the EBCDIC entry of {@link #XML_GUESS_BYTES}. */
private static final String EBCDIC = "CP1047";

/** Byte order marks handed to the first {@code BOMInputStream} created by the constructors (built with {@code include == false}). */
private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
ByteOrderMark.UTF_32LE };

/**
 * Leading byte sequences used to guess the encoding from the start of the XML declaration; they are handed to the second {@code BOMInputStream} created by
 * the constructors (built with {@code include == true}).
 * <p>
 * In ASCII terms the UTF-8 and UTF-32 entries are the bytes of {@code <?xm}, and the UTF-16 entries are the bytes of {@code <?}; the last entry is
 * presumably {@code <?xm} in EBCDIC (confirm against the CP1047 table).
 * </p>
 * <p>
 * UTF_16LE and UTF_32LE have the same two starting BOM bytes.
 * </p>
 */
private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

/**
 * Finds a {@code charset=} parameter in the part of a content-type header that follows the MIME type. Group 1 is the value without the optional surrounding
 * quotes; the value stops at the first {@code ';'}, space or quote character.
 * <p>
 * NOTE(review): the pattern is compiled without flags, so only a lower-case {@code charset=} matches; confirm whether mixed-case parameter names need
 * support.
 * </p>
 */
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
230
231 /**
232 * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
233 * <p>
234 * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
235 * </p>
236 * <p>
237 * Note the documented pattern is:
238 * </p>
239 * <pre>
240 * EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
241 * </pre>
242 * <p>
243 * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
244 * {@code 'ebcdic-de-273+euro'}.
245 * </p>
246 */
247 public static final Pattern ENCODING_PATTERN = Pattern.compile(
248 // @formatter:off
249 "^<\\?xml\\s+"
250 + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
251 + "encoding\\s*=\\s*"
252 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted
253 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
254 Pattern.MULTILINE);
255 // @formatter:on
256
// MessageFormat templates for XmlStreamReaderException messages.
// RAW_* templates take (BOM encoding, XML guess encoding, XML prolog encoding).
// HTTP_* templates take (content-type MIME, content-type encoding, BOM encoding, XML guess encoding, XML prolog encoding).

/** Raw-stream template: the BOM, guessed and prolog encodings disagree. */
private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

/** Raw-stream template: the BOM encoding is not one that is handled. */
private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

/** HTTP template: an endian-specific content-type encoding (UTF-16BE/LE, UTF-32BE/LE) was declared but a BOM is present. */
private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

/** HTTP template: content-type encoding UTF-16 or UTF-32 was declared but the BOM does not match that family. */
private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

/** HTTP template: the MIME type belongs to neither the application XML nor the text XML family. */
private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
266
267 /**
268 * Constructs a new {@link Builder}.
269 *
270 * @return a new {@link Builder}.
271 * @since 2.12.0
272 */
public static Builder builder() {
    // Every call returns a fresh, independent Builder (lenient detection, no charset and no HTTP content type until configured).
    return new Builder();
}
276
277 /**
278 * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
279 *
280 * @param httpContentType the HTTP content type
281 * @return The content type encoding (upcased)
282 */
283 static String getContentTypeEncoding(final String httpContentType) {
284 String encoding = null;
285 if (httpContentType != null) {
286 final int i = httpContentType.indexOf(";");
287 if (i > -1) {
288 final String postMime = httpContentType.substring(i + 1);
289 final Matcher m = CHARSET_PATTERN.matcher(postMime);
290 encoding = m.find() ? m.group(1) : null;
291 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
292 }
293 }
294 return encoding;
295 }
296
297 /**
298 * Gets the MIME type or {@code null} if httpContentType is {@code null}.
299 *
300 * @param httpContentType the HTTP content type
301 * @return The mime content type
302 */
303 static String getContentTypeMime(final String httpContentType) {
304 String mime = null;
305 if (httpContentType != null) {
306 final int i = httpContentType.indexOf(";");
307 mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
308 mime = mime.trim();
309 }
310 return mime;
311 }
312
313 /**
314 * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
315 *
316 * @param inputStream InputStream to create the reader from.
317 * @param guessedEnc guessed encoding
318 * @return the encoding declared in the <?xml encoding=...?>
319 * @throws IOException thrown if there is a problem reading the stream.
320 */
321 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
322 String encoding = null;
323 if (guessedEnc != null) {
324 final byte[] bytes = IOUtils.byteArray();
325 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
326 int offset = 0;
327 int max = IOUtils.DEFAULT_BUFFER_SIZE;
328 int c = inputStream.read(bytes, offset, max);
329 int firstGT = -1;
330 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
331 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
332 offset += c;
333 max -= c;
334 c = inputStream.read(bytes, offset, max);
335 xmlProlog = new String(bytes, 0, offset, guessedEnc);
336 firstGT = xmlProlog.indexOf('>');
337 }
338 if (firstGT == -1) {
339 if (c == -1) {
340 throw new IOException("Unexpected end of XML stream");
341 }
342 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
343 }
344 final int bytesRead = offset;
345 if (bytesRead > 0) {
346 inputStream.reset();
347 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
348 final StringBuilder prolog = new StringBuilder();
349 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
350 final Matcher m = ENCODING_PATTERN.matcher(prolog);
351 if (m.find()) {
352 encoding = m.group(1).toUpperCase(Locale.ROOT);
353 encoding = encoding.substring(1, encoding.length() - 1);
354 }
355 }
356 }
357 return encoding;
358 }
359
360 /**
361 * Tests if the MIME type belongs to the APPLICATION XML family.
362 *
363 * @param mime The mime type
364 * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
365 */
366 static boolean isAppXml(final String mime) {
367 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
368 || mime.startsWith("application/") && mime.endsWith("+xml"));
369 }
370
371 /**
372 * Tests if the MIME type belongs to the TEXT XML family.
373 *
374 * @param mime The mime type
375 * @return true if the mime type belongs to the TEXT XML family, otherwise false
376 */
377 static boolean isTextXml(final String mime) {
378 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
379 }
380
/** The character stream this instance wraps; the constructors create it as an {@link InputStreamReader} decoding with {@link #encoding}. */
private final Reader reader;

/** The charset name selected by encoding detection during construction. */
private final String encoding;

/** Caller-supplied fallback charset name, may be {@code null}; when {@code null} the calculate methods fall back to built-in defaults. */
private final String defaultEncoding;
386
387 /**
388 * Constructs a Reader for a File.
389 * <p>
390 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
391 * </p>
392 * <p>
393 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
394 * </p>
395 *
396 * @param file File to create a Reader from.
397 * @throws NullPointerException if the input is {@code null}.
398 * @throws IOException thrown if there is a problem reading the file.
399 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
400 */
@Deprecated
public XmlStreamReader(final File file) throws IOException {
    // Reject null with a named NullPointerException, then delegate to the Path-based constructor.
    this(Objects.requireNonNull(file, "file").toPath());
}
405
406 /**
407 * Constructs a Reader for a raw InputStream.
408 * <p>
409 * It follows the same logic used for files.
410 * </p>
411 * <p>
412 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
413 * </p>
414 *
415 * @param inputStream InputStream to create a Reader from.
416 * @throws NullPointerException if the input stream is {@code null}.
417 * @throws IOException thrown if there is a problem reading the stream.
418 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
419 */
@Deprecated
public XmlStreamReader(final InputStream inputStream) throws IOException {
    // Raw-stream detection, lenient by default.
    this(inputStream, true);
}
424
425 /**
426 * Constructs a Reader for a raw InputStream.
427 * <p>
428 * It follows the same logic used for files.
429 * </p>
430 * <p>
431 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
432 * </p>
433 * <p>
434 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
435 * </p>
436 * <p>
437 * Else if the XML prolog had a charset encoding that encoding is used.
438 * </p>
439 * <p>
440 * Else if the content type had a charset encoding that encoding is used.
441 * </p>
442 * <p>
443 * Else 'UTF-8' is used.
444 * </p>
445 * <p>
446 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
447 * </p>
448 *
449 * @param inputStream InputStream to create a Reader from.
450 * @param lenient indicates if the charset encoding detection should be relaxed.
451 * @throws NullPointerException if the input stream is {@code null}.
452 * @throws IOException thrown if there is a problem reading the stream.
453 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
454 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
455 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
    // No caller-supplied default encoding: pass null on.
    this(inputStream, lenient, null);
}
460
461 /**
462 * Constructs a Reader for a raw InputStream.
463 * <p>
464 * It follows the same logic used for files.
465 * </p>
466 * <p>
467 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
468 * </p>
469 * <p>
470 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
471 * </p>
472 * <p>
473 * Else if the XML prolog had a charset encoding that encoding is used.
474 * </p>
475 * <p>
476 * Else if the content type had a charset encoding that encoding is used.
477 * </p>
478 * <p>
479 * Else 'UTF-8' is used.
480 * </p>
481 * <p>
482 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
483 * </p>
484 *
485 * @param inputStream InputStream to create a Reader from.
486 * @param lenient indicates if the charset encoding detection should be relaxed.
487 * @param defaultEncoding The default encoding
488 * @throws NullPointerException if the input stream is {@code null}.
489 * @throws IOException thrown if there is a problem reading the stream.
490 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
491 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
492 */
@Deprecated
@SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
    this.defaultEncoding = defaultEncoding;
    // Buffer the caller's stream, then layer two BOM-aware streams on top of it:
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    final BufferedInputStream buffered = new BufferedInputStream(source, IOUtils.DEFAULT_BUFFER_SIZE);
    // 1) detects a real byte order mark and excludes it from the data (include == false);
    final BOMInputStream bomStream = new BOMInputStream(buffered, false, BOMS);
    // 2) matches the leading bytes of the XML declaration and keeps them in the data (include == true).
    final BOMInputStream prologStream = new BOMInputStream(bomStream, true, XML_GUESS_BYTES);
    this.encoding = processHttpStream(bomStream, prologStream, lenient);
    this.reader = new InputStreamReader(prologStream, encoding);
}
503
504 /**
505 * Constructs a Reader using an InputStream and the associated content-type header.
506 * <p>
507 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
508 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
509 * </p>
510 * <p>
511 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
512 * </p>
513 *
514 * @param inputStream InputStream to create the reader from.
515 * @param httpContentType content-type header to use for the resolution of the charset encoding.
516 * @throws NullPointerException if the input stream is {@code null}.
517 * @throws IOException thrown if there is a problem reading the file.
518 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
519 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
    // Content-type aware detection, lenient by default.
    this(inputStream, httpContentType, true);
}
524
525 /**
526 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
527 * <p>
528 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
529 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
530 * </p>
531 * <p>
532 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
533 * </p>
534 * <p>
535 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
536 * </p>
537 * <p>
538 * Else if the XML prolog had a charset encoding that encoding is used.
539 * </p>
540 * <p>
541 * Else if the content type had a charset encoding that encoding is used.
542 * </p>
543 * <p>
544 * Else 'UTF-8' is used.
545 * </p>
546 * <p>
547 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
548 * </p>
549 *
550 * @param inputStream InputStream to create the reader from.
551 * @param httpContentType content-type header to use for the resolution of the charset encoding.
552 * @param lenient indicates if the charset encoding detection should be relaxed.
553 * @throws NullPointerException if the input stream is {@code null}.
554 * @throws IOException thrown if there is a problem reading the file.
555 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
556 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
557 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
    // No caller-supplied default encoding: pass null on.
    this(inputStream, httpContentType, lenient, null);
}
562
563 /**
564 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
565 * <p>
566 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
567 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
568 * </p>
569 * <p>
570 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
571 * </p>
572 * <p>
573 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
574 * </p>
575 * <p>
576 * Else if the XML prolog had a charset encoding that encoding is used.
577 * </p>
578 * <p>
579 * Else if the content type had a charset encoding that encoding is used.
580 * </p>
581 * <p>
582 * Else 'UTF-8' is used.
583 * </p>
584 * <p>
585 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
586 * </p>
587 *
588 * @param inputStream InputStream to create the reader from.
589 * @param httpContentType content-type header to use for the resolution of the charset encoding.
590 * @param lenient indicates if the charset encoding detection should be relaxed.
591 * @param defaultEncoding The default encoding
592 * @throws NullPointerException if the input stream is {@code null}.
593 * @throws IOException thrown if there is a problem reading the file.
594 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
595 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
596 */
@Deprecated
@SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
        throws IOException {
    this.defaultEncoding = defaultEncoding;
    // Buffer the caller's stream, then layer two BOM-aware streams on top of it:
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    final BufferedInputStream buffered = new BufferedInputStream(source, IOUtils.DEFAULT_BUFFER_SIZE);
    // 1) detects a real byte order mark and excludes it from the data (include == false);
    final BOMInputStream bomStream = new BOMInputStream(buffered, false, BOMS);
    // 2) matches the leading bytes of the XML declaration and keeps them in the data (include == true).
    final BOMInputStream prologStream = new BOMInputStream(bomStream, true, XML_GUESS_BYTES);
    // The content-type header takes part in the detection here.
    this.encoding = processHttpStream(bomStream, prologStream, lenient, httpContentType);
    this.reader = new InputStreamReader(prologStream, encoding);
}
608
609 /**
610 * Constructs a Reader for a File.
611 * <p>
612 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
613 * </p>
614 * <p>
615 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
616 * </p>
617 *
618 * @param file File to create a Reader from.
619 * @throws NullPointerException if the input is {@code null}.
620 * @throws IOException thrown if there is a problem reading the file.
621 * @since 2.11.0
622 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
623 */
@Deprecated
@SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
public XmlStreamReader(final Path file) throws IOException {
    // Opens the file and delegates to the raw InputStream constructor (lenient detection).
    // NOTE(review): if the delegated constructor throws, nothing visible here closes the stream opened on this line; confirm whether that matters to callers.
    this(Files.newInputStream(Objects.requireNonNull(file, "file")));
}
629
630 /**
631 * Constructs a Reader using the InputStream of a URL.
632 * <p>
633 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
634 * </p>
635 * <p>
636 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
637 * </p>
638 * <p>
639 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
640 * </p>
641 *
642 * @param url URL to create a Reader from.
643 * @throws NullPointerException if the input is {@code null}.
644 * @throws IOException thrown if there is a problem reading the stream of the URL.
645 */
public XmlStreamReader(final URL url) throws IOException {
    // Opens a connection to the URL and delegates to the URLConnection constructor without a default encoding.
    this(Objects.requireNonNull(url, "url").openConnection(), null);
}
649
650 /**
651 * Constructs a Reader using the InputStream of a URLConnection.
652 * <p>
653 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
654 * </p>
655 * <p>
656 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
657 * content-type.
658 * </p>
659 * <p>
660 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
661 * </p>
662 *
663 * @param urlConnection URLConnection to create a Reader from.
664 * @param defaultEncoding The default encoding
665 * @throws NullPointerException if the input is {@code null}.
666 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
667 */
public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
    Objects.requireNonNull(urlConnection, "urlConnection");
    this.defaultEncoding = defaultEncoding;
    // Detection from a URLConnection is always lenient.
    final boolean lenient = true;
    final String contentType = urlConnection.getContentType();
    final InputStream inputStream = urlConnection.getInputStream();
    // This constructor obtained the stream itself, so it must not leak it when detection or reader creation fails:
    // on any failure the stream is closed before the exception is propagated.
    try {
        // @formatter:off
        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
        // Detects a real byte order mark and excludes it from the data.
        final BOMInputStream bomInput = BOMInputStream.builder()
            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
            .setInclude(false)
            .setByteOrderMarks(BOMS)
            .get();
        @SuppressWarnings("resource")
        // Matches the leading bytes of the XML declaration and keeps them in the data.
        final BOMInputStream piInput = BOMInputStream.builder()
            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
            .setInclude(true)
            .setByteOrderMarks(XML_GUESS_BYTES)
            .get();
        // @formatter:on
        // HTTP connections, and any connection that reports a content type, use the content-type aware detection.
        if (urlConnection instanceof HttpURLConnection || contentType != null) {
            this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
        } else {
            this.encoding = processHttpStream(bomInput, piInput, lenient);
        }
        this.reader = new InputStreamReader(piInput, encoding);
    } catch (final IOException | RuntimeException e) {
        try {
            inputStream.close();
        } catch (final IOException closeFailure) {
            // Keep the original failure as the primary exception.
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
695
696 /**
697 * Calculates the HTTP encoding.
698 * @param bomEnc BOM encoding
699 * @param xmlGuessEnc XML Guess encoding
700 * @param xmlEnc XML encoding
701 * @param lenient indicates if the charset encoding detection should be relaxed.
702 * @param httpContentType The HTTP content type
703 * @return the HTTP encoding
704 * @throws IOException thrown if there is a problem reading the stream.
705 */
706 String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
707 throws IOException {
708
709 // Lenient and has XML encoding
710 if (lenient && xmlEnc != null) {
711 return xmlEnc;
712 }
713
714 // Determine mime/encoding content types from HTTP Content Type
715 final String cTMime = getContentTypeMime(httpContentType);
716 final String cTEnc = getContentTypeEncoding(httpContentType);
717 final boolean appXml = isAppXml(cTMime);
718 final boolean textXml = isTextXml(cTMime);
719
720 // Mime type NOT "application/xml" or "text/xml"
721 if (!appXml && !textXml) {
722 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
723 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
724 }
725
726 // No content type encoding
727 if (cTEnc == null) {
728 if (appXml) {
729 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
730 }
731 return defaultEncoding == null ? US_ASCII : defaultEncoding;
732 }
733
734 // UTF-16BE or UTF-16LE content type encoding
735 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
736 if (bomEnc != null) {
737 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
738 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
739 }
740 return cTEnc;
741 }
742
743 // UTF-16 content type encoding
744 if (cTEnc.equals(UTF_16)) {
745 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
746 return bomEnc;
747 }
748 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
749 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
750 }
751
752 // UTF-32BE or UTF-132E content type encoding
753 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
754 if (bomEnc != null) {
755 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
756 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
757 }
758 return cTEnc;
759 }
760
761 // UTF-32 content type encoding
762 if (cTEnc.equals(UTF_32)) {
763 if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
764 return bomEnc;
765 }
766 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
767 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
768 }
769
770 return cTEnc;
771 }
772
773 /**
774 * Calculate the raw encoding.
775 *
776 * @param bomEnc BOM encoding
777 * @param xmlGuessEnc XML Guess encoding
778 * @param xmlEnc XML encoding
779 * @return the raw encoding
780 * @throws IOException thrown if there is a problem reading the stream.
781 */
782 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
783
784 // BOM is Null
785 if (bomEnc == null) {
786 if (xmlGuessEnc == null || xmlEnc == null) {
787 return defaultEncoding == null ? UTF_8 : defaultEncoding;
788 }
789 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
790 return xmlGuessEnc;
791 }
792 return xmlEnc;
793 }
794
795 // BOM is UTF-8
796 if (bomEnc.equals(UTF_8)) {
797 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
798 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
799 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
800 }
801 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
802 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
803 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
804 }
805 return bomEnc;
806 }
807
808 // BOM is UTF-16BE or UTF-16LE
809 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
810 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
811 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
812 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
813 }
814 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
815 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
816 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
817 }
818 return bomEnc;
819 }
820
821 // BOM is UTF-32BE or UTF-32LE
822 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
823 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
824 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
825 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
826 }
827 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
828 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
829 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
830 }
831 return bomEnc;
832 }
833
834 // BOM is something else
835 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
836 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
837 }
838
839 /**
840 * Closes the XmlStreamReader stream.
841 *
842 * @throws IOException thrown if there was a problem closing the stream.
843 */
844 @Override
845 public void close() throws IOException {
846 reader.close();
847 }
848
849 /**
850 * Does lenient detection.
851 *
852 * @param httpContentType content-type header to use for the resolution of the charset encoding.
853 * @param ex The thrown exception
854 * @return the encoding
855 * @throws IOException thrown if there is a problem reading the stream.
856 */
857 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
858 if (httpContentType != null && httpContentType.startsWith("text/html")) {
859 httpContentType = httpContentType.substring("text/html".length());
860 httpContentType = "text/xml" + httpContentType;
861 try {
862 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
863 } catch (final XmlStreamReaderException ex2) {
864 ex = ex2;
865 }
866 }
867 String encoding = ex.getXmlEncoding();
868 if (encoding == null) {
869 encoding = ex.getContentTypeEncoding();
870 }
871 if (encoding == null) {
872 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
873 }
874 return encoding;
875 }
876
877 /**
878 * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
879 * <p>
880 * If it is {@code null} the content-type based rules are used.
881 * </p>
882 *
883 * @return the default encoding to use.
884 */
885 public String getDefaultEncoding() {
886 return defaultEncoding;
887 }
888
889 /**
890 * Gets the charset encoding of the XmlStreamReader.
891 *
892 * @return charset encoding.
893 */
894 public String getEncoding() {
895 return encoding;
896 }
897
898 /**
899 * Process the raw stream.
900 *
901 * @param bomInput BOMInputStream to detect byte order marks
902 * @param piInput BOMInputStream to guess XML encoding
903 * @param lenient indicates if the charset encoding detection should be relaxed.
904 * @return the encoding to be used
905 * @throws IOException thrown if there is a problem reading the stream.
906 */
907 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
908 final String bomEnc = bomInput.getBOMCharsetName();
909 final String xmlGuessEnc = piInput.getBOMCharsetName();
910 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
911 try {
912 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
913 } catch (final XmlStreamReaderException ex) {
914 if (lenient) {
915 return doLenientDetection(null, ex);
916 }
917 throw ex;
918 }
919 }
920
921 /**
922 * Processes an HTTP stream.
923 *
924 * @param bomInput BOMInputStream to detect byte order marks
925 * @param piInput BOMInputStream to guess XML encoding
926 * @param lenient indicates if the charset encoding detection should be relaxed.
927 * @param httpContentType The HTTP content type
928 * @return the encoding to be used
929 * @throws IOException thrown if there is a problem reading the stream.
930 */
931 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
932 throws IOException {
933 final String bomEnc = bomInput.getBOMCharsetName();
934 final String xmlGuessEnc = piInput.getBOMCharsetName();
935 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
936 try {
937 return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
938 } catch (final XmlStreamReaderException ex) {
939 if (lenient) {
940 return doLenientDetection(httpContentType, ex);
941 }
942 throw ex;
943 }
944 }
945
946 /**
947 * Reads the underlying reader's {@code read(char[], int, int)} method.
948 *
949 * @param buf the buffer to read the characters into
950 * @param offset The start offset
951 * @param len The number of bytes to read
952 * @return the number of characters read or -1 if the end of stream
953 * @throws IOException if an I/O error occurs.
954 */
955 @Override
956 public int read(final char[] buf, final int offset, final int len) throws IOException {
957 return reader.read(buf, offset, len);
958 }
959
960 }