1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.io.input;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.charset.Charset;
31 import java.nio.charset.StandardCharsets;
32 import java.nio.file.Files;
33 import java.nio.file.Path;
34 import java.text.MessageFormat;
35 import java.util.Locale;
36 import java.util.Objects;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.io.ByteOrderMark;
41 import org.apache.commons.io.Charsets;
42 import org.apache.commons.io.IOUtils;
43 import org.apache.commons.io.build.AbstractStreamBuilder;
44 import org.apache.commons.io.function.IOConsumer;
45
46 /**
47 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
48 * <p>
49 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
50 * </p>
51 * <p>
52 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as valid XML. This is not 100%
53 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
54 * </p>
55 * <p>
56 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
57 * </p>
58 * <p>
59 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML
60 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
61 * Determining the character encoding of a feed</a>.
62 * </p>
63 * <p>
64 * To build an instance, use {@link Builder}.
65 * </p>
66 * <p>
67 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
68 * </p>
69 *
70 * @see Builder
71 * @see org.apache.commons.io.output.XmlStreamWriter
72 * @since 2.0
73 */
74 public class XmlStreamReader extends Reader {
75
76 // @formatter:off
77 /**
78 * Builds a new {@link XmlStreamReader}.
79 *
80 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
81 * <p>
82 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
83 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
84 * </p>
85 * <p>
86 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
87 * </p>
88 * <p>
89 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
90 * </p>
91 * <p>
92 * Else if the XML prolog had a charset encoding that encoding is used.
93 * </p>
94 * <p>
95 * Else if the content type had a charset encoding that encoding is used.
96 * </p>
97 * <p>
98 * Else 'UTF-8' is used.
99 * </p>
100 * <p>
101 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
102 * </p>
103 * <p>
104 * For example:
105 * </p>
106 *
107 * <pre>{@code
108 * XmlStreamReader r = XmlStreamReader.builder()
109 * .setPath(path)
110 * .setCharset(StandardCharsets.UTF_8)
111 * .get();
112 * }
113 * </pre>
114 *
115 * @see #get()
116 * @since 2.12.0
117 */
118 // @formatter:on
119 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
120
121 private boolean nullCharset = true;
122 private boolean lenient = true;
123 private String httpContentType;
124
125 /**
126 * Constructs a new builder of {@link XmlStreamReader}.
127 */
128 public Builder() {
129 // empty
130 }
131
132 /**
133 * Builds a new {@link XmlStreamReader}.
134 * <p>
135 * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
136 * </p>
137 * <p>
138 * This builder uses the following aspects:
139 * </p>
140 * <ul>
141 * <li>{@link #getInputStream()}</li>
142 * <li>{@link #getCharset()}</li>
143 * <li>lenient</li>
144 * <li>httpContentType</li>
145 * </ul>
146 *
147 * @return a new instance.
148 * @throws IllegalStateException if the {@code origin} is {@code null}.
149 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
150 * @throws IOException if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
151 * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
152 * @see #getInputStream()
153 * @see #getUnchecked()
154 */
155 @Override
156 public XmlStreamReader get() throws IOException {
157 final String defaultEncoding = nullCharset ? null : getCharset().name();
158 // @formatter:off
159 return httpContentType == null
160 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
161 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
162 // @formatter:on
163 }
164
165 @Override
166 public Builder setCharset(final Charset charset) {
167 nullCharset = charset == null;
168 return super.setCharset(charset);
169 }
170
171 @Override
172 public Builder setCharset(final String charset) {
173 nullCharset = charset == null;
174 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
175 }
176
177 /**
178 * Sets the HTTP content type.
179 *
180 * @param httpContentType the HTTP content type.
181 * @return {@code this} instance.
182 */
183 public Builder setHttpContentType(final String httpContentType) {
184 this.httpContentType = httpContentType;
185 return this;
186 }
187
188 /**
189 * Sets the lenient toggle.
190 *
191 * @param lenient the lenient toggle.
192 * @return {@code this} instance.
193 */
194 public Builder setLenient(final boolean lenient) {
195 this.lenient = lenient;
196 return this;
197 }
198
199 }
200
201 private static final String UTF_8 = StandardCharsets.UTF_8.name();
202
203 private static final String US_ASCII = StandardCharsets.US_ASCII.name();
204
205 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
206
207 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
208
209 private static final String UTF_32BE = "UTF-32BE";
210
211 private static final String UTF_32LE = "UTF-32LE";
212
213 private static final String UTF_16 = StandardCharsets.UTF_16.name();
214
215 private static final String UTF_32 = "UTF-32";
216
217 private static final String EBCDIC = "CP1047";
218
219 private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
220 ByteOrderMark.UTF_32LE };
221
222 /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. */
223 private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
224 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
225 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
226 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
227 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
228
229 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
230
231 /**
232 * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
233 * <p>
234 * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
235 * </p>
236 * <p>
237 * Note the documented pattern is:
238 * </p>
239 * <pre>
240 * EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
241 * </pre>
242 * <p>
243 * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
244 * {@code 'ebcdic-de-273+euro'}.
245 * </p>
246 */
247 public static final Pattern ENCODING_PATTERN = Pattern.compile(
248 // @formatter:off
249 "^<\\?xml\\s+"
250 + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
251 + "encoding\\s*=\\s*"
252 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted
253 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
254 Pattern.MULTILINE);
255 // @formatter:on
256
257 private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
258
259 private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
260
261 private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";
262
263 private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
264
265 private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
266
267 /**
268 * Constructs a new {@link Builder}.
269 *
270 * @return a new {@link Builder}.
271 * @since 2.12.0
272 */
273 public static Builder builder() {
274 return new Builder();
275 }
276
277 /**
278 * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
279 *
280 * @param httpContentType the HTTP content type.
281 * @return The content type encoding (upcased).
282 */
283 static String getContentTypeEncoding(final String httpContentType) {
284 String encoding = null;
285 if (httpContentType != null) {
286 final int i = httpContentType.indexOf(";");
287 if (i > -1) {
288 final String postMime = httpContentType.substring(i + 1);
289 final Matcher m = CHARSET_PATTERN.matcher(postMime);
290 encoding = m.find() ? m.group(1) : null;
291 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
292 }
293 }
294 return encoding;
295 }
296
297 /**
298 * Gets the MIME type or {@code null} if httpContentType is {@code null}.
299 *
300 * @param httpContentType the HTTP content type.
301 * @return The mime content type.
302 */
303 static String getContentTypeMime(final String httpContentType) {
304 String mime = null;
305 if (httpContentType != null) {
306 final int i = httpContentType.indexOf(";");
307 mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
308 mime = mime.trim();
309 }
310 return mime;
311 }
312
313 /**
314 * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
315 *
316 * @param inputStream InputStream to create the reader from.
317 * @param guessedEnc guessed encoding.
318 * @return the encoding declared in the <?xml encoding=...?>.
319 * @throws IOException thrown if there is a problem reading the stream.
320 */
321 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
322 String encoding = null;
323 if (guessedEnc != null) {
324 final byte[] bytes = IOUtils.byteArray();
325 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
326 int offset = 0;
327 int max = IOUtils.DEFAULT_BUFFER_SIZE;
328 int c = inputStream.read(bytes, offset, max);
329 int firstGT = -1;
330 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
331 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
332 offset += c;
333 max -= c;
334 c = inputStream.read(bytes, offset, max);
335 xmlProlog = new String(bytes, 0, offset, guessedEnc);
336 firstGT = xmlProlog.indexOf('>');
337 }
338 if (firstGT == -1) {
339 if (c == -1) {
340 throw new IOException("Unexpected end of XML stream");
341 }
342 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
343 }
344 final int bytesRead = offset;
345 if (bytesRead > 0) {
346 inputStream.reset();
347 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
348 final StringBuilder prolog = new StringBuilder();
349 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
350 final Matcher m = ENCODING_PATTERN.matcher(prolog);
351 if (m.find()) {
352 encoding = m.group(1).toUpperCase(Locale.ROOT);
353 encoding = encoding.substring(1, encoding.length() - 1);
354 }
355 }
356 }
357 return encoding;
358 }
359
360 /**
361 * Tests if the MIME type belongs to the APPLICATION XML family.
362 *
363 * @param mime The mime type.
364 * @return true if the mime type belongs to the APPLICATION XML family, otherwise false.
365 */
366 static boolean isAppXml(final String mime) {
367 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
368 || mime.startsWith("application/") && mime.endsWith("+xml"));
369 }
370
371 /**
372 * Tests if the MIME type belongs to the TEXT XML family.
373 *
374 * @param mime The mime type.
375 * @return true if the mime type belongs to the TEXT XML family, otherwise false.
376 */
377 static boolean isTextXml(final String mime) {
378 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
379 }
380
381 private final Reader reader;
382
383 private final String encoding;
384
385 private final String defaultEncoding;
386
/**
 * Constructs a Reader for a File.
 * <p>
 * The file is converted to a {@link Path} and handled exactly like the {@link #XmlStreamReader(Path)} constructor: a BOM is looked for first, then the
 * XML prolog charset is sniffed, and if both are missing UTF-8 is used.
 * </p>
 * <p>
 * The charset encoding detection is lenient, see the constructor with the lenient parameter for details.
 * </p>
 *
 * @param file File to create a Reader from.
 * @throws NullPointerException if the input is {@code null}.
 * @throws IOException          thrown if there is a problem reading the file.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final File file) throws IOException {
    this(Objects.requireNonNull(file, "file").toPath());
}
405
/**
 * Constructs a Reader for a raw InputStream using lenient charset detection.
 * <p>
 * Equivalent to {@code XmlStreamReader(inputStream, true)}; the same detection logic as for files applies.
 * </p>
 *
 * @param inputStream InputStream to create a Reader from.
 * @throws NullPointerException if the input stream is {@code null}.
 * @throws IOException          thrown if there is a problem reading the stream.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream) throws IOException {
    this(inputStream, true);
}
424
/**
 * Constructs a Reader for a raw InputStream without a default encoding.
 * <p>
 * Equivalent to {@code XmlStreamReader(inputStream, lenient, null)}; the same detection logic as for files applies.
 * </p>
 * <p>
 * If lenient detection is indicated and the specification-conformant detection fails, the XML prolog encoding is used if present, otherwise 'UTF-8'.
 * With lenient detection an XmlStreamReaderException is never thrown.
 * </p>
 *
 * @param inputStream InputStream to create a Reader from.
 * @param lenient     indicates if the charset encoding detection should be relaxed.
 * @throws NullPointerException     if the input stream is {@code null}.
 * @throws IOException              thrown if there is a problem reading the stream.
 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
    this(inputStream, lenient, null);
}
460
/**
 * Constructs a Reader for a raw InputStream.
 * <p>
 * The stream is buffered and wrapped twice: once to detect (and skip) a byte order mark, and once to guess the encoding from the first bytes of the
 * XML declaration (which are kept in the stream). The resulting encoding is then used to decode the stream.
 * </p>
 * <p>
 * If lenient detection is indicated and the specification-conformant detection fails, the XML prolog encoding is used if present, otherwise the
 * default encoding, otherwise 'UTF-8'. With lenient detection an XmlStreamReaderException is never thrown.
 * </p>
 *
 * @param inputStream     InputStream to create a Reader from.
 * @param lenient         indicates if the charset encoding detection should be relaxed.
 * @param defaultEncoding The default encoding, may be {@code null}.
 * @throws NullPointerException     if the input stream is {@code null}.
 * @throws IOException              thrown if there is a problem reading the stream.
 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
@SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
    this.defaultEncoding = defaultEncoding;
    final InputStream buffered = new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE);
    // First layer: recognizes a BOM and excludes it from the data.
    final BOMInputStream bomInput = new BOMInputStream(buffered, false, BOMS);
    // Second layer: recognizes the leading bytes of an XML declaration but keeps them in the data.
    final BOMInputStream piInput = new BOMInputStream(bomInput, true, XML_GUESS_BYTES);
    this.encoding = toEncoding(bomInput, piInput, lenient);
    this.reader = new InputStreamReader(piInput, encoding);
}
503
/**
 * Constructs a Reader using an InputStream and the associated content-type header, with lenient charset detection.
 * <p>
 * Equivalent to {@code XmlStreamReader(inputStream, httpContentType, true)}.
 * </p>
 * <p>
 * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no content-type encoding it checks the
 * XML prolog encoding. If there is no XML prolog encoding it uses the default encoding mandated by the content-type MIME type.
 * </p>
 *
 * @param inputStream     InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @throws NullPointerException if the input stream is {@code null}.
 * @throws IOException          thrown if there is a problem reading the stream.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
    this(inputStream, httpContentType, true);
}
524
/**
 * Constructs a Reader using an InputStream and the associated content-type header, without a default encoding.
 * <p>
 * Equivalent to {@code XmlStreamReader(inputStream, httpContentType, lenient, null)}.
 * </p>
 * <p>
 * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no content-type encoding it checks the
 * XML prolog encoding. If there is no XML prolog encoding it uses the default encoding mandated by the content-type MIME type.
 * </p>
 * <p>
 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
 * </p>
 * <ol>
 * <li>If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.</li>
 * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
 * <li>Else if the content type had a charset encoding that encoding is used.</li>
 * <li>Else 'UTF-8' is used.</li>
 * </ol>
 * <p>
 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
 * </p>
 *
 * @param inputStream     InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @param lenient         indicates if the charset encoding detection should be relaxed.
 * @throws NullPointerException     if the input stream is {@code null}.
 * @throws IOException              thrown if there is a problem reading the stream.
 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
    this(inputStream, httpContentType, lenient, null);
}
562
/**
 * Constructs a Reader using an InputStream and the associated content-type header.
 * <p>
 * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no content-type encoding it checks the
 * XML prolog encoding. If there is no XML prolog encoding it uses the default encoding mandated by the content-type MIME type.
 * </p>
 * <p>
 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
 * </p>
 * <ol>
 * <li>If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.</li>
 * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
 * <li>Else if the content type had a charset encoding that encoding is used.</li>
 * <li>Else the default encoding is used, or 'UTF-8' if there is none.</li>
 * </ol>
 * <p>
 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
 * </p>
 *
 * @param inputStream     InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @param lenient         indicates if the charset encoding detection should be relaxed.
 * @param defaultEncoding The default encoding, may be {@code null}.
 * @throws NullPointerException     if the input stream is {@code null}.
 * @throws IOException              thrown if there is a problem reading the stream.
 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
@SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
        throws IOException {
    this.defaultEncoding = defaultEncoding;
    final InputStream buffered = new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE);
    // First layer: recognizes a BOM and excludes it from the data.
    final BOMInputStream bomInput = new BOMInputStream(buffered, false, BOMS);
    // Second layer: recognizes the leading bytes of an XML declaration but keeps them in the data.
    final BOMInputStream piInput = new BOMInputStream(bomInput, true, XML_GUESS_BYTES);
    this.encoding = toEncoding(bomInput, piInput, lenient, httpContentType);
    this.reader = new InputStreamReader(piInput, encoding);
}
608
/**
 * Constructs a Reader for a file given as a {@link Path}.
 * <p>
 * The file is opened with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)} and handled like a raw stream: a BOM is looked for first,
 * then the XML prolog charset is sniffed, and if both are missing UTF-8 is used.
 * </p>
 * <p>
 * The charset encoding detection is lenient, see the constructor with the lenient parameter for details.
 * </p>
 *
 * @param file File to create a Reader from.
 * @throws NullPointerException if the input is {@code null}.
 * @throws IOException          thrown if there is a problem reading the file.
 * @since 2.11.0
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
@SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
public XmlStreamReader(final Path file) throws IOException {
    this(Files.newInputStream(Objects.requireNonNull(file, "file")));
}
629
/**
 * Constructs a Reader using the InputStream of a URL.
 * <p>
 * A connection to the URL is opened and handed to {@link #XmlStreamReader(URLConnection, String)} with a {@code null} default encoding.
 * </p>
 * <p>
 * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data, the same logic as for files is used; otherwise the same
 * logic as for an InputStream with content-type is used.
 * </p>
 * <p>
 * The charset encoding detection is lenient, see the constructor with the lenient parameter for details.
 * </p>
 *
 * @param url URL to create a Reader from.
 * @throws NullPointerException if the input is {@code null}.
 * @throws IOException          thrown if there is a problem reading the stream of the URL.
 */
public XmlStreamReader(final URL url) throws IOException {
    this(Objects.requireNonNull(url, "url").openConnection(), null);
}
649
/**
 * Constructs a Reader using the InputStream of a URLConnection.
 * <p>
 * If the URLConnection is not of type HttpURLConnection and there is no 'content-type' header in the fetched data it uses the same logic used for
 * files.
 * </p>
 * <p>
 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
 * content-type.
 * </p>
 * <p>
 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
 * </p>
 * <p>
 * The connection's input stream is opened by this constructor. If the encoding detection or the creation of the decoding reader fails, that stream is
 * closed before the exception is propagated, since the caller never receives an instance through which it could close it.
 * </p>
 *
 * @param urlConnection   URLConnection to create a Reader from.
 * @param defaultEncoding The default encoding, may be {@code null}.
 * @throws NullPointerException if the input is {@code null}.
 * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
 */
public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
    Objects.requireNonNull(urlConnection, "urlConnection");
    this.defaultEncoding = defaultEncoding;
    final boolean lenient = true;
    final String contentType = urlConnection.getContentType();
    final InputStream inputStream = urlConnection.getInputStream();
    try {
        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
        // @formatter:off
        final BOMInputStream bomInput = BOMInputStream.builder()
            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
            .setInclude(false)
            .setByteOrderMarks(BOMS)
            .get();
        @SuppressWarnings("resource")
        final BOMInputStream piInput = BOMInputStream.builder()
            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
            .setInclude(true)
            .setByteOrderMarks(XML_GUESS_BYTES)
            .get();
        // @formatter:on
        // HTTP rules apply for HTTP connections and whenever a content type is available; raw-stream rules otherwise.
        if (urlConnection instanceof HttpURLConnection || contentType != null) {
            this.encoding = toEncoding(bomInput, piInput, lenient, contentType);
        } else {
            this.encoding = toEncoding(bomInput, piInput, lenient);
        }
        // May throw UnsupportedEncodingException when the detected name is not a charset known to the JVM.
        this.reader = new InputStreamReader(piInput, encoding);
    } catch (final IOException | RuntimeException e) {
        // Construction failed: release the stream opened above instead of leaking it.
        try {
            inputStream.close();
        } catch (final IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
695
696 /**
697 * Closes the XmlStreamReader stream.
698 *
699 * @throws IOException thrown if there was a problem closing the stream.
700 */
701 @Override
702 public void close() throws IOException {
703 reader.close();
704 }
705
706 /**
707 * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
708 * <p>
709 * If it is {@code null} the content-type based rules are used.
710 * </p>
711 *
712 * @return the default encoding to use.
713 */
714 public String getDefaultEncoding() {
715 return defaultEncoding;
716 }
717
718 /**
719 * Gets the charset encoding of the XmlStreamReader.
720 *
721 * @return charset encoding.
722 */
723 public String getEncoding() {
724 return encoding;
725 }
726
727 /**
728 * Reads the underlying reader's {@code read(char[], int, int)} method.
729 *
730 * @param buf the buffer to read the characters into.
731 * @param offset The start offset.
732 * @param len The number of bytes to read.
733 * @return the number of characters read or -1 if the end of stream.
734 * @throws IOException if an I/O error occurs.
735 */
736 @Override
737 public int read(final char[] buf, final int offset, final int len) throws IOException {
738 return reader.read(buf, offset, len);
739 }
740
741 /**
742 * Process the raw stream.
743 *
744 * @param bomInput BOMInputStream to detect byte order marks.
745 * @param piInput BOMInputStream to guess XML encoding.
746 * @param lenient indicates if the charset encoding detection should be relaxed.
747 * @return the encoding to be used.
748 * @throws IOException thrown if there is a problem reading the stream.
749 */
750 private String toEncoding(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
751 final String bomEnc = bomInput.getBOMCharsetName();
752 final String xmlGuessEnc = piInput.getBOMCharsetName();
753 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
754 try {
755 return toRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
756 } catch (final XmlStreamReaderException ex) {
757 if (lenient) {
758 return toEncodingLenient(null, ex);
759 }
760 throw ex;
761 }
762 }
763
764 /**
765 * Processes an HTTP stream.
766 *
767 * @param bomInput BOMInputStream to detect byte order marks.
768 * @param piInput BOMInputStream to guess XML encoding.
769 * @param lenient indicates if the charset encoding detection should be relaxed.
770 * @param httpContentType The HTTP content type.
771 * @return the encoding to be used.
772 * @throws IOException thrown if there is a problem reading the stream.
773 */
774 private String toEncoding(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
775 throws IOException {
776 final String bomEnc = bomInput.getBOMCharsetName();
777 final String xmlGuessEnc = piInput.getBOMCharsetName();
778 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
779 try {
780 return toHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
781 } catch (final XmlStreamReaderException ex) {
782 if (lenient) {
783 return toEncodingLenient(httpContentType, ex);
784 }
785 throw ex;
786 }
787 }
788
789 /**
790 * Detects the encoding in lenient mode.
791 *
792 * @param httpContentType content-type header to use for the resolution of the charset encoding.
793 * @param ex The thrown exception.
794 * @return the encoding.
795 * @throws IOException thrown if there is a problem reading the stream.
796 */
797 private String toEncodingLenient(String httpContentType, XmlStreamReaderException ex) throws IOException {
798 if (httpContentType != null && httpContentType.startsWith("text/html")) {
799 httpContentType = httpContentType.substring("text/html".length());
800 httpContentType = "text/xml" + httpContentType;
801 try {
802 return toHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
803 } catch (final XmlStreamReaderException ex2) {
804 ex = ex2;
805 }
806 }
807 String encoding = ex.getXmlEncoding();
808 if (encoding == null) {
809 encoding = ex.getContentTypeEncoding();
810 }
811 if (encoding == null) {
812 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
813 }
814 return encoding;
815 }
816
817 /**
818 * Calculates the HTTP encoding.
819 *
820 * @param bomEnc BOM encoding.
821 * @param xmlGuessEnc XML Guess encoding.
822 * @param xmlEnc XML encoding.
823 * @param lenient indicates if the charset encoding detection should be relaxed.
824 * @param httpContentType The HTTP content type.
825 * @return the HTTP encoding.
826 * @throws IOException thrown if there is a problem reading the stream.
827 */
828 String toHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
829 throws IOException {
830 // Lenient and has XML encoding
831 if (lenient && xmlEnc != null) {
832 return xmlEnc;
833 }
834 // Determine mime/encoding content types from HTTP Content Type
835 final String cTMime = getContentTypeMime(httpContentType);
836 final String cTEnc = getContentTypeEncoding(httpContentType);
837 final boolean appXml = isAppXml(cTMime);
838 final boolean textXml = isTextXml(cTMime);
839 // Mime type NOT "application/xml" or "text/xml"
840 if (!appXml && !textXml) {
841 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
842 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
843 }
844 // No content type encoding
845 if (cTEnc == null) {
846 if (appXml) {
847 return toRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
848 }
849 return defaultEncoding == null ? US_ASCII : defaultEncoding;
850 }
851 // UTF-16BE or UTF-16LE content type encoding
852 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
853 if (bomEnc != null) {
854 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
855 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
856 }
857 return cTEnc;
858 }
859 // UTF-16 content type encoding
860 if (cTEnc.equals(UTF_16)) {
861 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
862 return bomEnc;
863 }
864 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
865 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
866 }
867 // UTF-32BE or UTF-132E content type encoding
868 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
869 if (bomEnc != null) {
870 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
871 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
872 }
873 return cTEnc;
874 }
875 // UTF-32 content type encoding
876 if (cTEnc.equals(UTF_32)) {
877 if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
878 return bomEnc;
879 }
880 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
881 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
882 }
883 return cTEnc;
884 }
885
886 /**
887 * Calculate the raw encoding.
888 *
889 * @param bomEnc BOM encoding.
890 * @param xmlGuessEnc XML Guess encoding.
891 * @param xmlEnc XML encoding.
892 * @return the raw encoding.
893 * @throws IOException thrown if there is a problem reading the stream.
894 */
895 String toRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
896
897 // BOM is Null
898 if (bomEnc == null) {
899 if (xmlGuessEnc == null || xmlEnc == null) {
900 return defaultEncoding == null ? UTF_8 : defaultEncoding;
901 }
902 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
903 return xmlGuessEnc;
904 }
905 return xmlEnc;
906 }
907
908 // BOM is UTF-8
909 if (bomEnc.equals(UTF_8)) {
910 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8) || xmlEnc != null && !xmlEnc.equals(UTF_8)) {
911 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
912 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
913 }
914 return bomEnc;
915 }
916
917 // BOM is UTF-16BE or UTF-16LE
918 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
919 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
920 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
921 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
922 }
923 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
924 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
925 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
926 }
927 return bomEnc;
928 }
929
930 // BOM is UTF-32BE or UTF-32LE
931 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
932 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
933 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
934 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
935 }
936 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
937 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
938 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
939 }
940 return bomEnc;
941 }
942
943 // BOM is something else
944 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
945 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
946 }
947
948 }