View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.HttpURLConnection;
28  import java.net.URL;
29  import java.net.URLConnection;
30  import java.nio.charset.Charset;
31  import java.nio.charset.StandardCharsets;
32  import java.nio.file.Files;
33  import java.nio.file.Path;
34  import java.text.MessageFormat;
35  import java.util.Locale;
36  import java.util.Objects;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  import org.apache.commons.io.ByteOrderMark;
41  import org.apache.commons.io.Charsets;
42  import org.apache.commons.io.IOUtils;
43  import org.apache.commons.io.build.AbstractStreamBuilder;
44  import org.apache.commons.io.function.IOConsumer;
45  
46  /**
47   * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
48   * <p>
49   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
50   * </p>
51   * <p>
52   * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as valid XML. This is not 100%
53   * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
54   * </p>
55   * <p>
56   * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
57   * </p>
58   * <p>
59   * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML
60   * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
61   * Determining the character encoding of a feed</a>.
62   * </p>
63   * <p>
64   * To build an instance, use {@link Builder}.
65   * </p>
66   * <p>
67   * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
68   * </p>
69   *
70   * @see Builder
71   * @see org.apache.commons.io.output.XmlStreamWriter
72   * @since 2.0
73   */
74  public class XmlStreamReader extends Reader {
75  
76      // @formatter:off
77      /**
78       * Builds a new {@link XmlStreamReader}.
79       *
80       * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
81       * <p>
82       * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
83       * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
84       * </p>
85       * <p>
86       * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
87       * </p>
88       * <p>
89       * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
90       * </p>
91       * <p>
92       * Else if the XML prolog had a charset encoding that encoding is used.
93       * </p>
94       * <p>
95       * Else if the content type had a charset encoding that encoding is used.
96       * </p>
97       * <p>
98       * Else 'UTF-8' is used.
99       * </p>
100      * <p>
101      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
102      * </p>
103      * <p>
104      * For example:
105      * </p>
106      *
107      * <pre>{@code
108      * XmlStreamReader r = XmlStreamReader.builder()
109      *   .setPath(path)
110      *   .setCharset(StandardCharsets.UTF_8)
111      *   .get();
112      * }
113      * </pre>
114      *
115      * @see #get()
116      * @since 2.12.0
117      */
118     // @formatter:on
119     public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
120 
121         private boolean nullCharset = true;
122         private boolean lenient = true;
123         private String httpContentType;
124 
125         /**
126          * Constructs a new builder of {@link XmlStreamReader}.
127          */
128         public Builder() {
129             // empty
130         }
131 
132         /**
133          * Builds a new {@link XmlStreamReader}.
134          * <p>
135          * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
136          * </p>
137          * <p>
138          * This builder uses the following aspects:
139          * </p>
140          * <ul>
141          * <li>{@link #getInputStream()}</li>
142          * <li>{@link #getCharset()}</li>
143          * <li>lenient</li>
144          * <li>httpContentType</li>
145          * </ul>
146          *
147          * @return a new instance.
148          * @throws IllegalStateException         if the {@code origin} is {@code null}.
149          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
150          * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
151          * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
152          * @see #getInputStream()
153          * @see #getUnchecked()
154          */
155         @Override
156         public XmlStreamReader get() throws IOException {
157             final String defaultEncoding = nullCharset ? null : getCharset().name();
158             // @formatter:off
159             return httpContentType == null
160                     ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
161                     : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
162             // @formatter:on
163         }
164 
165         @Override
166         public Builder setCharset(final Charset charset) {
167             nullCharset = charset == null;
168             return super.setCharset(charset);
169         }
170 
171         @Override
172         public Builder setCharset(final String charset) {
173             nullCharset = charset == null;
174             return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
175         }
176 
177         /**
178          * Sets the HTTP content type.
179          *
180          * @param httpContentType the HTTP content type.
181          * @return {@code this} instance.
182          */
183         public Builder setHttpContentType(final String httpContentType) {
184             this.httpContentType = httpContentType;
185             return this;
186         }
187 
188         /**
189          * Sets the lenient toggle.
190          *
191          * @param lenient the lenient toggle.
192          * @return {@code this} instance.
193          */
194         public Builder setLenient(final boolean lenient) {
195             this.lenient = lenient;
196             return this;
197         }
198 
199     }
200 
201     private static final String UTF_8 = StandardCharsets.UTF_8.name();
202 
203     private static final String US_ASCII = StandardCharsets.US_ASCII.name();
204 
205     private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
206 
207     private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
208 
209     private static final String UTF_32BE = "UTF-32BE";
210 
211     private static final String UTF_32LE = "UTF-32LE";
212 
213     private static final String UTF_16 = StandardCharsets.UTF_16.name();
214 
215     private static final String UTF_32 = "UTF-32";
216 
217     private static final String EBCDIC = "CP1047";
218 
219     private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
220             ByteOrderMark.UTF_32LE };
221 
222     /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. */
223     private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
224             new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
225             new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
226             new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
227             new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
228 
229     private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
230 
231     /**
232      * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
233      * <p>
234      * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
235      * </p>
236      * <p>
237      * Note the documented pattern is:
238      * </p>
239      * <pre>
240      * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
241      * </pre>
242      * <p>
243      * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
244      * {@code 'ebcdic-de-273+euro'}.
245      * </p>
246      */
247     public static final Pattern ENCODING_PATTERN = Pattern.compile(
248     // @formatter:off
249             "^<\\?xml\\s+"
250             + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
251             + "encoding\\s*=\\s*"
252             + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
253             +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
254             Pattern.MULTILINE);
255     // @formatter:on
256 
257     private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
258 
259     private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
260 
261     private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";
262 
263     private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
264 
265     private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
266 
267     /**
268      * Constructs a new {@link Builder}.
269      *
270      * @return a new {@link Builder}.
271      * @since 2.12.0
272      */
273     public static Builder builder() {
274         return new Builder();
275     }
276 
277     /**
278      * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
279      *
280      * @param httpContentType the HTTP content type
281      * @return The content type encoding (upcased)
282      */
283     static String getContentTypeEncoding(final String httpContentType) {
284         String encoding = null;
285         if (httpContentType != null) {
286             final int i = httpContentType.indexOf(";");
287             if (i > -1) {
288                 final String postMime = httpContentType.substring(i + 1);
289                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
290                 encoding = m.find() ? m.group(1) : null;
291                 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
292             }
293         }
294         return encoding;
295     }
296 
297     /**
298      * Gets the MIME type or {@code null} if httpContentType is {@code null}.
299      *
300      * @param httpContentType the HTTP content type
301      * @return The mime content type
302      */
303     static String getContentTypeMime(final String httpContentType) {
304         String mime = null;
305         if (httpContentType != null) {
306             final int i = httpContentType.indexOf(";");
307             mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
308             mime = mime.trim();
309         }
310         return mime;
311     }
312 
313     /**
314      * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
315      *
316      * @param inputStream InputStream to create the reader from.
317      * @param guessedEnc  guessed encoding
318      * @return the encoding declared in the <?xml encoding=...?>
319      * @throws IOException thrown if there is a problem reading the stream.
320      */
321     private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
322         String encoding = null;
323         if (guessedEnc != null) {
324             final byte[] bytes = IOUtils.byteArray();
325             inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
326             int offset = 0;
327             int max = IOUtils.DEFAULT_BUFFER_SIZE;
328             int c = inputStream.read(bytes, offset, max);
329             int firstGT = -1;
330             String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
331             while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
332                 offset += c;
333                 max -= c;
334                 c = inputStream.read(bytes, offset, max);
335                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
336                 firstGT = xmlProlog.indexOf('>');
337             }
338             if (firstGT == -1) {
339                 if (c == -1) {
340                     throw new IOException("Unexpected end of XML stream");
341                 }
342                 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
343             }
344             final int bytesRead = offset;
345             if (bytesRead > 0) {
346                 inputStream.reset();
347                 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
348                 final StringBuilder prolog = new StringBuilder();
349                 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
350                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
351                 if (m.find()) {
352                     encoding = m.group(1).toUpperCase(Locale.ROOT);
353                     encoding = encoding.substring(1, encoding.length() - 1);
354                 }
355             }
356         }
357         return encoding;
358     }
359 
360     /**
361      * Tests if the MIME type belongs to the APPLICATION XML family.
362      *
363      * @param mime The mime type
364      * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
365      */
366     static boolean isAppXml(final String mime) {
367         return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
368                 || mime.startsWith("application/") && mime.endsWith("+xml"));
369     }
370 
371     /**
372      * Tests if the MIME type belongs to the TEXT XML family.
373      *
374      * @param mime The mime type
375      * @return true if the mime type belongs to the TEXT XML family, otherwise false
376      */
377     static boolean isTextXml(final String mime) {
378         return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
379     }
380 
381     private final Reader reader;
382 
383     private final String encoding;
384 
385     private final String defaultEncoding;
386 
387     /**
388      * Constructs a Reader for a File.
389      * <p>
390      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
391      * </p>
392      * <p>
393      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
394      * </p>
395      *
396      * @param file File to create a Reader from.
397      * @throws NullPointerException if the input is {@code null}.
398      * @throws IOException          thrown if there is a problem reading the file.
399      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
400      */
401     @Deprecated
402     public XmlStreamReader(final File file) throws IOException {
403         this(Objects.requireNonNull(file, "file").toPath());
404     }
405 
406     /**
407      * Constructs a Reader for a raw InputStream.
408      * <p>
409      * It follows the same logic used for files.
410      * </p>
411      * <p>
412      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
413      * </p>
414      *
415      * @param inputStream InputStream to create a Reader from.
416      * @throws NullPointerException if the input stream is {@code null}.
417      * @throws IOException          thrown if there is a problem reading the stream.
418      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
419      */
420     @Deprecated
421     public XmlStreamReader(final InputStream inputStream) throws IOException {
422         this(inputStream, true);
423     }
424 
425     /**
426      * Constructs a Reader for a raw InputStream.
427      * <p>
428      * It follows the same logic used for files.
429      * </p>
430      * <p>
431      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
432      * </p>
433      * <p>
434      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
435      * </p>
436      * <p>
437      * Else if the XML prolog had a charset encoding that encoding is used.
438      * </p>
439      * <p>
440      * Else if the content type had a charset encoding that encoding is used.
441      * </p>
442      * <p>
443      * Else 'UTF-8' is used.
444      * </p>
445      * <p>
446      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
447      * </p>
448      *
449      * @param inputStream InputStream to create a Reader from.
450      * @param lenient     indicates if the charset encoding detection should be relaxed.
451      * @throws NullPointerException     if the input stream is {@code null}.
452      * @throws IOException              thrown if there is a problem reading the stream.
453      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
454      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
455      */
456     @Deprecated
457     public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
458         this(inputStream, lenient, null);
459     }
460 
461     /**
462      * Constructs a Reader for a raw InputStream.
463      * <p>
464      * It follows the same logic used for files.
465      * </p>
466      * <p>
467      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
468      * </p>
469      * <p>
470      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
471      * </p>
472      * <p>
473      * Else if the XML prolog had a charset encoding that encoding is used.
474      * </p>
475      * <p>
476      * Else if the content type had a charset encoding that encoding is used.
477      * </p>
478      * <p>
479      * Else 'UTF-8' is used.
480      * </p>
481      * <p>
482      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
483      * </p>
484      *
485      * @param inputStream     InputStream to create a Reader from.
486      * @param lenient         indicates if the charset encoding detection should be relaxed.
487      * @param defaultEncoding The default encoding
488      * @throws NullPointerException     if the input stream is {@code null}.
489      * @throws IOException              thrown if there is a problem reading the stream.
490      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
491      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
492      */
493     @Deprecated
494     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
495     public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
496         this.defaultEncoding = defaultEncoding;
497         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
498                 false, BOMS);
499         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
500         this.encoding = processHttpStream(bom, pis, lenient);
501         this.reader = new InputStreamReader(pis, encoding);
502     }
503 
504     /**
505      * Constructs a Reader using an InputStream and the associated content-type header.
506      * <p>
507      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
508      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
509      * </p>
510      * <p>
511      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
512      * </p>
513      *
514      * @param inputStream     InputStream to create the reader from.
515      * @param httpContentType content-type header to use for the resolution of the charset encoding.
516      * @throws NullPointerException if the input stream is {@code null}.
517      * @throws IOException          thrown if there is a problem reading the file.
518      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
519      */
520     @Deprecated
521     public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
522         this(inputStream, httpContentType, true);
523     }
524 
525     /**
526      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
527      * <p>
528      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
529      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
530      * </p>
531      * <p>
532      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
533      * </p>
534      * <p>
535      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
536      * </p>
537      * <p>
538      * Else if the XML prolog had a charset encoding that encoding is used.
539      * </p>
540      * <p>
541      * Else if the content type had a charset encoding that encoding is used.
542      * </p>
543      * <p>
544      * Else 'UTF-8' is used.
545      * </p>
546      * <p>
547      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
548      * </p>
549      *
550      * @param inputStream     InputStream to create the reader from.
551      * @param httpContentType content-type header to use for the resolution of the charset encoding.
552      * @param lenient         indicates if the charset encoding detection should be relaxed.
553      * @throws NullPointerException     if the input stream is {@code null}.
554      * @throws IOException              thrown if there is a problem reading the file.
555      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
556      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
557      */
558     @Deprecated
559     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
560         this(inputStream, httpContentType, lenient, null);
561     }
562 
563     /**
564      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
565      * <p>
566      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
567      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
568      * </p>
569      * <p>
570      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
571      * </p>
572      * <p>
573      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
574      * </p>
575      * <p>
576      * Else if the XML prolog had a charset encoding that encoding is used.
577      * </p>
578      * <p>
579      * Else if the content type had a charset encoding that encoding is used.
580      * </p>
581      * <p>
582      * Else 'UTF-8' is used.
583      * </p>
584      * <p>
585      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
586      * </p>
587      *
588      * @param inputStream     InputStream to create the reader from.
589      * @param httpContentType content-type header to use for the resolution of the charset encoding.
590      * @param lenient         indicates if the charset encoding detection should be relaxed.
591      * @param defaultEncoding The default encoding
592      * @throws NullPointerException     if the input stream is {@code null}.
593      * @throws IOException              thrown if there is a problem reading the file.
594      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
595      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
596      */
597     @Deprecated
598     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
599     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
600             throws IOException {
601         this.defaultEncoding = defaultEncoding;
602         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
603                 false, BOMS);
604         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
605         this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
606         this.reader = new InputStreamReader(pis, encoding);
607     }
608 
609     /**
610      * Constructs a Reader for a File.
611      * <p>
612      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
613      * </p>
614      * <p>
615      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
616      * </p>
617      *
618      * @param file File to create a Reader from.
619      * @throws NullPointerException if the input is {@code null}.
620      * @throws IOException          thrown if there is a problem reading the file.
621      * @since 2.11.0
622      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
623      */
624     @Deprecated
625     @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
626     public XmlStreamReader(final Path file) throws IOException {
627         this(Files.newInputStream(Objects.requireNonNull(file, "file")));
628     }
629 
630     /**
631      * Constructs a Reader using the InputStream of a URL.
632      * <p>
633      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
634      * </p>
635      * <p>
636      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
637      * </p>
638      * <p>
639      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
640      * </p>
641      *
642      * @param url URL to create a Reader from.
643      * @throws NullPointerException if the input is {@code null}.
644      * @throws IOException          thrown if there is a problem reading the stream of the URL.
645      */
646     public XmlStreamReader(final URL url) throws IOException {
647         this(Objects.requireNonNull(url, "url").openConnection(), null);
648     }
649 
650     /**
651      * Constructs a Reader using the InputStream of a URLConnection.
652      * <p>
653      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
654      * </p>
655      * <p>
656      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
657      * content-type.
658      * </p>
659      * <p>
660      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
661      * </p>
662      *
663      * @param urlConnection   URLConnection to create a Reader from.
664      * @param defaultEncoding The default encoding
665      * @throws NullPointerException if the input is {@code null}.
666      * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
667      */
668     public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
669         Objects.requireNonNull(urlConnection, "urlConnection");
670         this.defaultEncoding = defaultEncoding;
671         final boolean lenient = true;
672         final String contentType = urlConnection.getContentType();
673         final InputStream inputStream = urlConnection.getInputStream();
674         @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
675         // @formatter:off
676         final BOMInputStream bomInput = BOMInputStream.builder()
677             .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
678             .setInclude(false)
679             .setByteOrderMarks(BOMS)
680             .get();
681         @SuppressWarnings("resource")
682         final BOMInputStream piInput = BOMInputStream.builder()
683             .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
684             .setInclude(true)
685             .setByteOrderMarks(XML_GUESS_BYTES)
686             .get();
687         // @formatter:on
688         if (urlConnection instanceof HttpURLConnection || contentType != null) {
689             this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
690         } else {
691             this.encoding = processHttpStream(bomInput, piInput, lenient);
692         }
693         this.reader = new InputStreamReader(piInput, encoding);
694     }
695 
696     /**
697      * Calculates the HTTP encoding.
698      * @param bomEnc          BOM encoding
699      * @param xmlGuessEnc     XML Guess encoding
700      * @param xmlEnc          XML encoding
701      * @param lenient         indicates if the charset encoding detection should be relaxed.
702      * @param httpContentType The HTTP content type
703      * @return the HTTP encoding
704      * @throws IOException thrown if there is a problem reading the stream.
705      */
706     String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
707             throws IOException {
708 
709         // Lenient and has XML encoding
710         if (lenient && xmlEnc != null) {
711             return xmlEnc;
712         }
713 
714         // Determine mime/encoding content types from HTTP Content Type
715         final String cTMime = getContentTypeMime(httpContentType);
716         final String cTEnc = getContentTypeEncoding(httpContentType);
717         final boolean appXml = isAppXml(cTMime);
718         final boolean textXml = isTextXml(cTMime);
719 
720         // Mime type NOT "application/xml" or "text/xml"
721         if (!appXml && !textXml) {
722             final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
723             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
724         }
725 
726         // No content type encoding
727         if (cTEnc == null) {
728             if (appXml) {
729                 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
730             }
731             return defaultEncoding == null ? US_ASCII : defaultEncoding;
732         }
733 
734         // UTF-16BE or UTF-16LE content type encoding
735         if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
736             if (bomEnc != null) {
737                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
738                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
739             }
740             return cTEnc;
741         }
742 
743         // UTF-16 content type encoding
744         if (cTEnc.equals(UTF_16)) {
745             if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
746                 return bomEnc;
747             }
748             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
749             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
750         }
751 
752         // UTF-32BE or UTF-132E content type encoding
753         if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
754             if (bomEnc != null) {
755                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
756                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
757             }
758             return cTEnc;
759         }
760 
761         // UTF-32 content type encoding
762         if (cTEnc.equals(UTF_32)) {
763             if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
764                 return bomEnc;
765             }
766             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
767             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
768         }
769 
770         return cTEnc;
771     }
772 
773     /**
774      * Calculate the raw encoding.
775      *
776      * @param bomEnc      BOM encoding
777      * @param xmlGuessEnc XML Guess encoding
778      * @param xmlEnc      XML encoding
779      * @return the raw encoding
780      * @throws IOException thrown if there is a problem reading the stream.
781      */
782     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
783 
784         // BOM is Null
785         if (bomEnc == null) {
786             if (xmlGuessEnc == null || xmlEnc == null) {
787                 return defaultEncoding == null ? UTF_8 : defaultEncoding;
788             }
789             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
790                 return xmlGuessEnc;
791             }
792             return xmlEnc;
793         }
794 
795         // BOM is UTF-8
796         if (bomEnc.equals(UTF_8)) {
797             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
798                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
799                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
800             }
801             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
802                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
803                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
804             }
805             return bomEnc;
806         }
807 
808         // BOM is UTF-16BE or UTF-16LE
809         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
810             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
811                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
812                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
813             }
814             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
815                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
816                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
817             }
818             return bomEnc;
819         }
820 
821         // BOM is UTF-32BE or UTF-32LE
822         if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
823             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
824                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
825                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
826             }
827             if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
828                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
829                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
830             }
831             return bomEnc;
832         }
833 
834         // BOM is something else
835         final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
836         throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
837     }
838 
839     /**
840      * Closes the XmlStreamReader stream.
841      *
842      * @throws IOException thrown if there was a problem closing the stream.
843      */
844     @Override
845     public void close() throws IOException {
846         reader.close();
847     }
848 
849     /**
850      * Does lenient detection.
851      *
852      * @param httpContentType content-type header to use for the resolution of the charset encoding.
853      * @param ex              The thrown exception
854      * @return the encoding
855      * @throws IOException thrown if there is a problem reading the stream.
856      */
857     private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
858         if (httpContentType != null && httpContentType.startsWith("text/html")) {
859             httpContentType = httpContentType.substring("text/html".length());
860             httpContentType = "text/xml" + httpContentType;
861             try {
862                 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
863             } catch (final XmlStreamReaderException ex2) {
864                 ex = ex2;
865             }
866         }
867         String encoding = ex.getXmlEncoding();
868         if (encoding == null) {
869             encoding = ex.getContentTypeEncoding();
870         }
871         if (encoding == null) {
872             encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
873         }
874         return encoding;
875     }
876 
877     /**
878      * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
879      * <p>
880      * If it is {@code null} the content-type based rules are used.
881      * </p>
882      *
883      * @return the default encoding to use.
884      */
885     public String getDefaultEncoding() {
886         return defaultEncoding;
887     }
888 
889     /**
890      * Gets the charset encoding of the XmlStreamReader.
891      *
892      * @return charset encoding.
893      */
894     public String getEncoding() {
895         return encoding;
896     }
897 
898     /**
899      * Process the raw stream.
900      *
901      * @param bomInput     BOMInputStream to detect byte order marks
902      * @param piInput     BOMInputStream to guess XML encoding
903      * @param lenient indicates if the charset encoding detection should be relaxed.
904      * @return the encoding to be used
905      * @throws IOException thrown if there is a problem reading the stream.
906      */
907     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
908         final String bomEnc = bomInput.getBOMCharsetName();
909         final String xmlGuessEnc = piInput.getBOMCharsetName();
910         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
911         try {
912             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
913         } catch (final XmlStreamReaderException ex) {
914             if (lenient) {
915                 return doLenientDetection(null, ex);
916             }
917             throw ex;
918         }
919     }
920 
921     /**
922      * Processes an HTTP stream.
923      *
924      * @param bomInput        BOMInputStream to detect byte order marks
925      * @param piInput         BOMInputStream to guess XML encoding
926      * @param lenient         indicates if the charset encoding detection should be relaxed.
927      * @param httpContentType The HTTP content type
928      * @return the encoding to be used
929      * @throws IOException thrown if there is a problem reading the stream.
930      */
931     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
932             throws IOException {
933         final String bomEnc = bomInput.getBOMCharsetName();
934         final String xmlGuessEnc = piInput.getBOMCharsetName();
935         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
936         try {
937             return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
938         } catch (final XmlStreamReaderException ex) {
939             if (lenient) {
940                 return doLenientDetection(httpContentType, ex);
941             }
942             throw ex;
943         }
944     }
945 
946     /**
947      * Reads the underlying reader's {@code read(char[], int, int)} method.
948      *
949      * @param buf    the buffer to read the characters into
950      * @param offset The start offset
951      * @param len    The number of bytes to read
952      * @return the number of characters read or -1 if the end of stream
953      * @throws IOException if an I/O error occurs.
954      */
955     @Override
956     public int read(final char[] buf, final int offset, final int len) throws IOException {
957         return reader.read(buf, offset, len);
958     }
959 
960 }