View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.HttpURLConnection;
28  import java.net.URL;
29  import java.net.URLConnection;
30  import java.nio.charset.Charset;
31  import java.nio.charset.StandardCharsets;
32  import java.nio.file.Files;
33  import java.nio.file.Path;
34  import java.text.MessageFormat;
35  import java.util.Locale;
36  import java.util.Objects;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  import org.apache.commons.io.ByteOrderMark;
41  import org.apache.commons.io.Charsets;
42  import org.apache.commons.io.IOUtils;
43  import org.apache.commons.io.build.AbstractStreamBuilder;
44  import org.apache.commons.io.function.IOConsumer;
45  import org.apache.commons.io.output.XmlStreamWriter;
46  
47  /**
48   * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
49   * <p>
50   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
51   * </p>
52   * <p>
53   * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the document as valid XML. This is not 100%
54   * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
55   * </p>
56   * <p>
57   * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
58   * </p>
59   * <p>
60   * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for a strict detection (following HTTP MIME and XML
61   * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining
62   * the character encoding of a feed</a>.
63   * </p>
64   * <p>
65   * To build an instance, see {@link Builder}.
66   * </p>
67   * <p>
68   * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under Apache License 2.0.
69   * </p>
70   *
71   * @see org.apache.commons.io.output.XmlStreamWriter
72   * @since 2.0
73   */
74  public class XmlStreamReader extends Reader {
75  
76      /**
77       * Builds a new {@link XmlStreamWriter} instance.
78       *
79       * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
80       * <p>
81       * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
82       * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
83       * </p>
84       * <p>
85       * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
86       * </p>
87       * <p>
88       * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
89       * </p>
90       * <p>
91       * Else if the XML prolog had a charset encoding that encoding is used.
92       * </p>
93       * <p>
94       * Else if the content type had a charset encoding that encoding is used.
95       * </p>
96       * <p>
97       * Else 'UTF-8' is used.
98       * </p>
99       * <p>
100      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
101      * </p>
102      * <p>
103      * For example:
104      * </p>
105      *
106      * <pre>{@code
107      * XmlStreamReader r = XmlStreamReader.builder().setPath(path).setCharset(StandardCharsets.UTF_8).get();
108      * }
109      * </pre>
110      *
111      * @since 2.12.0
112      */
113     public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
114 
115         private boolean nullCharset = true;
116         private boolean lenient = true;
117         private String httpContentType;
118 
119         /**
120          * Constructs a new instance.
121          * <p>
122          * This builder use the aspect InputStream, OpenOption[], httpContentType, lenient, and defaultEncoding.
123          * </p>
124          * <p>
125          * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an
126          * {@link UnsupportedOperationException}.
127          * </p>
128          *
129          * @return a new instance.
130          * @throws UnsupportedOperationException if the origin cannot provide an InputStream.
131          * @throws IOException                   thrown if there is a problem reading the stream.
132          * @throws XmlStreamReaderException      thrown if the charset encoding could not be determined according to the specification.
133          * @see #getInputStream()
134          */
135         @SuppressWarnings("resource")
136         @Override
137         public XmlStreamReader get() throws IOException {
138             final String defaultEncoding = nullCharset ? null : getCharset().name();
139             // @formatter:off
140             return httpContentType == null
141                     ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
142                     : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
143             // @formatter:on
144         }
145 
146         @Override
147         public Builder setCharset(final Charset charset) {
148             nullCharset = charset == null;
149             return super.setCharset(charset);
150         }
151 
152         @Override
153         public Builder setCharset(final String charset) {
154             nullCharset = charset == null;
155             return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
156         }
157 
158         /**
159          * Sets the HTTP content type.
160          *
161          * @param httpContentType the HTTP content type.
162          * @return this.
163          */
164         public Builder setHttpContentType(final String httpContentType) {
165             this.httpContentType = httpContentType;
166             return this;
167         }
168 
169         /**
170          * Sets the lenient toggle.
171          *
172          * @param lenient the lenient toggle.
173          * @return this.
174          */
175         public Builder setLenient(final boolean lenient) {
176             this.lenient = lenient;
177             return this;
178         }
179 
180     }
181 
        // Charset names used by this class. The UTF-32 names are string literals because
        // java.nio.charset.StandardCharsets defines no UTF-32 constants.
182     private static final String UTF_8 = StandardCharsets.UTF_8.name();
183 
184     private static final String US_ASCII = StandardCharsets.US_ASCII.name();
185 
186     private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
187 
188     private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
189 
190     private static final String UTF_32BE = "UTF-32BE";
191 
192     private static final String UTF_32LE = "UTF-32LE";
193 
194     private static final String UTF_16 = StandardCharsets.UTF_16.name();
195 
196     private static final String UTF_32 = "UTF-32";
197 
        // Name of the EBCDIC code page paired with the EBCDIC byte signature in XML_GUESS_BYTES.
198     private static final String EBCDIC = "CP1047";
199 
        /** Byte order marks handed to the first {@code BOMInputStream} the constructors create (with {@code include} set to {@code false}). */
200     private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
201             ByteOrderMark.UTF_32LE };
202 
203     /**
         * Byte signatures of the start of an XML declaration: {@code <?xm} (only {@code <?} for the UTF-16 entries) as encoded in each candidate charset.
         * They are handed to the second {@code BOMInputStream} the constructors create (with {@code include} set to {@code true}).
         * <p>
         * UTF_16LE and UTF_32LE have the same two starting BOM bytes.
         * </p>
         */
204     private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
205             new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
206             new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
207             new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
208             new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
209 
        /**
         * Pattern capturing, as group 1, the value of a {@code charset=} parameter; the value may be wrapped in single or double quotes and ends at the
         * first semicolon, space or quote. Applied by {@link #getContentTypeEncoding(String)} to the text after the first semicolon of the content type.
         */
210     private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
211 
212     /**
213      * Pattern capturing the encoding of the "xml" processing instruction.
214      * <p>
215      * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>.
216      * </p>
217      */
218     public static final Pattern ENCODING_PATTERN = Pattern.compile(
219     // @formatter:off
220             "^<\\?xml\\s+"
221             + "version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+"
222             + "encoding\\s*=\\s*"
223             + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
224             +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
225             Pattern.MULTILINE);
226     // N.B. the documented pattern is
227     // EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
228     // However this does not match all the aliases that are supported by Java.
229     // e.g.  '437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro'
230     // @formatter:on
231 
        // Message templates for encoding-detection failures on raw streams; placeholders are, per the template text,
        // {0} BOM encoding, {1} XML guess encoding, {2} XML prolog encoding.
        // NOTE(review): the call sites of RAW_EX_1/RAW_EX_2 are outside this part of the file - confirm the argument order there.
232     private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
233 
234     private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
235 
        // Message templates for encoding-detection failures when an HTTP content type is involved; placeholders are
        // {0} content-type MIME, {1} content-type encoding, {2} BOM encoding, {3} XML guess encoding, {4} XML prolog encoding.
        // HTTP_EX_3 is formatted with java.text.MessageFormat in calculateHttpEncoding when the MIME type is not an XML type.
236     private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";
237 
238     private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
239 
240     private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
241 
242     /**
243      * Constructs a new {@link Builder}.
         * <p>
         * A fresh builder is lenient and has neither an HTTP content type nor an explicit charset; configure it and call {@link Builder#get()} to obtain an
         * {@link XmlStreamReader}.
         * </p>
244      *
245      * @return a new {@link Builder}.
246      * @since 2.12.0
247      */
248     public static Builder builder() {
249         return new Builder();
250     }
251 
252     /**
253      * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
254      *
255      * @param httpContentType the HTTP content type
256      * @return The content type encoding (upcased)
257      */
258     static String getContentTypeEncoding(final String httpContentType) {
259         String encoding = null;
260         if (httpContentType != null) {
261             final int i = httpContentType.indexOf(";");
262             if (i > -1) {
263                 final String postMime = httpContentType.substring(i + 1);
264                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
265                 encoding = m.find() ? m.group(1) : null;
266                 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
267             }
268         }
269         return encoding;
270     }
271 
272     /**
273      * Gets the MIME type or {@code null} if httpContentType is {@code null}.
274      *
275      * @param httpContentType the HTTP content type
276      * @return The mime content type
277      */
278     static String getContentTypeMime(final String httpContentType) {
279         String mime = null;
280         if (httpContentType != null) {
281             final int i = httpContentType.indexOf(";");
282             if (i >= 0) {
283                 mime = httpContentType.substring(0, i);
284             } else {
285                 mime = httpContentType;
286             }
287             mime = mime.trim();
288         }
289         return mime;
290     }
291 
292     /**
293      * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
294      *
295      * @param inputStream InputStream to create the reader from.
296      * @param guessedEnc  guessed encoding
297      * @return the encoding declared in the <?xml encoding=...?>
298      * @throws IOException thrown if there is a problem reading the stream.
299      */
300     private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
301         String encoding = null;
302         if (guessedEnc != null) {
303             final byte[] bytes = IOUtils.byteArray();
304             inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
305             int offset = 0;
306             int max = IOUtils.DEFAULT_BUFFER_SIZE;
307             int c = inputStream.read(bytes, offset, max);
308             int firstGT = -1;
309             String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
310             while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
311                 offset += c;
312                 max -= c;
313                 c = inputStream.read(bytes, offset, max);
314                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
315                 firstGT = xmlProlog.indexOf('>');
316             }
317             if (firstGT == -1) {
318                 if (c == -1) {
319                     throw new IOException("Unexpected end of XML stream");
320                 }
321                 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
322             }
323             final int bytesRead = offset;
324             if (bytesRead > 0) {
325                 inputStream.reset();
326                 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
327                 final StringBuilder prolog = new StringBuilder();
328                 IOConsumer.forEach(bReader.lines(), prolog::append);
329                 final Matcher m = ENCODING_PATTERN.matcher(prolog);
330                 if (m.find()) {
331                     encoding = m.group(1).toUpperCase(Locale.ROOT);
332                     encoding = encoding.substring(1, encoding.length() - 1);
333                 }
334             }
335         }
336         return encoding;
337     }
338 
339     /**
340      * Tests if the MIME type belongs to the APPLICATION XML family.
341      *
342      * @param mime The mime type
343      * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
344      */
345     static boolean isAppXml(final String mime) {
346         return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
347                 || mime.startsWith("application/") && mime.endsWith("+xml"));
348     }
349 
350     /**
351      * Tests if the MIME type belongs to the TEXT XML family.
352      *
353      * @param mime The mime type
354      * @return true if the mime type belongs to the TEXT XML family, otherwise false
355      */
356     static boolean isTextXml(final String mime) {
357         return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
358     }
359 
        /** Reader the constructors create over the sniffed input stream, decoding with {@link #encoding}. */
360     private final Reader reader;
361 
        /** Charset name produced by the encoding detection the constructors run. */
362     private final String encoding;
363 
        /** Default encoding supplied by the caller; {@code null} when none was given. */
364     private final String defaultEncoding;
365 
366     /**
367      * Constructs a Reader for a File.
368      * <p>
369      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
370      * </p>
371      * <p>
372      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
373      * </p>
         * <p>
         * The file is converted with {@link File#toPath()} and handed to {@link #XmlStreamReader(Path)}.
         * </p>
374      *
375      * @param file File to create a Reader from.
376      * @throws NullPointerException if the input is {@code null}.
377      * @throws IOException          thrown if there is a problem reading the file.
378      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
379      */
380     @Deprecated
381     public XmlStreamReader(final File file) throws IOException {
382         this(Objects.requireNonNull(file, "file").toPath());
383     }
384 
385     /**
386      * Constructs a Reader for a raw InputStream.
387      * <p>
388      * It follows the same logic used for files.
389      * </p>
390      * <p>
391      * It does a lenient charset encoding detection: this is equivalent to calling {@link #XmlStreamReader(InputStream, boolean)} with {@code lenient} set to {@code true}.
392      * </p>
393      *
394      * @param inputStream InputStream to create a Reader from.
395      * @throws NullPointerException if the input stream is {@code null}.
396      * @throws IOException          thrown if there is a problem reading the stream.
397      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
398      */
399     @Deprecated
400     public XmlStreamReader(final InputStream inputStream) throws IOException {
401         this(inputStream, true);
402     }
403 
404     /**
405      * Constructs a Reader for a raw InputStream.
406      * <p>
407      * It follows the same logic used for files. No default encoding is supplied: this is equivalent to calling
         * {@link #XmlStreamReader(InputStream, boolean, String)} with a {@code null} default encoding.
408      * </p>
409      * <p>
410      * If lenient detection is indicated and the detection above fails as per specifications, it then attempts the following:
411      * </p>
412      * <p>
413      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
414      * </p>
415      * <p>
416      * Else if the XML prolog had a charset encoding that encoding is used.
417      * </p>
418      * <p>
419      * Else if the content type had a charset encoding that encoding is used.
420      * </p>
421      * <p>
422      * Else 'UTF-8' is used.
423      * </p>
424      * <p>
425      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
426      * </p>
427      *
428      * @param inputStream InputStream to create a Reader from.
429      * @param lenient     indicates if the charset encoding detection should be relaxed.
430      * @throws NullPointerException     if the input stream is {@code null}.
431      * @throws IOException              thrown if there is a problem reading the stream.
432      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
433      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
434      */
435     @Deprecated
436     public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
437         this(inputStream, lenient, null);
438     }
439 
440     /**
441      * Constructs a Reader for a raw InputStream.
442      * <p>
443      * It follows the same logic used for files.
444      * </p>
445      * <p>
446      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
447      * </p>
448      * <p>
449      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
450      * </p>
451      * <p>
452      * Else if the XML prolog had a charset encoding that encoding is used.
453      * </p>
454      * <p>
455      * Else if the content type had a charset encoding that encoding is used.
456      * </p>
457      * <p>
458      * Else 'UTF-8' is used.
459      * </p>
460      * <p>
461      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
462      * </p>
463      *
464      * @param inputStream     InputStream to create a Reader from.
465      * @param lenient         indicates if the charset encoding detection should be relaxed.
466      * @param defaultEncoding The default encoding
467      * @throws NullPointerException     if the input stream is {@code null}.
468      * @throws IOException              thrown if there is a problem reading the stream.
469      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
470      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
471      */
472     @Deprecated
473     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
474     public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
475         this.defaultEncoding = defaultEncoding;
476         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
477                 false, BOMS);
478         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
479         this.encoding = processHttpStream(bom, pis, lenient);
480         this.reader = new InputStreamReader(pis, encoding);
481     }
482 
483     /**
484      * Constructs a Reader using an InputStream and the associated content-type header.
485      * <p>
486      * First it checks if the stream has a BOM. If there is no BOM, it checks the content-type encoding. If there is no content-type encoding, it checks the XML prolog
487      * encoding. If there is no XML prolog encoding, it uses the default encoding mandated by the content-type MIME type.
488      * </p>
489      * <p>
490      * It does a lenient charset encoding detection: this is equivalent to calling {@link #XmlStreamReader(InputStream, String, boolean)} with {@code lenient} set to {@code true}.
491      * </p>
492      *
493      * @param inputStream     InputStream to create the reader from.
494      * @param httpContentType content-type header to use for the resolution of the charset encoding.
495      * @throws NullPointerException if the input stream is {@code null}.
496      * @throws IOException          thrown if there is a problem reading the stream.
497      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
498      */
499     @Deprecated
500     public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
501         this(inputStream, httpContentType, true);
502     }
503 
504     /**
505      * Constructs a Reader using an InputStream and the associated content-type header. Leniency of the encoding detection is controlled by {@code lenient}.
506      * <p>
507      * First it checks if the stream has a BOM. If there is no BOM, it checks the content-type encoding. If there is no content-type encoding, it checks the XML prolog
508      * encoding. If there is no XML prolog encoding, it uses the default encoding mandated by the content-type MIME type.
509      * </p>
510      * <p>
511      * If lenient detection is indicated and the detection above fails as per specifications, it then attempts the following:
512      * </p>
513      * <p>
514      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
515      * </p>
516      * <p>
517      * Else if the XML prolog had a charset encoding that encoding is used.
518      * </p>
519      * <p>
520      * Else if the content type had a charset encoding that encoding is used.
521      * </p>
522      * <p>
523      * Else 'UTF-8' is used.
524      * </p>
525      * <p>
526      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
527      * </p>
         * <p>
         * No default encoding is supplied: this is equivalent to calling {@link #XmlStreamReader(InputStream, String, boolean, String)} with a {@code null}
         * default encoding.
         * </p>
528      *
529      * @param inputStream     InputStream to create the reader from.
530      * @param httpContentType content-type header to use for the resolution of the charset encoding.
531      * @param lenient         indicates if the charset encoding detection should be relaxed.
532      * @throws NullPointerException     if the input stream is {@code null}.
533      * @throws IOException              thrown if there is a problem reading the stream.
534      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
535      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
536      */
537     @Deprecated
538     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
539         this(inputStream, httpContentType, lenient, null);
540     }
541 
542     /**
543      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
544      * <p>
545      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
546      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
547      * </p>
548      * <p>
549      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
550      * </p>
551      * <p>
552      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
553      * </p>
554      * <p>
555      * Else if the XML prolog had a charset encoding that encoding is used.
556      * </p>
557      * <p>
558      * Else if the content type had a charset encoding that encoding is used.
559      * </p>
560      * <p>
561      * Else 'UTF-8' is used.
562      * </p>
563      * <p>
564      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
565      * </p>
566      *
567      * @param inputStream     InputStream to create the reader from.
568      * @param httpContentType content-type header to use for the resolution of the charset encoding.
569      * @param lenient         indicates if the charset encoding detection should be relaxed.
570      * @param defaultEncoding The default encoding
571      * @throws NullPointerException     if the input stream is {@code null}.
572      * @throws IOException              thrown if there is a problem reading the file.
573      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
574      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
575      */
576     @Deprecated
577     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
578     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
579             throws IOException {
580         this.defaultEncoding = defaultEncoding;
581         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
582                 false, BOMS);
583         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
584         this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
585         this.reader = new InputStreamReader(pis, encoding);
586     }
587 
588     /**
589      * Constructs a Reader for a file given as a {@link Path}.
590      * <p>
591      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
592      * </p>
593      * <p>
594      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
595      * </p>
         * <p>
         * The file is opened with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)} and the stream is handed to
         * {@link #XmlStreamReader(InputStream)}.
         * </p>
596      *
597      * @param file File to create a Reader from.
598      * @throws NullPointerException if the input is {@code null}.
599      * @throws IOException          thrown if there is a problem reading the file.
600      * @since 2.11.0
601      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
602      */
603     @Deprecated
604     @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
605     public XmlStreamReader(final Path file) throws IOException {
            // NOTE(review): nothing here closes the newly opened stream if the delegated constructor throws - confirm whether that matters to callers.
606         this(Files.newInputStream(Objects.requireNonNull(file, "file")));
607     }
608 
609     /**
610      * Constructs a Reader using the InputStream of a URL.
611      * <p>
612      * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data, it uses the same logic used for Files.
613      * </p>
614      * <p>
615      * If the URL is an HTTP URL or there is a 'content-type' header in the fetched data, it uses the same logic used for an InputStream with content-type.
616      * </p>
617      * <p>
618      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
619      * </p>
         * <p>
         * The URL's connection is opened with {@link URL#openConnection()} and handed to {@link #XmlStreamReader(URLConnection, String)} with a {@code null}
         * default encoding.
         * </p>
620      *
621      * @param url URL to create a Reader from.
622      * @throws NullPointerException if the input is {@code null}.
623      * @throws IOException          thrown if there is a problem reading the stream of the URL.
624      */
625     public XmlStreamReader(final URL url) throws IOException {
626         this(Objects.requireNonNull(url, "url").openConnection(), null);
627     }
628 
629     /**
630      * Constructs a Reader using the InputStream of a URLConnection.
631      * <p>
632      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
633      * </p>
634      * <p>
635      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
636      * content-type.
637      * </p>
638      * <p>
639      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
640      * </p>
641      *
642      * @param urlConnection   URLConnection to create a Reader from.
643      * @param defaultEncoding The default encoding
644      * @throws NullPointerException if the input is {@code null}.
645      * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
646      */
647     public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
648         Objects.requireNonNull(urlConnection, "urlConnection");
649         this.defaultEncoding = defaultEncoding;
650         final boolean lenient = true;
651         final String contentType = urlConnection.getContentType();
652         final InputStream inputStream = urlConnection.getInputStream();
653         @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
654         // @formatter:off
655         final BOMInputStream bomInput = BOMInputStream.builder()
656             .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
657             .setInclude(false)
658             .setByteOrderMarks(BOMS)
659             .get();
660         @SuppressWarnings("resource")
661         final BOMInputStream piInput = BOMInputStream.builder()
662             .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
663             .setInclude(true)
664             .setByteOrderMarks(XML_GUESS_BYTES)
665             .get();
666         // @formatter:on
667         if (urlConnection instanceof HttpURLConnection || contentType != null) {
668             this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
669         } else {
670             this.encoding = processHttpStream(bomInput, piInput, lenient);
671         }
672         this.reader = new InputStreamReader(piInput, encoding);
673     }
674 
675     /**
676      * Calculates the HTTP encoding.
677      * @param bomEnc          BOM encoding
678      * @param xmlGuessEnc     XML Guess encoding
679      * @param xmlEnc          XML encoding
680      * @param lenient         indicates if the charset encoding detection should be relaxed.
681      * @param httpContentType The HTTP content type
682      *
683      * @return the HTTP encoding
684      * @throws IOException thrown if there is a problem reading the stream.
685      */
686     String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
687             throws IOException {
688 
689         // Lenient and has XML encoding
690         if (lenient && xmlEnc != null) {
691             return xmlEnc;
692         }
693 
694         // Determine mime/encoding content types from HTTP Content Type
695         final String cTMime = getContentTypeMime(httpContentType);
696         final String cTEnc = getContentTypeEncoding(httpContentType);
697         final boolean appXml = isAppXml(cTMime);
698         final boolean textXml = isTextXml(cTMime);
699 
700         // Mime type NOT "application/xml" or "text/xml"
701         if (!appXml && !textXml) {
702             final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
703             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
704         }
705 
706         // No content type encoding
707         if (cTEnc == null) {
708             if (appXml) {
709                 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
710             }
711             return defaultEncoding == null ? US_ASCII : defaultEncoding;
712         }
713 
714         // UTF-16BE or UTF-16LE content type encoding
715         if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
716             if (bomEnc != null) {
717                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
718                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
719             }
720             return cTEnc;
721         }
722 
723         // UTF-16 content type encoding
724         if (cTEnc.equals(UTF_16)) {
725             if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
726                 return bomEnc;
727             }
728             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
729             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
730         }
731 
732         // UTF-32BE or UTF-132E content type encoding
733         if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
734             if (bomEnc != null) {
735                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
736                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
737             }
738             return cTEnc;
739         }
740 
741         // UTF-32 content type encoding
742         if (cTEnc.equals(UTF_32)) {
743             if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
744                 return bomEnc;
745             }
746             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
747             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
748         }
749 
750         return cTEnc;
751     }
752 
753     /**
754      * Calculate the raw encoding.
755      *
756      * @param bomEnc      BOM encoding
757      * @param xmlGuessEnc XML Guess encoding
758      * @param xmlEnc      XML encoding
759      * @return the raw encoding
760      * @throws IOException thrown if there is a problem reading the stream.
761      */
762     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
763 
764         // BOM is Null
765         if (bomEnc == null) {
766             if (xmlGuessEnc == null || xmlEnc == null) {
767                 return defaultEncoding == null ? UTF_8 : defaultEncoding;
768             }
769             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
770                 return xmlGuessEnc;
771             }
772             return xmlEnc;
773         }
774 
775         // BOM is UTF-8
776         if (bomEnc.equals(UTF_8)) {
777             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
778                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
779                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
780             }
781             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
782                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
783                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
784             }
785             return bomEnc;
786         }
787 
788         // BOM is UTF-16BE or UTF-16LE
789         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
790             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
791                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
792                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
793             }
794             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
795                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
796                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
797             }
798             return bomEnc;
799         }
800 
801         // BOM is UTF-32BE or UTF-32LE
802         if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
803             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
804                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
805                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
806             }
807             if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
808                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
809                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
810             }
811             return bomEnc;
812         }
813 
814         // BOM is something else
815         final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
816         throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
817     }
818 
819     /**
820      * Closes the XmlStreamReader stream.
821      *
822      * @throws IOException thrown if there was a problem closing the stream.
823      */
824     @Override
825     public void close() throws IOException {
826         reader.close();
827     }
828 
829     /**
830      * Does lenient detection.
831      *
832      * @param httpContentType content-type header to use for the resolution of the charset encoding.
833      * @param ex              The thrown exception
834      * @return the encoding
835      * @throws IOException thrown if there is a problem reading the stream.
836      */
837     private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
838         if (httpContentType != null && httpContentType.startsWith("text/html")) {
839             httpContentType = httpContentType.substring("text/html".length());
840             httpContentType = "text/xml" + httpContentType;
841             try {
842                 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
843             } catch (final XmlStreamReaderException ex2) {
844                 ex = ex2;
845             }
846         }
847         String encoding = ex.getXmlEncoding();
848         if (encoding == null) {
849             encoding = ex.getContentTypeEncoding();
850         }
851         if (encoding == null) {
852             encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
853         }
854         return encoding;
855     }
856 
857     /**
858      * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
859      * <p>
860      * If it is {@code null} the content-type based rules are used.
861      * </p>
862      *
863      * @return the default encoding to use.
864      */
865     public String getDefaultEncoding() {
866         return defaultEncoding;
867     }
868 
869     /**
870      * Gets the charset encoding of the XmlStreamReader.
871      *
872      * @return charset encoding.
873      */
874     public String getEncoding() {
875         return encoding;
876     }
877 
878     /**
879      * Process the raw stream.
880      *
881      * @param bomInput     BOMInputStream to detect byte order marks
882      * @param piInput     BOMInputStream to guess XML encoding
883      * @param lenient indicates if the charset encoding detection should be relaxed.
884      * @return the encoding to be used
885      * @throws IOException thrown if there is a problem reading the stream.
886      */
887     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
888         final String bomEnc = bomInput.getBOMCharsetName();
889         final String xmlGuessEnc = piInput.getBOMCharsetName();
890         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
891         try {
892             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
893         } catch (final XmlStreamReaderException ex) {
894             if (lenient) {
895                 return doLenientDetection(null, ex);
896             }
897             throw ex;
898         }
899     }
900 
901     /**
902      * Processes an HTTP stream.
903      *
904      * @param bomInput        BOMInputStream to detect byte order marks
905      * @param piInput         BOMInputStream to guess XML encoding
906      * @param lenient         indicates if the charset encoding detection should be relaxed.
907      * @param httpContentType The HTTP content type
908      * @return the encoding to be used
909      * @throws IOException thrown if there is a problem reading the stream.
910      */
911     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
912             throws IOException {
913         final String bomEnc = bomInput.getBOMCharsetName();
914         final String xmlGuessEnc = piInput.getBOMCharsetName();
915         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
916         try {
917             return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
918         } catch (final XmlStreamReaderException ex) {
919             if (lenient) {
920                 return doLenientDetection(httpContentType, ex);
921             }
922             throw ex;
923         }
924     }
925 
926     /**
927      * Reads the underlying reader's {@code read(char[], int, int)} method.
928      *
929      * @param buf    the buffer to read the characters into
930      * @param offset The start offset
931      * @param len    The number of bytes to read
932      * @return the number of characters read or -1 if the end of stream
933      * @throws IOException if an I/O error occurs.
934      */
935     @Override
936     public int read(final char[] buf, final int offset, final int len) throws IOException {
937         return reader.read(buf, offset, len);
938     }
939 
940 }